Example #1
def run(global_config, sample_config):
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
            sample_config)
    sample_config["commands"] = ""
    if "tools" in sample_config:
        """If so, execute them one after the other in the specified order
        (might not work)"""
        for command in sample_config["tools"]:
            """with this I pick up at run time the correct function in the
            current module"""
            command_fn = getattr(sys.modules[__name__],
                    "_run_{}".format(command))
            """Update sample config, each command return sample_config and if
            necessary it modifies it"""
            sample_config = command_fn(global_config, sample_config,
                    sorted_libraries_by_insert)
    else:
        #run default pipeline for QC
        sample_config = _run_trimmomatic(global_config, sample_config,
                sorted_libraries_by_insert)
        sample_config = _run_fastqc(global_config, sample_config,
                sorted_libraries_by_insert)
        sample_config = _run_abyss(global_config, sample_config,
                sorted_libraries_by_insert)
    with open("{}.nougat".format(sample_config.get("output", "sample")), "w") as f:
        yaml.dump(sample_config, f)
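The dispatch above relies on a simple convention: every tool name listed under "tools" in the sample config must have a matching _run_<tool> function defined in the same module, and getattr resolves it at run time. A minimal, self-contained sketch of that pattern (the _run_fastqc stub and the empty configs below are placeholders, not the pipeline's real objects):

import sys

def _run_fastqc(global_config, sample_config, sorted_libraries_by_insert):
    # stub standing in for a real pipeline step
    print("fastqc step executed")
    return sample_config

sample_config = {"tools": ["fastqc"]}
for command in sample_config["tools"]:
    # resolve "_run_fastqc" in the current module and call it
    command_fn = getattr(sys.modules[__name__], "_run_{}".format(command))
    sample_config = command_fn({}, sample_config, [])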
Example #2
def run(global_config, sample_config):
    sorted_libraries_by_insert = \
            common._sort_libraries_by_insert(sample_config)
    _check_libraries(sorted_libraries_by_insert)

    computeAssemblyStats(sample_config)
    # filter out short contigs
    sample_config = _build_new_reference(sample_config)
    if "tools" in sample_config:
        """If so, execute them one after the other in the specified order \
                (might not work)"""
        for command in sample_config["tools"]:
            """with this I pick up at run time the correct function in the \
                    current module"""
            command_fn = getattr(sys.modules[__name__],
                                 "_run_{}".format(command))
            """Update sample config, each command return sample_config and \
                    if necessary it modifies it"""
            sample_config = command_fn(global_config, sample_config,
                                       sorted_libraries_by_insert)
    else:
        #run the default evaluation pipeline
        sample_config = _run_align(global_config, sample_config,
                                   sorted_libraries_by_insert)
        sample_config = _run_qaTools(global_config, sample_config,
                                     sorted_libraries_by_insert)
        sample_config = _run_FRC(global_config, sample_config,
                                 sorted_libraries_by_insert)
Example #3
def run(global_config, sample_config):
    sorted_libraries_by_insert = \
            common._sort_libraries_by_insert(sample_config)
    _check_libraries(sorted_libraries_by_insert)

    computeAssemblyStats(sample_config)
    # filter out short contigs
    sample_config = _build_new_reference(sample_config)
    if "tools" in sample_config:
        """If so, execute them one after the other in the specified order \
                (might not work)"""
        for command in sample_config["tools"]:
            """with this I pick up at run time the correct function in the \
                    current module"""
            command_fn    = getattr(sys.modules[__name__],
                    "_run_{}".format(command))
            """Update sample config, each command return sample_config and \
                    if necessary it modifies it"""
            sample_config = command_fn(global_config, sample_config,
                    sorted_libraries_by_insert)
    else:
        #run the default evaluation pipeline
        sample_config = _run_align(global_config, sample_config,
                sorted_libraries_by_insert)
        sample_config = _run_qaTools(global_config, sample_config,
                sorted_libraries_by_insert)
        sample_config = _run_FRC(global_config, sample_config,
                sorted_libraries_by_insert)
Example #4
def run(global_config, sample_config):
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    sample_config["commands"] = ""
    if "tools" in sample_config:
        """If so, execute them one after the other in the specified order
        (might not work)"""
        for command in sample_config["tools"]:
            """with this I pick up at run time the correct function in the
            current module"""
            command_fn = getattr(sys.modules[__name__],
                                 "_run_{}".format(command))
            """Update sample config, each command return sample_config and if
            necessary it modifies it"""
            sample_config = command_fn(global_config, sample_config,
                                       sorted_libraries_by_insert)
    else:
        #run default pipeline for QC
        sample_config = _run_trimmomatic(global_config, sample_config,
                                         sorted_libraries_by_insert)
        sample_config = _run_fastqc(global_config, sample_config,
                                    sorted_libraries_by_insert)
        sample_config = _run_abyss(global_config, sample_config,
                                   sorted_libraries_by_insert)
    with open("{}.nougat".format(sample_config.get("output", "sample")),
              "w") as f:
        yaml.dump(sample_config, f)
Example #5
def run(global_config, sample_config):
    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
    #Check if the user has specified tools; if not, select the default list of tools
    if "tools" not in sample_config or len(sample_config["tools"]) == 0:
        sample_config["tools"] = ["soapdenovo"]
    #Execute the commands now
    for command in sample_config["tools"]:
        command_fn = getattr(sys.modules[__name__],
                "_run_{}".format(command))
        sample_config = command_fn(global_config, sample_config,
                sorted_libraries_by_insert)
Example #6
def run(global_config, sample_config):
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    #Check if the user has specified tools; if not, select the default list of tools
    if "tools" not in sample_config or len(sample_config["tools"]) == 0:
        sample_config["tools"] = ["soapdenovo"]
    #Execute the commands now
    for command in sample_config["tools"]:
        command_fn = getattr(sys.modules[__name__], "_run_{}".format(command))
        sample_config = command_fn(global_config, sample_config,
                                   sorted_libraries_by_insert)
Example #7
def _run_spades(global_config, sample_config, sorted_libraries_by_insert):
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "spades"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in the cabog case there is no executable
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    ########### HERE STARTS THE ASSEMBLER-SPECIFIC PART

    command = ""
    command += "{} ".format(programBIN)
    for option in program_options:
        command += "{} ".format(option)

    #creates the command on-the-fly
    peLibrary = 1
    mpLibrary = 1
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        if orientation=="innie" or orientation=="none":
            if read2 is None:
                command += "--pe{}-s {} ".format(peLibrary, read1)
            else:
                command += "--pe{}-1 {} --pe{}-2 {} ".format(peLibrary, read1,
                        peLibrary, read2)
            peLibrary += 1
        elif orientation=="outtie":
            command += "--mp{}-1 {} --mp{}-2 {} ".format(mpLibrary, read1,
                    mpLibrary, read2)
            mpLibrary += 1
        else:
            print("orientation{} not supported.... why the program did not",
                    "failed earlier?".format(orientation))

    command += "-o {} ".format(outputName)
    common.print_command(command)
    returnValue = 0
    if not common.check_dryrun(sample_config):
        assembler_stdOut = open("spades.stdOut", "a")
        assembler_stdErr = open("spades.stdErr", "a")
        returnValue = subprocess.call(command, stdout=assembler_stdOut,
                stderr=assembler_stdErr, shell=True)
    else:
        os.chdir("..")
        return sample_config

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if os.path.exists(os.path.join(outputName,"contigs.fasta")):
            subprocess.call(["cp", os.path.join(outputName,"contigs.fasta"),
                "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp", os.path.join(outputName,"scaffolds.fasta"),
                "{}.scf.fasta".format(outputName)])
            if not "keep_tmp_files" in flags:
                subprocess.call(["rm", "-r", outputName])
        else:
            print("something wrong with SPADES -> no contig file generated")
    else:
        print("SPADES terminated with an error. Please check running folder",
                "for more informations")

    os.chdir("..")
    return sample_config
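All the assembler functions in these examples call _prepare_folder_structure, which is not shown here. Judging from the call sites, it creates the per-assembler working directory and returns 0 on success (so the caller chdirs into it) and non-zero otherwise (so the step is skipped). A hypothetical sketch of that contract, not the actual implementation:

import os

def _prepare_folder_structure(assembler, assemblyDirectory):
    # hypothetical reconstruction based only on the call sites above
    if os.path.exists(assemblyDirectory):
        print("{} folder already present, skipping this step".format(assembler))
        return 1
    os.makedirs(assemblyDirectory)
    return 0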
Example #8
def _run_soapdenovo(global_config, sample_config, sorted_libraries_by_insert):
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "soapdenovo"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in the cabog case there is no executable
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    ########### HERE STARTS THE ASSEMBLER-SPECIFIC PART
    kmer = 54
    if "kmer" in sample_config:
        kmer = sample_config["kmer"]
    threads = ["-p", "8"] # default for UPPMAX
    if "threads" in sample_config:
        threads = ["-p", "{}".format(sample_config["threads"])]
    soap_config_file = open("configuration.txt", "w")
    soap_config_file.write("max_rd_len=150\n")
    #TODO make this a parameter in the options
    rank = 1
    for library, libraryInfo in sorted_libraries_by_insert:
        soap_config_file.write("[LIB]\n")
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        soap_config_file.write("avg_ins={}\n".format(insert))
        soap_config_file.write("rank={}\n".format(rank))
        rank += 1
        soap_config_file.write("map_len=30\n")
        if orientation=="innie" or orientation=="none":
            soap_config_file.write("asm_flags=3\n")
            soap_config_file.write("pair_num_cutoff=3\n")
            soap_config_file.write("reverse_seq=0\n")
            if read2 is None:
                soap_config_file.write("q={}\n".format(read1))
            else:
                soap_config_file.write("q1={}\n".format(read1))
                soap_config_file.write("q2={}\n".format(read2))
        elif orientation=="outtie":
            soap_config_file.write("asm_flags=2\n")
            soap_config_file.write("pair_num_cutoff=5\n")
            soap_config_file.write("reverse_seq=1\n")
            soap_config_file.write("q1={}\n".format(read1))
            soap_config_file.write("q2={}\n".format(read2))

    soap_config_file.close()
    assembler_stdOut = open("soap.stdOut", "w")
    assembler_stdErr = open("soap.stdErr", "w")
    os.makedirs(os.path.join(assemblyDirectory, "runSOAP"))
    os.chdir("runSOAP")
    #TODO : lots of missing options
    command = [programBIN , "all", "-s", "{}".format(os.path.join(assemblyDirectory, "configuration.txt")), "-K",
            "{}".format(kmer), "-L", "500", "-o", "soapAssembly", threads[0],
            threads[1] ]
    common.print_command(command)
    returnValue = 0
    if not common.check_dryrun(sample_config):
        returnValue = subprocess.call(command, stdout=assembler_stdOut,
                stderr=assembler_stdErr)
    else:
        os.chdir("..")
        os.chdir("..")
        return sample_config

    os.chdir("..")
    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if(os.path.exists(os.path.join("runSOAP","soapAssembly.scafSeq"))):
            subprocess.call(["cp", os.path.join("runSOAP",
                "soapAssembly.scafSeq"), "{}.scf.fasta".format(outputName)])
            subprocess.call(["cp", os.path.join("runSOAP",
                "soapAssembly.contig"), "{}.ctg.fasta".format(outputName)])
            if not "keep_tmp_files" in flags:
                subprocess.call(["rm", "-r", "runSOAP"])
        else:
            print("something wrong with SOAPdenovo -> no contig file generated")
    else:
        print("SOAPdenovo terminated with an error. Please check running",
                "folder for more informations")
        os.chdir("..")
        return sample_config
    os.chdir("..")
    return sample_config
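For reference, with one paired-end (innie) library of insert size 350 and one mate-pair (outtie) library of insert size 3000, the loop above would write a configuration.txt roughly like the following (read paths and insert sizes are placeholders):

max_rd_len=150
[LIB]
avg_ins=350
rank=1
map_len=30
asm_flags=3
pair_num_cutoff=3
reverse_seq=0
q1=lib_pe_R1.fastq.gz
q2=lib_pe_R2.fastq.gz
[LIB]
avg_ins=3000
rank=2
map_len=30
asm_flags=2
pair_num_cutoff=5
reverse_seq=1
q1=lib_mp_R1.fastq.gz
q2=lib_mp_R2.fastq.gz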
Example #9
def _run_masurca(global_config, sample_config,sorted_libraries_by_insert):
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "masurca"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in the cabog case there is no executable
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    ########### HERE STARTS THE ASSEMBLER-SPECIFIC PART

    masurca_config_file = open("configuration.txt", "w")
    masurca_config_file.write("DATA\n")
    allTheLetters = string.ascii_lowercase
    libraryPE    = "p"
    libraryPEnum = 0
    libraryMP    = "m"
    libraryMPnum = 0
    #TODO: single ended reads
    for library, libraryInfo in sorted_libraries_by_insert:
        read1=libraryInfo["pair1"]
        read2=libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        if orientation=="innie":
            if read2 is not None:
                configurationLine = "PE = {}{} {} {} {} {}".format(libraryPE,
                        allTheLetters[libraryPEnum], insert, std, read1, read2)
                masurca_config_file.write("{}\n".format(configurationLine))
                libraryPEnum += 1
                #TODO: check when more than 21 PE libraries are specified
        elif orientation=="outtie":
            configurationLine = "JUMP = {}{} {} {} {} {}".format(libraryMP, 
                    allTheLetters[libraryMPnum], insert, std, read1, read2)
            masurca_config_file.write("{}\n".format(configurationLine))
            libraryMPnum += 1
            #TODO: check when more than 21 PE libraries are specified
    masurca_config_file.write("END\n")

    masurca_config_file.write("\n")

    masurca_config_file.write("PARAMETERS\n")
    #this is k-mer size for deBruijn graph values between 25 and 101 are 
    #supported, auto will compute the optimal size based on the read data 
    #and GC content
    masurca_config_file.write("GRAPH_KMER_SIZE=auto\n")
    #set this to 1 for Illumina-only assemblies and to 0 if you have 2x or 
    #more long (Sanger, 454) reads
    masurca_config_file.write("USE_LINKING_MATES=1\n")
    #this parameter is useful if you have too many jumping library mates. 
    #See manual for explanation about settings based on genome length
    if sample_config["genomeSize"] > 10000000:
        masurca_config_file.write("LIMIT_JUMP_COVERAGE = 1000\n")
    else:
        masurca_config_file.write("LIMIT_JUMP_COVERAGE = 60\n")
    #these are the additional parameters to Celera Assembler.  do not worry 
    #about performance, number or processors or batch sizes -- these are 
    #computed automatically. for mammals do not set cgwErrorRate above 0.15!!!
    if sample_config["genomeSize"] > 1500000000:
        masurca_config_file.write("CA_PARAMETERS = ovlMerSize=30 \
                cgwErrorRate=0.15 ovlMemory=4GB\n")
    else:
        masurca_config_file.write("CA_PARAMETERS = ovlMerSize=30 \
                cgwErrorRate=0.25 ovlMemory=4GB\n")
    #auto-detected number of cpus to use
    threads = 8 # default for UPPMAX
    if "threads" in sample_config :
        threads = sample_config["threads"]
    masurca_config_file.write("NUM_THREADS= {}\n".format(threads))
    #this is the mandatory jellyfish hash size;
    #set it to about 10x the genome size.
    JF_SIZE = sample_config["genomeSize"] * 11
    masurca_config_file.write("JF_SIZE={}\n".format(JF_SIZE))
    #this specifies if we do (1) or do not (0) want to trim long runs of 
    #homopolymers (e.g. GGGGGGGG) from 3' read ends, use it for high GC genomes
    masurca_config_file.write("DO_HOMOPOLYMER_TRIM=0\n")
    masurca_config_file.write("END\n")
    masurca_config_file.write("\n")

    masurca_config_file.close()

    if common.check_dryrun(sample_config):
        os.chdir("..")
        return sample_config

    masurca_stdOut = open("masurca.stdOut", "w")
    masurca_stdErr = open("masurca.stdErr", "w")
    os.mkdir("runMASURCA")
    os.chdir("runMASURCA")
    command = [os.path.join(programBIN,"bin/masurca") , "../configuration.txt"]
    common.print_command(command)

    subprocess.call(command, stdout=masurca_stdOut, stderr=masurca_stdErr)
    if not os.path.exists("assemble.sh"):
        print("MaSuRCA: assemble.sh not created. Unknown failure")
        return sample_config
    command = ["./assemble.sh"]
    common.print_command(command)
    returnValue = subprocess.call(command, stdout=masurca_stdOut,
            stderr=masurca_stdErr)
    os.chdir("..")
    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if os.path.exists(os.path.join(
            "runMASURCA","CA/10-gapclose/genome.scf.fasta")):
            subprocess.call(["cp", os.path.join(
                "runMASURCA","CA/10-gapclose/genome.ctg.fasta"), 
                "{}.ctg.fasta".format(outputName) ])
            subprocess.call(["cp", os.path.join(
                "runMASURCA","CA/10-gapclose/genome.scf.fasta"),
                "{}.scf.fasta".format(outputName) ])
            if not "keep_tmp_files" in flags:
                subprocess.call(["rm", "-r", "runMASURCA"])
        else:
            print("something wrong with MaSuRCA -> no contig file generated")
    else:
        print("MaSuRCA terminated with an error. Please check running folder",
                "for more informations")
        return sample_config
    os.chdir("..")
    return sample_config
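As an illustration, for a 5 Mbp genome (genomeSize=5000000, hence JF_SIZE=55000000 and LIMIT_JUMP_COVERAGE=60) with one innie and one outtie library, the configuration.txt written above would look roughly like this (insert sizes, standard deviations and read paths are placeholders):

DATA
PE = pa 350 50 lib_pe_R1.fastq.gz lib_pe_R2.fastq.gz
JUMP = ma 3000 300 lib_mp_R1.fastq.gz lib_mp_R2.fastq.gz
END

PARAMETERS
GRAPH_KMER_SIZE=auto
USE_LINKING_MATES=1
LIMIT_JUMP_COVERAGE = 60
CA_PARAMETERS = ovlMerSize=30 cgwErrorRate=0.25 ovlMemory=4GB
NUM_THREADS= 8
JF_SIZE=55000000
DO_HOMOPOLYMER_TRIM=0
END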
Example #10
def _run_cabog(global_config, sample_config, sorted_libraries_by_insert):
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "cabog"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in the cabog case there is no executable
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    ########### HERE STARTS THE ASSEMBLER-SPECIFIC PART
    sys.path.insert(0, programBIN)
    libraries = 1
    for library, libraryInfo in sorted_libraries_by_insert:
        command_fastqToCA = os.path.join(programBIN, "fastqToCA")
        read1=libraryInfo["pair1"]
        read2=libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        command_fastqToCA += " -libraryname "
        command_fastqToCA += " {}_{}".format(outputName, libraries)
        command_fastqToCA += " -insertsize "
        command_fastqToCA += " {} {} ".format(insert,std)
        command_fastqToCA += " -technology "
        command_fastqToCA += " illumina "
        command_fastqToCA += " -type "
        command_fastqToCA += " illumina "
        if orientation=="innie" or orientation=="none" :
            command_fastqToCA += " -innie "
            if read2 is None:
                command_fastqToCA += " -reads "
                command_fastqToCA += " {} ".format(read1)
            else:
                command_fastqToCA += " -mates "
                command_fastqToCA += " {},{} ".format(read1, read2)
        elif orientation=="outtie":
            command_fastqToCA += " -outtie "
            command_fastqToCA += " -mates "
            command_fastqToCA += " {},{} ".format(read1, read2)
        command_fastqToCA += " > "
        command_fastqToCA += " {}_{}.frg ".format(outputName, libraries)

        common.print_command(command_fastqToCA)
        if not common.check_dryrun(sample_config):
            cabog_stdOut = open("cabog_fastqToCA.stdOut", "w")
            cabog_stdErr = open("cabogfastqToCA.stdErr", "w")
            subprocess.call(command_fastqToCA, stderr=cabog_stdErr, shell=True)
            cabog_stdOut.close()
            cabog_stdErr.close()
        libraries += 1
    command_runCA = os.path.join(programBIN, "runCA")
    command_runCA += "  -d runCABOGfolder -p {} *frg".format(outputName)
    common.print_command(command_runCA)
    if common.check_dryrun(sample_config):
        os.chdir("..")
        return sample_config
    returnValue = 0
    cabog_stdOut = open("cabog_runCA.stdOut", "w")
    cabog_stdErr = open("cabog_runCA.stdErr", "w")
    returnValue = subprocess.call(command_runCA, stdout=cabog_stdOut,
            stderr=cabog_stdErr, shell=True)
    flags = sample_config.get("flags", [])
    if returnValue == 0:
        #assembly succeeded: save the assembly and remove temporary files
        if os.path.exists(os.path.join("runCABOGfolder","9-terminator",
            "{}.ctg.fasta".format(outputName))):
            subprocess.call(["cp", os.path.join("runCABOGfolder","9-terminator",
                "{}.ctg.fasta".format(outputName)),
                "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp", os.path.join("runCABOGfolder","9-terminator",
                "{}.scf.fasta".format(outputName)),
                "{}.scf.fasta".format(outputName)])
            if not "keep_tmp_files" in flags:
                subprocess.call(["rm", "-r", "runCABOGfolder"])
        else:
            print("something wrong with CABOG -> no contig file generated")
    else:
        print("CABOG terminated with an error. Please check running folder",
                "for more informations")
    os.chdir("..")
    return sample_config
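For a single innie paired-end library, the string concatenation above produces (modulo redundant spaces) a shell command along these lines; the binary path, insert size and read paths are placeholders:

/path/to/cabog/bin/fastqToCA -libraryname sample_1 -insertsize 350 50 \
    -technology illumina -type illumina -innie \
    -mates lib_pe_R1.fastq.gz,lib_pe_R2.fastq.gz > sample_1.frg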
Example #11
def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "abyss"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in the abyss case there is no executable
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure("abyss", assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    ########### HERE STARTS THE ASSEMBLER-SPECIFIC PART
    assembler_stdOut = open("abyss.stdOut", "a")
    assembler_stdErr = open("abyss.stdErr", "a")
    program = os.path.join(programBIN, "abyss-pe")

    command = ""
    command += "{} ".format(program)
    threads = 8  # default for UPPMAX
    if "threads" in sample_config:
        threads = sample_config["threads"]
    command += "np={} ".format(threads)
    kmer = 54
    if "kmer" in sample_config:
        kmer = sample_config["kmer"]
    command += "k={} ".format(kmer)

    libraries = {}
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        if orientation == "innie" or orientation == "none":
            if read2 is None:
                # if this is the first single-end file, open the se= parameter
                if "se" not in libraries:
                    libraries["se"] = "se=\'"
                libraries["se"] += "{} ".format(read1)
            else:
                if not "lib" in libraries:
                    libraries["lib"] = {}
                libName = insert  # lib name is the insert size
                if not libName in libraries["lib"]:
                    libraries["lib"][libName] = ""
                libraries["lib"][libName] += "{} {} ".format(read1, read2)
        else:
            if not "mp" in libraries:
                libraries["mp"] = {}
            libName = format(insert)
            if not libName in libraries["mp"]:
                libraries["mp"][libName] = ""
            libraries["mp"][libName] += "{} {} ".format(read1, read2)
    #now create the command
    command += "name={} ".format(outputName)
    librariesSE = ""
    librariesPE = ""
    librariesMP = ""
    if "se" in libraries:
        libraries["se"] = libraries["se"] + "\'"
        librariesSE = libraries["se"]
    if "lib" in libraries:
        lib = "lib=\'"
        for libPE, libPEreads in sorted(libraries["lib"].items()):
            lib = lib + "lib{} ".format(libPE)
            librariesPE += " lib{}=\'{}\' ".format(libPE, libPEreads)
        lib = lib + "\' "
        command += "{} ".format(lib)
    if "mp" in libraries:
        mp = "mp=\'"
        for libMP, libMPreads in sorted(libraries["mp"].items()):
            mp = mp + "lib{} ".format(libMP)
            librariesMP += " lib{}=\'{}\' ".format(libMP, libMPreads)
        mp = mp + "\' "
        command += "{} ".format(mp)

    command += "{} ".format(librariesSE)
    command += "{} ".format(librariesPE)
    command += "{} ".format(librariesMP)

    common.print_command(command)
    if common.check_dryrun(sample_config):
        os.chdir("..")
        return sample_config

    os.makedirs(os.path.join(assemblyDirectory, "runABySS"))
    os.chdir("runABySS")
    returnValue = 0
    returnValue = subprocess.call(command,
                                  stdout=assembler_stdOut,
                                  stderr=assembler_stdErr,
                                  shell=True)
    os.chdir("..")
    flags = sample_config.get("flags", [])
    if returnValue == 0 and not common.check_dryrun(sample_config):
        if os.path.exists(
                os.path.join("runABySS", "{}-contigs.fa".format(outputName))):
            subprocess.call([
                "cp",
                os.path.join("runABySS", "{}-contigs.fa".format(outputName)),
                "{}.ctg.fasta".format(outputName)
            ])
            subprocess.call([
                "cp",
                os.path.join("runABySS", "{}-scaffolds.fa".format(outputName)),
                "{}.scf.fasta".format(outputName)
            ])
            if not "keep_tmp_files" in flags:
                subprocess.call(["rm", "-r", "runABySS"])
        else:
            print("something went wrong with ABySS -> no contig file "
                  "generated")
    else:
        print("ABySS terminated with an error. Please check the running",
              "folder for more information")
    os.chdir("..")
    return sample_config
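Put together, for one paired-end library of insert size 350 and one mate-pair library of insert size 3000, the code above ends up invoking abyss-pe roughly as follows (modulo extra whitespace; k and np are the defaults, name and paths are placeholders):

/path/to/abyss/abyss-pe np=8 k=54 name=sample \
    lib='lib350' mp='lib3000' \
    lib350='lib_pe_R1.fastq.gz lib_pe_R2.fastq.gz' \
    lib3000='lib_mp_R1.fastq.gz lib_mp_R2.fastq.gz'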
Example #12
def _run_qc_report(global_config, sample_config):
    """This function produces a pdf report and stores the important \
            resutls in a single folder"""

    sorted_libraries_by_insert = common._sort_libraries_by_insert(
            sample_config)
    ### retrieve all info needed to write the report
    sampleName = "sample"
    if "output" in sample_config:
        sampleName = sample_config["output"]
    projectName = "anonymous_project"
    if "projectName" in sample_config:
        projectName = sample_config["projectName"]

    currentDir  = os.getcwd()
    workingDir  = os.path.join(currentDir, "results")
    if not os.path.exists(workingDir):
        os.makedirs(workingDir)
    os.chdir(workingDir)

    reportDir   = os.path.join(workingDir, "report")
    if not os.path.exists(reportDir):
        os.makedirs(reportDir)

    PDFtitle = os.path.join(workingDir, "report",
            "{}.pdf".format(sample_config["output"]))

    # this cannot be done in rLab, which is why the helper was written in the first place
    TABLE_WIDTH = 540
    class MyTheme(DefaultTheme):
        doc = {
            'leftMargin': 25,
            'rightMargin': 25,
            'topMargin': 20,
            'bottomMargin': 25,
            'allowSplitting': False
            }
    # let's create the doc and specify title and author
    doc = pdf.Pdf('{} {}'.format(projectName, sampleName),
            'NGI-Stockholm, Science for Life Laboratory')

    # now we apply our theme
    doc.set_theme(MyTheme)
    # give me some space
    doc.add_spacer()
    # this header defaults to H1
    scriptDirectory = os.path.split(os.path.abspath(__file__))[0]
    logo_path = os.path.join(scriptDirectory, '../pictures/ngi_scilife.png')
    doc.add_image(logo_path, 540, 50, pdf.CENTER)
    # give me some space
    doc.add_spacer()

    doc.add_header('NGI-Stockholm -- Science For Life Laboratory')
    doc.add_header('Best-practice analysis for quality checking report')
    doc.add_header('{} -- {}'.format(projectName, sampleName))
    # give me some space
    doc.add_spacer()
    doc.add_paragraph("For sample {} belonging to the project {} "
            "NGI-Stockholm best-practice analysis for quality checking has "
            "been performed. For mate pair libraries produced with Nextera, "
            "best-practice analysis described at this address has been "
            "performed: http://res.illumina.com/documents/products/technotes/"
            "technote_nextera_matepair_data_processing.pdf".format(sampleName,
            projectName))
    doc.add_spacer()
    tools = ["trimmomatic", "fastqc", "abyss", "align", "kmergenie"]
    if "tools" in sample_config and len(sample_config["tools"]) > 0:
        tools = sample_config["tools"]
    doc.add_paragraph("The following tools have been employed \
            (tools are listed in order of execution):")
    bullet_list = []
    for tool in tools:
        if tool != "align":
            program_path = global_config["Tools"][tool]["bin"]
            bullet_list.append("{} : {}".format(tool, program_path))
        else:
            bullet_list.append("{} : {}".format(tool,
                    global_config["Tools"]["bwa"]["bin"]))
            bullet_list.append("{} : {}".format(tool,
                    global_config["Tools"]["samtools"]["bin"]))
            bullet_list.append("{} : {}".format(tool,
                    global_config["Tools"]["picard"]["bin"]))
    doc.add_list(bullet_list)
    doc.add_spacer()
    doc.add_paragraph("The results from each tool is reported in the "
            "following sections. Moreover you will find all the results and "
            "commands that have been run in the delivery folder on Uppmax")

    for tool in tools:
        doc.add_pagebreak()
        doc.add_header(tool.title(), pdf.H2)
        if tool == "trimmomatic":
            doc.add_paragraph("Reads (both paired and mate pairs) can "
                    "contain parts of the adapter sequence or, in the case of "
                    "mate pairs, part of the linker sequence. Illumina "
                    "recommends to remove the adapter before use of the reads "
                    "in any downstream analysis (this is mandatory for mate "
                    "pairs).")
            doc.add_paragraph("Adapter sequences removed are:")
            adapter_file = sample_config["adapters"]
            adapters     = []
            with open(adapter_file) as afile:
                lines       = afile.readlines()
                for index in range(1, len(lines), 2):
                    adapters.append(lines[index].rstrip())
            doc.add_list(adapters)
            doc.add_spacer()

            trimmomatic_table_part1 = [[sampleName, "#orig_pairs",
                    "#survived_pairs"]] # this is the header row
            trimmomatic_table_part2 = [[sampleName,"#survived_fw_only",
                    "#survived_rv_only", "#discarded"]]

            total_orig_pairs = 0
            total_survived_pairs = 0
            total_survived_fw_only = 0
            total_survived_rv_only = 0
            total_discarded = 0

            for library, libraryInfo in sorted_libraries_by_insert:
                runName = os.path.basename(libraryInfo["trimmomatic"]).split(
                        "_1_trimmomatic.stdErr")[0]
                with open(libraryInfo["trimmomatic"]) as trimmomatic_output:
                    lines       = trimmomatic_output.readlines()
                    result_line = lines[-2].rstrip()
                    match_string = re.compile("Input Read Pairs: (\d+) Both "
                            "Surviving: (\d+) \(.+\) Forward Only Surviving: "
                            "(\d+) \(.+\) Reverse Only Surviving: (\d+) \(.+\) "
                            "Dropped: (\d+) \(.+\)")
                    read_pairs = int(match_string.match(result_line).group(1))
                    survived_pairs = int(match_string.match(
                        result_line).group(2))
                    survived_fw_only = int(match_string.match(
                        result_line).group(3))
                    survived_rv_only = int(match_string.match(
                        result_line).group(4))
                    discarded        = int(match_string.match(
                        result_line).group(5))
                    read_pairs_perc =  "({0:.0f}%)".format(
                            (float(survived_pairs)/read_pairs) * 100)
                    survived_fw_only_perc = "({0:.0f}%)".format(
                            (float(survived_fw_only)/read_pairs) * 100)
                    survived_rv_only_perc = "({0:.0f}%)".format(
                            (float(survived_rv_only)/read_pairs) * 100)
                    survived_discarded_perc = "({0:.0f}%)".format(
                            (float(discarded)/read_pairs) * 100)

                    total_orig_pairs += read_pairs
                    total_survived_pairs += survived_pairs
                    total_survived_fw_only += survived_fw_only
                    total_survived_rv_only += survived_rv_only
                    total_discarded += discarded
                # these are the other rows
                trimmomatic_table_part1.append([runName,read_pairs,
                    "{} {}".format(survived_pairs, read_pairs_perc)])
                trimmomatic_table_part2.append([runName,
                    "{} {}".format(survived_fw_only, survived_fw_only_perc),
                    "{} {}".format(survived_rv_only, survived_rv_only_perc),
                    "{} {}".format(discarded, survived_discarded_perc)])
            survived_pairs_perc =  "({0:.0f}%)".format(
                    (float(total_survived_pairs)/total_orig_pairs) * 100)
            survived_survived_fw_only_perc =  "({0:.0f}%)".format(
                    (float(total_survived_fw_only)/total_orig_pairs) * 100)
            survived_survived_rv_only_perc =  "({0:.0f}%)".format(
                    (float(total_survived_rv_only)/total_orig_pairs) * 100)
            survived_discarded_perc =  "({0:.0f}%)".format(
                    (float(total_discarded)/total_orig_pairs) * 100)
            trimmomatic_table_part1.append(["total", total_orig_pairs,
                "{} {}".format(total_survived_pairs, survived_pairs_perc)])
            # last row is the sum
            trimmomatic_table_part2.append(["total",
                "{} {}".format(total_survived_fw_only,
                    survived_survived_fw_only_perc),
                "{} {}".format(total_survived_rv_only,
                    survived_survived_rv_only_perc),
                "{} {}".format(total_discarded, survived_discarded_perc)])
            doc.add_table(trimmomatic_table_part1, TABLE_WIDTH)
            doc.add_spacer()
            doc.add_table(trimmomatic_table_part2, TABLE_WIDTH)
            ##now save the trimmed reads
            trimmomaticDir = os.path.split(libraryInfo["trimmomatic"])[0]
            trimmomaticResultDir  = os.path.join(workingDir, "fastq_trimmed")
            if not os.path.exists(trimmomaticResultDir):
                os.makedirs(trimmomaticResultDir)
            filesToCopy = [os.path.join(trimmomaticDir, f) for f in \
                    os.listdir(trimmomaticDir) \
                    if (os.path.isfile(os.path.join(trimmomaticDir,f)) \
                    and re.search('.gz$',f))]
            for source in filesToCopy:
                dest = os.path.join("fastq_trimmed" , os.path.split(source)[1])
                if not os.path.isfile(dest):
                    shutil.copyfile(source, dest)

        if tool == "fastqc" and "fastqc" in sample_config:
            fastqc_dir = sample_config["fastqc"]
            for fastqc_run in [dir for dir in os.listdir(fastqc_dir) \
                    if os.path.isdir(os.path.join(fastqc_dir, dir))]:
                fastqc_run_dir = os.path.join(fastqc_dir, fastqc_run, "Images")
                doc.add_image(os.path.join(fastqc_run_dir,
                    "per_base_quality.png"), 400, 180, pdf.CENTER,
                    "{} -- Per Base Quality".format(fastqc_run))
                fastqc_run_dir = os.path.join(fastqc_dir, fastqc_run, "Images")
                doc.add_image(os.path.join(fastqc_run_dir,
                    "sequence_length_distribution.png"), 400, 180, pdf.CENTER,
                    "{} -- Sequence Length Distribution".format(fastqc_run))
            #If the fastqc results have not been copied yet, do it now
            if not os.path.exists("fastqc"):
                dirsToBeCopied = [os.path.join(fastqc_dir, f) for f in \
                        os.listdir(fastqc_dir) \
                        if os.path.isdir(os.path.join(fastqc_dir, f))]
                for source in dirsToBeCopied:
                    dest = os.path.join("fastqc", os.path.split(source)[1])
                    if not os.path.exists(dest):
                        shutil.copytree(source, dest)

        if tool == "abyss" and "abyss" in sample_config:
            doc.add_paragraph("A possible way to assess the complexity of a "
                    "library even in absence of a reference sequence is to "
                    "look at the kmer profile of the reads. The idea is to "
                    "count all the kmers (i.e., sequence of length k) that occur "
                    "in the reads. In this way it is possible to know how many "
                    "kmers occur 1,2,..., N times and represent this as a "
                    "plot. This plot tell us for each x, how many k-mers "
                    "(y-axis) are present in the dataset in exactly x-copies. "
                    "In an ideal world (no errors in sequencing, no bias, no "
                    "repeating regions) this plot should be as close as "
                    "possible to a gaussian distribution. In reality we will "
                    "always see a peak for x=1 (i.e., the errors) and another "
                    "peak close to the expected coverage. If the genome is "
                    "highly heterozygous a second peak at half of the coverage "
                    "can be expected.")
            kmer_1_200 = os.path.join(sample_config["abyss"],
                    "kmer_coverage.png")
            doc.add_image(kmer_1_200, 500, 300, pdf.CENTER,
                    "kmer profile with k={}.".format(sample_config["kmer"]))
            #copy the results into the results folder
            if not os.path.exists("kmer_analysis"):
                os.mkdir("kmer_analysis")
            kmerDir = sample_config["abyss"]
            filesToCopy = [os.path.join(kmerDir, f) for f in \
                    os.listdir(kmerDir) \
                    if (os.path.isfile(os.path.join(kmerDir,f)) \
                    and re.search('.png$',f))]
            filesToCopy.append(os.path.join(kmerDir, "histogram.hist"))
            for source in filesToCopy:
                dest = os.path.join("kmer_analysis", os.path.split(source)[1])
                if not os.path.exists(dest):
                    shutil.copyfile(source, dest)

        if tool == "align" and "alignments" in sample_config:
            alignments = sample_config["alignments"][0]
            alignment_path = alignments[1]
            alignment_prefix = alignments[2]
            align_dir = os.path.split(alignment_path)[0]
            doc.add_header("{} -- Collect Insert Size Metrics".format(
                sampleName) , pdf.H3)
            with open(os.path.join(align_dir,
                "{}.collectInsertSize.txt".format(alignment_prefix))) \
                as collectInsertSize:
                    lines = collectInsertSize.readlines()
                    line = lines[6].rstrip().split("\t")
                    # first row is the table header
                    insertSize_table = [[line[7], line[6], line[4], line[5]]]
                    line = lines[7].rstrip().split("\t")
                    insertSize_table.append([line[7], line[6], line[4],
                        line[5]])
                    line = lines[8].rstrip().split("\t")
                    insertSize_table.append([line[7], line[6], line[4],
                        line[5]])
                    line = lines[9].rstrip().split("\t")
                    insertSize_table.append([line[7], line[6], line[4],
                        line[5]])
                    doc.add_table(insertSize_table, TABLE_WIDTH)
            doc.add_spacer()
            full_path_to_pdf =  os.path.join(align_dir,
                    "{}.collectInsertSize.pdf".format(alignment_prefix))
            doc.add_paragraph("Insert size plot can be found in the result \
                    directory: {}".format(os.path.join("alignments",
                    "{}.collectInsertSize.pdf".format(alignment_prefix))))
            doc.add_spacer()
            doc.add_header("{} -- Duplicate Metrics".format(sampleName),
                    pdf.H3)
            with open(os.path.join(align_dir,
                "{}.markDuplicates.txt".format(alignment_prefix))) as \
                markDuplicates:
                    lines = markDuplicates.readlines()
                    line = lines[6].rstrip().split("\t")
                    # first row is the table header
                    duplication_table_part1 = [line[0:3]]
                    duplication_table_part2 = [line[4:6]]
                    duplication_table_part3 = [line[7:9]]
                    line  = lines[7].rstrip().split("\t")
                    duplication_table_part1.append(line[0:3])
                    duplication_table_part2.append(line[4:6])
                    duplication_table_part3.append(line[7:9])
            doc.add_table(duplication_table_part1, TABLE_WIDTH)
            doc.add_spacer()
            doc.add_table(duplication_table_part2, TABLE_WIDTH)
            doc.add_spacer()
            doc.add_table(duplication_table_part3, TABLE_WIDTH)
            doc.add_spacer()
            full_path_to_bam =  os.path.join(align_dir,
                    "{}_noDup.bam".format(alignment_prefix))
            doc.add_paragraph("Bam file with marked duplicate reads can be \
                    found at: {}".format(os.path.join("alignments",
                    "{}_noDup.bam".format(alignment_prefix))))
            doc.add_spacer()
            #copy the results into the results folder
            if not os.path.exists("alignments"):
                os.mkdir("alignments")
            filesToCopy = [os.path.join(align_dir, f) for f in \
                    os.listdir(align_dir) \
                    if (os.path.isfile(os.path.join(align_dir,f)) \
                    and re.search('{}'.format(alignment_prefix),f))]
            for source in filesToCopy:
                dest = os.path.join("alignments", os.path.split(source)[1])
                if not os.path.exists(dest):
                    shutil.copyfile(source, dest)

        if tool == "kmergenie" and "kmergenie" in sample_config:
            doc.add_paragraph("Assemblers using a de Bruijn graph strategy "
                    "for contig construction (such as Velvet, ABySS and "
                    "SOAPdenovo) fractures the reads into k-sized substrings "
                    "(k-mers). The k-mer size is vital for the performance of "
                    "these assemblers, and is usually selected considering "
                    "several trade-offs between the size and accuracy of the "
                    "produced contigs. Some assemblers choose the k-mer size "
                    "automatically or builds several assemblies (using "
                    "different k-mers) and / or relies on user input. "
                    "Kmergenie is a lightweight program that suggests a best "
                    "k-mer size based on their relative abundance in the "
                    "genomic reads.")
            kmerdir = sample_config["kmergenie"]
            doc.add_image(os.path.join(kmerdir,"histograms.dat.png"), 400, 300, 
                    pdf.CENTER, ("The plot should be roughly concave and have "
                            "a clear global maximum, if not the predicted best "
                            "k is likely to be inaccurate"))
            #copy everything to results
            dest = os.path.join(os.getcwd(), "kmergenie")
            if not os.path.exists(dest):
                shutil.copytree(kmerdir, dest)
    doc.render(PDFtitle)
    
    # Copy the pipeline files and commands run to the report directory
    filesToCopy = glob.glob(currentDir+"/{}_QCcontrol.*".format(sampleName))
    for cfile in filesToCopy:
        shutil.copyfile(cfile, os.path.join(reportDir, os.path.basename(cfile)))

    with open(os.path.join(reportDir, "commands.txt"), "w") as f:
        f.write(sample_config.get("commands", ""))

    os.chdir(currentDir)
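The Trimmomatic section above pulls its numbers out of the second-to-last line of each *_trimmomatic.stdErr file with a regular expression; a standalone sketch of that parsing on a made-up summary line (all counts are invented):

import re

result_line = ("Input Read Pairs: 1000000 Both Surviving: 950000 (95.00%) "
               "Forward Only Surviving: 30000 (3.00%) Reverse Only Surviving: "
               "15000 (1.50%) Dropped: 5000 (0.50%)")
match_string = re.compile(r"Input Read Pairs: (\d+) Both Surviving: (\d+) "
                          r"\(.+\) Forward Only Surviving: (\d+) \(.+\) "
                          r"Reverse Only Surviving: (\d+) \(.+\) Dropped: "
                          r"(\d+) \(.+\)")
m = match_string.match(result_line)
read_pairs = int(m.group(1))      # 1000000
survived_pairs = int(m.group(2))  # 950000
print("{} of {} pairs survived ({:.0f}%)".format(
    survived_pairs, read_pairs, 100.0 * survived_pairs / read_pairs))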
Example #13
def _run_spades(global_config, sample_config, sorted_libraries_by_insert):
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "spades"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in the cabog case there is no executable
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    ########### HERE STARTS THE ASSEMBLER-SPECIFIC PART

    command = ""
    command += "{} ".format(programBIN)
    for option in program_options:
        command += "{} ".format(option)

    #creates the command on-the-fly
    peLibrary = 1
    mpLibrary = 1
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        if orientation == "innie" or orientation == "none":
            if read2 is None:
                command += "--pe{}-s {} ".format(peLibrary, read1)
            else:
                command += "--pe{}-1 {} --pe{}-2 {} ".format(
                    peLibrary, read1, peLibrary, read2)
            peLibrary += 1
        elif orientation == "outtie":
            command += "--mp{}-1 {} --mp{}-2 {} ".format(
                mpLibrary, read1, mpLibrary, read2)
            mpLibrary += 1
        else:
            print("orientation{} not supported.... why the program did not",
                  "failed earlier?".format(orientation))

    command += "-o {} ".format(outputName)
    common.print_command(command)
    returnValue = 0
    if not common.check_dryrun(sample_config):
        assembler_stdOut = open("spades.stdOut", "a")
        assembler_stdErr = open("spades.stdErr", "a")
        returnValue = subprocess.call(command,
                                      stdout=assembler_stdOut,
                                      stderr=assembler_stdErr,
                                      shell=True)
    else:
        os.chdir("..")
        return sample_config

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if os.path.exists(os.path.join(outputName, "contigs.fasta")):
            subprocess.call([
                "cp",
                os.path.join(outputName, "contigs.fasta"),
                "{}.ctg.fasta".format(outputName)
            ])
            subprocess.call([
                "cp",
                os.path.join(outputName, "scaffolds.fasta"),
                "{}.scf.fasta".format(outputName)
            ])
            if not "keep_tmp_files" in flags:
                subprocess.call(["rm", "-r", outputName])
        else:
            print("something wrong with SPADES -> no contig file generated")
    else:
        print("SPADES terminated with an error. Please check running folder",
              "for more informations")

    os.chdir("..")
    return sample_config
Example #14
def _run_soapdenovo(global_config, sample_config, sorted_libraries_by_insert):
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "soapdenovo"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in the cabog case there is no executable
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    ########### HERE STARTS THE ASSEMBLER-SPECIFIC PART
    kmer = 54
    if "kmer" in sample_config:
        kmer = sample_config["kmer"]
    threads = ["-p", "8"]  # default for UPPMAX
    if "threads" in sample_config:
        threads = ["-p", "{}".format(sample_config["threads"])]
    soap_config_file = open("configuration.txt", "w")
    soap_config_file.write("max_rd_len=150\n")
    #TODO make this a parameter in the options
    rank = 1
    for library, libraryInfo in sorted_libraries_by_insert:
        soap_config_file.write("[LIB]\n")
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        soap_config_file.write("avg_ins={}\n".format(insert))
        soap_config_file.write("rank={}\n".format(rank))
        rank += 1
        soap_config_file.write("map_len=30\n")
        if orientation == "innie" or orientation == "none":
            soap_config_file.write("asm_flags=3\n")
            soap_config_file.write("pair_num_cutoff=3\n")
            soap_config_file.write("reverse_seq=0\n")
            if read2 is None:
                soap_config_file.write("q={}\n".format(read1))
            else:
                soap_config_file.write("q1={}\n".format(read1))
                soap_config_file.write("q2={}\n".format(read2))
        elif orientation == "outtie":
            soap_config_file.write("asm_flags=2\n")
            soap_config_file.write("pair_num_cutoff=5\n")
            soap_config_file.write("reverse_seq=1\n")
            soap_config_file.write("q1={}\n".format(read1))
            soap_config_file.write("q2={}\n".format(read2))

    soap_config_file.close()
    assembler_stdOut = open("soap.stdOut", "w")
    assembler_stdErr = open("soap.stdErr", "w")
    os.makedirs(os.path.join(assemblyDirectory, "runSOAP"))
    os.chdir("runSOAP")
    #TODO : lots of missing options
    command = [
        programBIN, "all", "-s",
        "{}".format(os.path.join(assemblyDirectory, "configuration.txt")),
        "-K", "{}".format(kmer), "-L", "500", "-o", "soapAssembly", threads[0],
        threads[1]
    ]
    common.print_command(command)
    returnValue = 0
    if not common.check_dryrun(sample_config):
        returnValue = subprocess.call(command,
                                      stdout=assembler_stdOut,
                                      stderr=assembler_stdErr)
    else:
        os.chdir("..")
        os.chdir("..")
        return sample_config

    os.chdir("..")
    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if (os.path.exists(os.path.join("runSOAP", "soapAssembly.scafSeq"))):
            subprocess.call([
                "cp",
                os.path.join("runSOAP", "soapAssembly.scafSeq"),
                "{}.scf.fasta".format(outputName)
            ])
            subprocess.call([
                "cp",
                os.path.join("runSOAP", "soapAssembly.contig"),
                "{}.ctg.fasta".format(outputName)
            ])
            if not "keep_tmp_files" in flags:
                subprocess.call(["rm", "-r", "runSOAP"])
        else:
            print("something went wrong with SOAPdenovo -> no contig file "
                  "generated")
    else:
        print("SOAPdenovo terminated with an error. Please check the running",
              "folder for more information")
        os.chdir("..")
        return sample_config
    os.chdir("..")
    return sample_config
Example #15
def _run_masurca(global_config, sample_config, sorted_libraries_by_insert):
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "masurca"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in the cabog case there is no executable
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    ########### HERE STARTS THE ASSEMBLER-SPECIFIC PART

    masurca_config_file = open("configuration.txt", "w")
    masurca_config_file.write("DATA\n")
    allTheLetters = string.ascii_lowercase
    libraryPE = "p"
    libraryPEnum = 0
    libraryMP = "m"
    libraryMPnum = 0
    #TODO: single ended reads
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        if orientation == "innie":
            if read2 is not None:
                configurationLine = "PE = {}{} {} {} {} {}".format(
                    libraryPE, allTheLetters[libraryPEnum], insert, std, read1,
                    read2)
                masurca_config_file.write("{}\n".format(configurationLine))
                libraryPEnum += 1
                #TODO: check when more than 21 PE libraries are specified
        elif orientation == "outtie":
            configurationLine = "JUMP = {}{} {} {} {} {}".format(
                libraryMP, allTheLetters[libraryMPnum], insert, std, read1,
                read2)
            masurca_config_file.write("{}\n".format(configurationLine))
            libraryMPnum += 1
            #TODO: check what happens when more than 21 MP libraries are specified
    masurca_config_file.write("END\n")

    masurca_config_file.write("\n")

    masurca_config_file.write("PARAMETERS\n")
    #this is the k-mer size for the de Bruijn graph; values between 25 and 101
    #are supported, and "auto" computes the optimal size based on the read
    #data and GC content
    masurca_config_file.write("GRAPH_KMER_SIZE=auto\n")
    #set this to 1 for Illumina-only assemblies and to 0 if you have 2x or
    #more long (Sanger, 454) reads
    masurca_config_file.write("USE_LINKING_MATES=1\n")
    #this parameter is useful if you have too many jumping library mates.
    #See manual for explanation about settings based on genome length
    if sample_config["genomeSize"] > 10000000:
        masurca_config_file.write("LIMIT_JUMP_COVERAGE = 1000\n")
    else:
        masurca_config_file.write("LIMIT_JUMP_COVERAGE = 60\n")
    #these are the additional parameters for the Celera Assembler. Do not
    #worry about performance, number of processors or batch sizes -- these
    #are computed automatically. For mammals do not set cgwErrorRate above 0.15!
    if sample_config["genomeSize"] > 1500000000:
        masurca_config_file.write("CA_PARAMETERS = ovlMerSize=30 \
                cgwErrorRate=0.15 ovlMemory=4GB\n")
    else:
        masurca_config_file.write("CA_PARAMETERS = ovlMerSize=30 \
                cgwErrorRate=0.25 ovlMemory=4GB\n")
    #number of CPUs to use (8 by default, overridden by the sample config)
    threads = 8  # default for UPPMAX
    if "threads" in sample_config:
        threads = sample_config["threads"]
    masurca_config_file.write("NUM_THREADS= {}\n".format(threads))
    #mandatory Jellyfish hash size; set this to about 10x the genome size
    JF_SIZE = sample_config["genomeSize"] * 11
    masurca_config_file.write("JF_SIZE={}\n".format(JF_SIZE))
    #this specifies if we do (1) or do not (0) want to trim long runs of
    #homopolymers (e.g. GGGGGGGG) from 3' read ends; use it for high-GC genomes
    masurca_config_file.write("DO_HOMOPOLYMER_TRIM=0\n")
    masurca_config_file.write("END\n")
    masurca_config_file.write("\n")

    masurca_config_file.close()

    if common.check_dryrun(sample_config):
        os.chdir("..")
        return sample_config

    masurca_stdOut = open("masurca.stdOut", "w")
    masurca_stdErr = open("masurca.stdErr", "w")
    os.mkdir("runMASURCA")
    os.chdir("runMASURCA")
    command = [os.path.join(programBIN, "bin/masurca"), "../configuration.txt"]
    common.print_command(command)

    subprocess.call(command, stdout=masurca_stdOut, stderr=masurca_stdErr)
    if not os.path.exists("assemble.sh"):
        print("MaSuRCA: assemble.sh not created. Unknown failure")
        return sample_config
    command = ["./assemble.sh"]
    common.print_command(command)
    returnValue = subprocess.call(command,
                                  stdout=masurca_stdOut,
                                  stderr=masurca_stdErr)
    os.chdir("..")
    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if os.path.exists(
                os.path.join("runMASURCA", "CA/10-gapclose/genome.scf.fasta")):
            subprocess.call([
                "cp",
                os.path.join("runMASURCA", "CA/10-gapclose/genome.ctg.fasta"),
                "{}.ctg.fasta".format(outputName)
            ])
            subprocess.call([
                "cp",
                os.path.join("runMASURCA", "CA/10-gapclose/genome.scf.fasta"),
                "{}.scf.fasta".format(outputName)
            ])
            if not "keep_tmp_files" in flags:
                subprocess.call(["rm", "-r", "runMASURCA"])
        else:
            print("something wrong with MaSuRCA -> no contig file generated")
    else:
        print("MaSuRCA terminated with an error. Please check running folder",
              "for more informations")
        return sample_config
    os.chdir("..")
    return sample_config
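
# For illustration only: the configuration.txt written by _run_masurca above
# would look roughly like the text below for a hypothetical 5 Mbp genome with
# one innie (PE) and one outtie (JUMP) library; all paths, insert sizes and
# standard deviations are made up (JF_SIZE is genomeSize * 11).
EXAMPLE_MASURCA_CONFIG = """\
DATA
PE = pa 350 50 /path/to/libA_1.fastq /path/to/libA_2.fastq
JUMP = ma 3000 300 /path/to/libB_1.fastq /path/to/libB_2.fastq
END

PARAMETERS
GRAPH_KMER_SIZE=auto
USE_LINKING_MATES=1
LIMIT_JUMP_COVERAGE = 60
CA_PARAMETERS = ovlMerSize=30 cgwErrorRate=0.25 ovlMemory=4GB
NUM_THREADS= 8
JF_SIZE=55000000
DO_HOMOPOLYMER_TRIM=0
END
"""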
Example #16
0
def _run_cabog(global_config, sample_config, sorted_libraries_by_insert):
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "cabog"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in the cabog case there is no executable; programBIN is the installation directory
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    ########### HERE STARTS THE ASSEMBLER-SPECIFIC PART
    sys.path.insert(0, programBIN)
    libraries = 1
    for library, libraryInfo in sorted_libraries_by_insert:
        command_fastqToCA = os.path.join(programBIN, "fastqToCA")
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        command_fastqToCA += " -libraryname "
        command_fastqToCA += " {}_{}".format(outputName, libraries)
        command_fastqToCA += " -insertsize "
        command_fastqToCA += " {} {} ".format(insert, std)
        command_fastqToCA += " -technology "
        command_fastqToCA += " illumina "
        command_fastqToCA += " -type "
        command_fastqToCA += " illumina "
        if orientation == "innie" or orientation == "none":
            command_fastqToCA += " -innie "
            if read2 is None:
                command_fastqToCA += " -reads "
                command_fastqToCA += " {} ".format(read1)
            else:
                command_fastqToCA += " -mates "
                command_fastqToCA += " {},{} ".format(read1, read2)
        elif orientation == "outtie":
            command_fastqToCA += " -outtie "
            command_fastqToCA += " -mates "
            command_fastqToCA += " {},{} ".format(read1, read2)
        command_fastqToCA += " > "
        command_fastqToCA += " {}_{}.frg ".format(outputName, libraries)

        common.print_command(command_fastqToCA)
        if not common.check_dryrun(sample_config):
            cabog_stdOut = open("cabog_fastqToCA.stdOut", "w")
            cabog_stdErr = open("cabogfastqToCA.stdErr", "w")
            subprocess.call(command_fastqToCA, stderr=cabog_stdErr, shell=True)
            cabog_stdOut.close()
            cabog_stdErr.close()
        libraries += 1
    command_runCA = os.path.join(programBIN, "runCA")
    command_runCA += "  -d runCABOGfolder -p {} *frg".format(outputName)
    common.print_command(command_runCA)
    if common.check_dryrun(sample_config):
        os.chdir("..")
        return sample_config
    returnValue = 0
    cabog_stdOut = open("cabog_runCA.stdOut", "w")
    cabog_stdErr = open("cabog_runCA.stdErr", "w")
    returnValue = subprocess.call(command_runCA,
                                  stdout=cabog_stdOut,
                                  stderr=cabog_stdErr,
                                  shell=True)
    flags = sample_config.get("flags", [])
    if returnValue == 0:
        #assembly succeeded: save the assembly and remove temporary files
        if os.path.exists(
                os.path.join("runCABOGfolder", "9-terminator",
                             "{}.ctg.fasta".format(outputName))):
            subprocess.call([
                "cp",
                os.path.join("runCABOGfolder", "9-terminator",
                             "{}.ctg.fasta".format(outputName)),
                "{}.ctg.fasta".format(outputName)
            ])
            subprocess.call([
                "cp",
                os.path.join("runCABOGfolder", "9-terminator",
                             "{}.scf.fasta".format(outputName)),
                "{}.scf.fasta".format(outputName)
            ])
            if not "keep_tmp_files" in flags:
                subprocess.call(["rm", "-r", "runCABOGfolder"])
        else:
            print("something wrong with CABOG -> no contig file generated")
    else:
        print("CABOG terminated with an error. Please check running folder",
              "for more informations")
    os.chdir("..")
    return sample_config
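
# A sketch of the same fastqToCA call without shell=True: the arguments are
# passed as a list and the shell ">" redirection is replaced by the stdout=
# argument of subprocess.call(). The flags mirror the ones assembled above;
# fastq_to_ca() is a hypothetical helper and the paths are illustrative only.
import os
import subprocess

def fastq_to_ca(program_bin, libname, insert, std, read1, read2, frg_file):
    command = [
        os.path.join(program_bin, "fastqToCA"),
        "-libraryname", libname,
        "-insertsize", str(insert), str(std),
        "-technology", "illumina",
        "-type", "illumina",
        "-innie",
        "-mates", "{},{}".format(read1, read2),
    ]
    # write the fragment file via stdout instead of a shell redirection
    with open(frg_file, "w") as frg, open(frg_file + ".stdErr", "w") as err:
        return subprocess.call(command, stdout=frg, stderr=err)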
Example #17
0
def _run_allpaths(global_config, sample_config, sorted_libraries_by_insert):
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler                  = "allpaths"
    outputName                 = sample_config["output"]
    currentDirectory           = os.getcwd()
    assemblyDirectory          = os.path.join(currentDirectory, assembler)
    # in the allpaths case there is no executable
    programBIN                 = global_config["Tools"][assembler]["bin"]
    program_options            = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
    if _prepare_folder_structure("allpaths", assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    inGroups_file = open("in_groups.csv", "w")
    inLibs_file   = open("in_libs.csv", "w")
    inGroups_file.write("group_name, library_name, file_name\n")
    inLibs_file.write("library_name, project_name, organism_name, type, "
            "paired, frag_size, frag_stddev, insert_size, insert_stddev, "
            "read_orientation,genomic_start, genomic_end\n")
    librariesForInLibs     = []
    librariesForInLibsDict = {}
    group_name             = 1;
    for library, libraryInfo in sorted_libraries_by_insert:
        read1       =libraryInfo["pair1"]
        read2       =libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert      = libraryInfo["insert"]
        std         = libraryInfo["std"]
        if orientation=="innie":
            path, fqfile=os.path.split(read1)
            if "_1.fastq" in fqfile:
                fqfile = fqfile.replace("_1.fastq", "_?.fastq")
            elif "_R1_" in fqfile:
                fqfile = fqfile.replace("_R1_", "_R?_")
            else:
                print("error file format not supported {}".format(fqfile))
                return sample_config
            inGroups_file.write("PE{}, lib{}, {}\n".format(group_name, insert,
                os.path.join(path, fqfile)))
            group_name += 1
            if insert not in librariesForInLibsDict:
                librariesForInLibsDict[insert] = insert
                librariesForInLibs.append("lib{}, genome, genome, fragment, 1, "
                        "{}, {}, , , inward, 0, 0\n".format(insert,insert, std))
        elif orientation=="outtie":
            path, fqfile = os.path.split(read1)
            if "_1.fastq" in fqfile:
                fqfile = fqfile.replace("_1.fastq", "_?.fastq")
            elif "_R1_" in fqfile:
                fqfile = fqfile.replace("_R1_", "_R?_")
            else:
                print("error file format not supported {}".format(file))
                return sample_config
            inGroups_file.write("MP{}, lib{}, {}\n".format(group_name, insert,
                os.path.join(path, fqfile)))
            group_name += 1
            if insert not in librariesForInLibsDict:
                librariesForInLibsDict[insert] = insert
                librariesForInLibs.append("lib{}, genome, genome, fragment, 1, "
                        ", , {}, {}, outward, 0, 0\n".format(insert,insert, std))
        else:
            print("all paths support only innies and outties")
    inGroups_file.close()
    for lib in librariesForInLibs:
        inLibs_file.write(lib)
    inLibs_file.close()
    #NOW RUN ALLPATHS FOR REAL
    program=os.path.join(programBIN, "PrepareAllPathsInputs.pl")
    os.mkdir("data_dir")
    data_dir = os.path.join(assemblyDirectory, "data_dir")
    ploidy = "PLOIDY=1"
    if len(program_options) > 0:
        if len(program_options) >1:
            print("Running ALlpaths only one parameter accepted as option",
                    "here: PLOIDY=2")
            return sample_config
        if program_options[0] == "PLOIDY=2":
            ploidy = "PLOIDY=2"
        else:
            print("Running ALlpaths only one parameter accepted as option",
                    "here: PLOIDY=2")
            return sample_config

    command = [program , "DATA_DIR={}".format(data_dir), ploidy,
            "PICARD_TOOLS_DIR={}".format(
            global_config["Tools"]["picard"]["bin"]),  
            "FORCE_PHRED=True", "PHRED_64=False",
            "IN_GROUPS_CSV={}".format(os.path.join(assemblyDirectory,"in_groups.csv")),
            "IN_LIBS_CSV={}".format(os.path.join(assemblyDirectory,"in_libs.csv"))]
    if common.check_dryrun(sample_config):
        common.print_command(command)
        program = os.path.join(programBIN, "RunAllPathsLG")
        command = [program, "PRE={}".format(assemblyDirectory), "REFERENCE_NAME=.", "DATA_SUBDIR=data_dir",
                "RUN=allpaths", "SUBDIR=run"]
        common.print_command(command)
        os.chdir("..")
        return sample_config
    assembler_stdOut = open("allpaths_PrepareAllPathsInputs.stdOut", "w")
    assembler_stdErr = open("allpaths_PrepareAllPathsInputs.stdErr", "w")
    common.print_command(command)
    returnValue = subprocess.call(command,  stdout=assembler_stdOut, 
            stderr=assembler_stdErr)
    assembler_stdOut.close()
    assembler_stdErr.close()
    flags = sample_config.get("flags", [])
    if returnValue == 0:
        program = os.path.join(programBIN, "RunAllPathsLG")
        command = [program, "PRE={}".format(assemblyDirectory), "REFERENCE_NAME=.", "DATA_SUBDIR=data_dir",
                "RUN=allpaths", "SUBDIR=run", "HAPLOIDIFY=True"]
        common.print_command(command)
        assembler_stdOut = open("allpaths_RunAllPathsLG.stdOut", "w")
        assembler_stdErr = open("allpaths_RunAllPathsLG.stdErr", "w")
        returnValue = subprocess.call(command,  stdout=assembler_stdOut,
                stderr=assembler_stdErr)
        if returnValue != 0:
            print("ALLPATHS RunAllPathsLG terminated with an error. Please",
                    "check running folder for more informations")
            os.chdir("..")
            return sample_config
        else: # save results
            assembly_dir = os.path.join("data_dir", "allpaths", "ASSEMBLIES",
                    "run")
            if os.path.exists(os.path.join(assembly_dir,
                "final.assembly.fasta")):
                exit_code = subprocess.call(["cp", os.path.join(assembly_dir,
                    "final.contigs.fasta"), "{}.ctg.fasta".format(outputName)])
                exit_code += subprocess.call(["cp", os.path.join(assembly_dir,
                    "final.assembly.fasta"), "{}.scf.fasta".format(outputName)])
                if not "keep_tmp_files" in flags and exit_code == 0:
                    subprocess.call(["rm", "-r", "data_dir"])
            else:
                print("something wrong with Allpaths > no contig file generated")
                os.chdir("..")
                return sample_config
    else:
        print("ALLPATHS PrepareAllPathInputs terminated with an error. "
                "Please check running folder for more informations")
        os.chdir("..")
        return sample_config
    os.chdir("..")
    return sample_config
Example #18
0
def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler                  = "abyss"
    outputName                 = sample_config["output"]
    currentDirectory           = os.getcwd()
    assemblyDirectory          = os.path.join(currentDirectory, assembler)
    # in the abyss case there is no executable
    programBIN                 = global_config["Tools"][assembler]["bin"]
    program_options            = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
    if _prepare_folder_structure("abyss", assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    ########### HERE STARTS THE ASSEMBLER-SPECIFIC PART
    assembler_stdOut = open("abyss.stdOut", "a")
    assembler_stdErr = open("abyss.stdErr", "a")
    program=os.path.join(programBIN, "abyss-pe")

    command = ""
    command += "{} ".format(program)
    threads = 8 # default for UPPMAX
    if "threads" in sample_config :
        threads = sample_config["threads"]
    command += "np={} ".format(threads)
    kmer = 54
    if "kmer" in sample_config:
        kmer = sample_config["kmer"]
    command += "k={} ".format(kmer)

    libraries = {}
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        if orientation=="innie" or orientation=="none":
            if read2 is None:
                # check if this is the first single-end file added
                if "se" not in libraries:
                    libraries["se"] = "se=\'"
                libraries["se"] = libraries["se"] + read1 + " "
            else:
                if not "lib" in libraries:
                    libraries["lib"] = {}
                libName = insert # lib name is the insert size
                if not libName in libraries["lib"]:
                    libraries["lib"][libName] = ""
                libraries["lib"][libName] +=  "{} {} ".format(read1, read2)
        else:
            if not "mp" in libraries:
                libraries["mp"] = {}
            libName = format(insert)
            if not libName in libraries["mp"]:
                libraries["mp"][libName] = ""
            libraries["mp"][libName] +=  "{} {} ".format(read1, read2)
    #now create the command
    command += "name={} ".format(outputName)
    librariesSE = ""
    librariesPE = ""
    librariesMP = ""
    if "se" in libraries:
        libraries["se"] = libraries["se"] + "\'"
        librariesSE = libraries["se"]
    if "lib" in libraries:
        lib="lib=\'"
        for libPE, libPEreads in sorted(libraries["lib"].items()):
            lib = lib + "lib{} ".format(libPE)
            librariesPE += " lib{}=\'{}\' ".format(libPE,libPEreads)
        lib=lib + "\' "
        command += "{} ".format(lib)
    if "mp" in libraries:
        mp="mp=\'"
        for libMP, libMPreads in sorted(libraries["mp"].items()):
            mp = mp + "lib{} ".format(libMP)
            librariesMP += " lib{}=\'{}\' ".format(libMP,libMPreads)
        mp=mp + "\' "
        command += "{} ".format(mp)

    command += "{} ".format(librariesSE)
    command += "{} ".format(librariesPE)
    command += "{} ".format(librariesMP)

    common.print_command(command)
    if common.check_dryrun(sample_config):
        os.chdir("..")
        return sample_config

    os.makedirs(os.path.join(assemblyDirectory, "runABySS"))
    os.chdir("runABySS")
    returnValue = 0
    returnValue = subprocess.call(command, stdout=assembler_stdOut,
            stderr=assembler_stdErr, shell=True)
    os.chdir("..")
    flags = sample_config.get("flags", [])
    if returnValue == 0 and not common.check_dryrun(sample_config):
        if os.path.exists(os.path.join("runABySS","{}-contigs.fa".format(
            outputName))):
            subprocess.call(["cp", os.path.join("runABySS",
                "{}-contigs.fa".format(outputName)),
                "{}.ctg.fasta".format(outputName) ])
            subprocess.call(["cp", os.path.join("runABySS",
                "{}-scaffolds.fa".format(outputName)),
                "{}.scf.fasta".format(outputName) ])
            if not "keep_tmp_files" in flags:
                subprocess.call(["rm", "-r", "runABySS"])
        else:
            print("Something went wrong with ABySS: no contig file was "
                  "generated")
            os.chdir("..")
            return sample_config
    else:
        print("ABySS terminated with an error. Please check running folder",
                "for more informations")
    os.chdir("..")
    return sample_config
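
# For a hypothetical sample with one 400 bp paired-end library and one
# 3000 bp mate-pair library, the abyss-pe command string assembled above
# would look roughly like this (binary path and read files are made up):
EXAMPLE_ABYSS_COMMAND = (
    "/path/to/abyss/abyss-pe np=8 k=54 name=sample "
    "lib='lib400 ' mp='lib3000 ' "
    "lib400='pe_1.fastq pe_2.fastq ' "
    "lib3000='mp_1.fastq mp_2.fastq '"
)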
Example #19
0
def _run_qc_report(global_config, sample_config, delivery_folder):
    """This function produces a pdf report and stores the important \
            resutls in a single folder"""

    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    ### retrieve all the info needed to write the report
    sampleName = "sample"
    if "output" in sample_config:
        sampleName = sample_config["output"]
    projectName = "anonymous_project"
    if "projectName" in sample_config:
        projectName = sample_config["projectName"]

    currentDir = os.getcwd()
    workingDir = os.path.join(currentDir, sampleName)
    #create delivery dir for this sample
    sample_delivery_dir = os.path.join(delivery_folder, sampleName)
    if not os.path.exists(sample_delivery_dir):
        os.makedirs(sample_delivery_dir)

    reportDir = os.path.join(sample_delivery_dir, "report")
    if not os.path.exists(reportDir):
        os.makedirs(reportDir)

    PDFtitle = os.path.join(sample_delivery_dir, "report",
                            "{}.pdf".format(sample_config["output"]))

    # this you cannot do in rLab which is why I wrote the helper initially
    TABLE_WIDTH = 540

    class MyTheme(DefaultTheme):
        doc = {
            'leftMargin': 25,
            'rightMargin': 25,
            'topMargin': 20,
            'bottomMargin': 25,
            'allowSplitting': False
        }

    # let's create the doc and specify title and author
    doc = pdf.Pdf('{} {}'.format(projectName, sampleName),
                  'NGI-Stockholm, Science for Life Laboratory')

    # now we apply our theme
    doc.set_theme(MyTheme)
    # give me some space
    doc.add_spacer()
    # this header defaults to H1
    scriptDirectory = os.path.split(os.path.abspath(__file__))[0]
    logo_path = os.path.join(scriptDirectory, '../pictures/ngi_scilife.png')
    doc.add_image(logo_path, 540, 50, pdf.CENTER)
    # give me some space
    doc.add_spacer()

    doc.add_header('NGI-Stockholm -- Science For Life Laboratory')
    doc.add_header('Best-practice analysis for quality checking report')
    doc.add_header('{} -- {}'.format(projectName, sampleName))
    # give me some space
    doc.add_spacer()
    doc.add_paragraph(
        "For sample {}, belonging to project {}, the NGI-Stockholm "
        "best-practice quality-checking analysis has been performed. For "
        "mate pair libraries produced with Nextera, the best-practice "
        "analysis described at the following address has been performed: "
        "http://res.illumina.com/documents/products/technotes/"
        "technote_nextera_matepair_data_processing.pdf".format(
            sampleName, projectName))
    doc.add_spacer()
    tools = ["trimmomatic", "fastqc", "abyss", "align", "kmergenie"]
    if "tools" in sample_config and len(sample_config["tools"]) > 0:
        tools = sample_config["tools"]
    doc.add_paragraph("The following tools have been employed \
            (tools are listed in order of execution):")
    bullet_list = []
    for tool in tools:
        if tool != "align":
            program_path = global_config["Tools"][tool]["bin"]
            bullet_list.append("{} : {}".format(tool, program_path))
        else:
            bullet_list.append("{} : {}".format(
                    tool, global_config["Tools"]["bwa"]["bin"]))
            bullet_list.append("{} : {}".format(
                    tool, global_config["Tools"]["samtools"]["bin"]))
            bullet_list.append("{} : {}".format(
                    tool, global_config["Tools"]["picard"]["bin"]))
    doc.add_list(bullet_list)
    doc.add_spacer()
    doc.add_paragraph(
        "The results from each tool are reported in the "
        "following sections. Moreover, all results and the commands that "
        "have been run can be found in the delivery folder on Uppmax.")

    for tool in tools:
        doc.add_pagebreak()
        doc.add_header(tool.title(), pdf.H2)
        if tool == "trimmomatic":
            doc.add_paragraph(
                "Reads (both paired-end and mate pairs) can "
                "contain parts of the adapter sequence or, in the case of "
                "mate pairs, part of the linker sequence. Illumina "
                "recommends removing the adapter before using the reads "
                "in any downstream analysis (this is mandatory for mate "
                "pairs).")
            doc.add_paragraph("Adapter sequences removed are:")
            adapter_file = sample_config["adapters"]
            adapters = []
            with open(adapter_file) as afile:
                lines = afile.readlines()
                for index in range(1, len(lines), 2):
                    adapters.append(lines[index].rstrip())
            doc.add_list(adapters)
            doc.add_spacer()

            trimmomatic_table_part1 = [[
                sampleName, "#orig_pairs", "#survived_pairs"
            ]]  # this is the header row
            trimmomatic_table_part2 = [[
                sampleName, "#survived_fw_only", "#survived_rv_only",
                "#discarded"
            ]]

            total_orig_pairs = 0
            total_survived_pairs = 0
            total_survived_fw_only = 0
            total_survived_rv_only = 0
            total_discarded = 0

            for library, libraryInfo in sorted_libraries_by_insert:
                runName = os.path.basename(libraryInfo["trimmomatic"]).split(
                    "_1_trimmomatic.stdErr")[0]
                with open(libraryInfo["trimmomatic"]) as trimmomatic_output:
                    lines = trimmomatic_output.readlines()
                    result_line = lines[-2].rstrip()
                    match_string = re.compile(
                        r"Input Read Pairs: (\d+) Both "
                        r"Surviving: (\d+) \(.+\) Forward Only Surviving: "
                        r"(\d+) \(.+\) Reverse Only Surviving: (\d+) \(.+\) "
                        r"Dropped: (\d+) \(.+\)")
                    read_pairs = int(match_string.match(result_line).group(1))
                    survived_pairs = int(
                        match_string.match(result_line).group(2))
                    survived_fw_only = int(
                        match_string.match(result_line).group(3))
                    survived_rv_only = int(
                        match_string.match(result_line).group(4))
                    discarded = int(match_string.match(result_line).group(5))
                    read_pairs_perc = "({0:.0f}%)".format(
                        (float(survived_pairs) / read_pairs) * 100)
                    survived_fw_only_perc = "({0:.0f}%)".format(
                        (float(survived_fw_only) / read_pairs) * 100)
                    survived_rv_only_perc = "({0:.0f}%)".format(
                        (float(survived_rv_only) / read_pairs) * 100)
                    survived_discarded_perc = "({0:.0f}%)".format(
                        (float(discarded) / read_pairs) * 100)

                    total_orig_pairs += read_pairs
                    total_survived_pairs += survived_pairs
                    total_survived_fw_only += survived_fw_only
                    total_survived_rv_only += survived_rv_only
                    total_discarded += discarded
                # these are the other rows
                trimmomatic_table_part1.append([
                    runName, read_pairs,
                    "{} {}".format(survived_pairs, read_pairs_perc)
                ])
                trimmomatic_table_part2.append([
                    runName, "{} {}".format(survived_fw_only,
                                            survived_fw_only_perc),
                    "{} {}".format(survived_rv_only, survived_rv_only_perc),
                    "{} {}".format(discarded, survived_discarded_perc)
                ])
            survived_pairs_perc = "({0:.0f}%)".format(
                (float(total_survived_pairs) / total_orig_pairs) * 100)
            survived_survived_fw_only_perc = "({0:.0f}%)".format(
                (float(total_survived_fw_only) / total_orig_pairs) * 100)
            survived_survived_rv_only_perc = "({0:.0f}%)".format(
                (float(total_survived_rv_only) / total_orig_pairs) * 100)
            survived_discarded_perc = "({0:.0f}%)".format(
                (float(total_discarded) / total_orig_pairs) * 100)
            trimmomatic_table_part1.append([
                "total", total_orig_pairs,
                "{} {}".format(total_survived_pairs, survived_pairs_perc)
            ])
            # last row is the sum
            trimmomatic_table_part2.append([
                "total", "{} {}".format(total_survived_fw_only,
                                        survived_survived_fw_only_perc),
                "{} {}".format(total_survived_rv_only,
                               survived_survived_rv_only_perc),
                "{} {}".format(total_discarded, survived_discarded_perc)
            ])
            doc.add_table(trimmomatic_table_part1, TABLE_WIDTH)
            doc.add_spacer()
            doc.add_table(trimmomatic_table_part2, TABLE_WIDTH)
            ##now save the trimmed reads
            trimmomaticDir = os.path.split(libraryInfo["trimmomatic"])[0]
            trimmomaticResultDir = os.path.join(sample_delivery_dir,
                                                "fastq_trimmed")
            if not os.path.exists(trimmomaticResultDir):
                os.makedirs(trimmomaticResultDir)
            filesToCopy = [os.path.join(trimmomaticDir, f) for f in \
                    os.listdir(trimmomaticDir) \
                    if (os.path.isfile(os.path.join(trimmomaticDir,f)) \
                    and re.search('.gz$',f))]
            for source in filesToCopy:
                dest = os.path.join(trimmomaticResultDir,
                                    os.path.split(source)[1])
                if not os.path.isfile(dest):
                    shutil.copyfile(source, dest)

        if tool == "fastqc" and "fastqc" in sample_config:
            fastqc_dir = sample_config["fastqc"]
            for fastqc_run in [dir for dir in os.listdir(fastqc_dir) \
                    if os.path.isdir(os.path.join(fastqc_dir, dir))]:
                fastqc_run_dir = os.path.join(fastqc_dir, fastqc_run, "Images")
                doc.add_image(
                    os.path.join(fastqc_run_dir,
                                 "per_base_quality.png"), 400, 180, pdf.CENTER,
                    "{} -- Per Base Quality".format(fastqc_run))
                fastqc_run_dir = os.path.join(fastqc_dir, fastqc_run, "Images")
                doc.add_image(
                    os.path.join(fastqc_run_dir,
                                 "sequence_length_distribution.png"), 400, 180,
                    pdf.CENTER,
                    "{} -- Sequence Length Distribution".format(fastqc_run))
            # copy the FastQC result directories if not already copied
            fastqcResultDir = os.path.join(sample_delivery_dir, "fastqc")
            if not os.path.exists(fastqcResultDir):
                os.makedirs(fastqcResultDir)
            dirsToBeCopied = [os.path.join(fastqc_dir, f) for f in \
                    os.listdir(fastqc_dir) \
                    if os.path.isdir(os.path.join(fastqc_dir, f))]
            for source in dirsToBeCopied:
                dest = os.path.join(fastqcResultDir, os.path.split(source)[1])
                if not os.path.exists(dest):
                    shutil.copytree(source, dest)

        if tool == "abyss" and "abyss" in sample_config:
            doc.add_paragraph(
                "A possible way to assess the complexity of a "
                "library, even in the absence of a reference sequence, is to "
                "look at the k-mer profile of the reads. The idea is to "
                "count all the k-mers (i.e., sequences of length k) that "
                "occur in the reads. In this way it is possible to know how "
                "many k-mers occur 1, 2, ..., N times and to represent this "
                "as a plot. The plot tells us, for each x, how many k-mers "
                "(y-axis) are present in the dataset in exactly x copies. "
                "In an ideal world (no sequencing errors, no bias, no "
                "repeated regions) this plot would be as close as "
                "possible to a Gaussian distribution. In reality we will "
                "always see a peak at x=1 (i.e., the errors) and another "
                "peak close to the expected coverage. If the genome is "
                "highly heterozygous, a second peak at half of the coverage "
                "can be expected.")
            kmer_1_200 = os.path.join(sample_config["abyss"],
                                      "kmer_coverage.png")
            doc.add_image(
                kmer_1_200, 500, 300, pdf.CENTER,
                "kmer profile with k={}.".format(sample_config["kmer"]))
            # copy the results into the delivery folder
            if not os.path.exists("kmer_analysis"):
                os.mkdir("kmer_analysis")
            kmerDir = sample_config["abyss"]
            filesToCopy = [os.path.join(kmerDir, f) for f in \
                    os.listdir(kmerDir) \
                    if (os.path.isfile(os.path.join(kmerDir,f)) \
                    and re.search('.png$',f))]
            filesToCopy.append(os.path.join(kmerDir, "histogram.hist"))
            abyssResultDir = os.path.join(sample_delivery_dir, "kmer_analysis")
            if not os.path.exists(abyssResultDir):
                os.makedirs(abyssResultDir)
            for source in filesToCopy:
                dest = os.path.join(abyssResultDir, os.path.split(source)[1])
                if not os.path.exists(dest):
                    shutil.copyfile(source, dest)

        if tool == "align" and "alignments" in sample_config:
            alignments = sample_config["alignments"][0]
            alignment_path = alignments[1]
            alignment_prefix = alignments[2]
            align_dir = os.path.split(alignment_path)[0]
            doc.add_header(
                "{} -- Collect Insert Size Metrics".format(sampleName), pdf.H3)
            with open(os.path.join(align_dir,
                "{}.collectInsertSize.txt".format(alignment_prefix))) \
                as collectInsertSize:
                lines = collectInsertSize.readlines()
                line = lines[6].rstrip().split("\t")
                # this is the header row
                insertSize_table = [[line[7], line[6], line[4], line[5]]]
                # these are the data rows
                line = lines[7].rstrip().split("\t")
                insertSize_table.append([line[7], line[6], line[4], line[5]])
                line = lines[8].rstrip().split("\t")
                insertSize_table.append([line[7], line[6], line[4], line[5]])
                line = lines[9].rstrip().split("\t")
                insertSize_table.append([line[7], line[6], line[4], line[5]])
                doc.add_table(insertSize_table, TABLE_WIDTH)
            doc.add_spacer()
            full_path_to_pdf = os.path.join(
                align_dir, "{}.collectInsertSize.pdf".format(alignment_prefix))
            doc.add_paragraph("Insert size plot can be found in the result \
                    directory: {}".format(
                os.path.join(
                    "alignments",
                    "{}.collectInsertSize.pdf".format(alignment_prefix))))
            doc.add_spacer()
            doc.add_header("{} -- Duplicate Metrics".format(sampleName),
                           pdf.H3)
            with open(os.path.join(align_dir,
                "{}.markDuplicates.txt".format(alignment_prefix))) as \
                collectInsertSize:
                lines = collectInsertSize.readlines()
                line = lines[6].rstrip().split("\t")
                # this is the header row
                duplication_table_part1 = [line[0:3]]
                duplication_table_part2 = [line[4:6]]
                duplication_table_part3 = [line[7:9]]
                line = lines[7].rstrip().split("\t")
                duplication_table_part1.append(line[0:3])
                duplication_table_part2.append(line[4:6])
                duplication_table_part3.append(line[7:9])
            doc.add_table(duplication_table_part1, TABLE_WIDTH)
            doc.add_spacer()
            doc.add_table(duplication_table_part2, TABLE_WIDTH)
            doc.add_spacer()
            doc.add_table(duplication_table_part3, TABLE_WIDTH)
            doc.add_spacer()
            full_path_to_bam = os.path.join(
                align_dir, "{}_noDup.bam".format(alignment_prefix))
            doc.add_paragraph("Bam file with marked duplicate reads can be \
                    found at: {}".format(
                os.path.join("alignments",
                             "{}_noDup.bam".format(alignment_prefix))))
            doc.add_spacer()
            # copy the results into the delivery folder
            if not os.path.exists("alignments"):
                os.mkdir("alignments")
            filesToCopy = [os.path.join(align_dir, f) for f in \
                    os.listdir(align_dir) \
                    if (os.path.isfile(os.path.join(align_dir,f)) \
                    and re.search('{}'.format(alignment_prefix),f))]
            alignmentResultDir = os.path.join(sample_delivery_dir,
                                              "alignments")
            if not os.path.exists(alignmentResultDir):
                os.makedirs(alignmentResultDir)
            for source in filesToCopy:
                dest = os.path.join(alignmentResultDir,
                                    os.path.split(source)[1])
                if not os.path.exists(dest):
                    shutil.copyfile(source, dest)

        if tool == "kmergenie" and "kmergenie" in sample_config:
            doc.add_paragraph(
                "Assemblers using a de Bruijn graph strategy "
                "for contig construction (such as Velvet, ABySS and "
                "SOAPdenovo) fragment the reads into k-sized substrings "
                "(k-mers). The k-mer size is vital for the performance of "
                "these assemblers, and is usually selected considering "
                "several trade-offs between the size and accuracy of the "
                "produced contigs. Some assemblers choose the k-mer size "
                "automatically, build several assemblies (using "
                "different k-mers), and/or rely on user input. "
                "Kmergenie is a lightweight program that suggests a best "
                "k-mer size based on the relative abundance of k-mers in "
                "the genomic reads.")
            kmerdir = sample_config["kmergenie"]
            doc.add_image(os.path.join(kmerdir, "histograms.dat.png"), 400,
                          300, pdf.CENTER,
                          ("The plot should be roughly concave and have "
                           "a clear global maximum, if not the predicted best "
                           "k is likely to be inaccurate"))
            #copy everything to results
            kmergenieResultDir = os.path.join(sample_delivery_dir, "kmergenie")
            dest = kmergenieResultDir
            if not os.path.exists(dest):
                shutil.copytree(kmerdir, dest)
    doc.render(PDFtitle)

    # Copy the pipeline files and commands run to the report directory
    filesToCopy = glob.glob(currentDir + "/{}_QCcontrol.*".format(sampleName))
    for cfile in filesToCopy:
        shutil.copyfile(cfile, os.path.join(reportDir,
                                            os.path.basename(cfile)))

    with open(os.path.join(reportDir, "commands.txt"), "w") as f:
        f.write(sample_config.get("commands", ""))

    os.chdir(currentDir)
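
# A standalone sketch of the Trimmomatic summary parsing performed in the
# report above, applied to a made-up stderr line; the regular expression
# follows the pattern used by _run_qc_report, and all counts are
# illustrative only.
import re

TRIMMOMATIC_RE = re.compile(
    r"Input Read Pairs: (\d+) Both Surviving: (\d+) \(.+\) "
    r"Forward Only Surviving: (\d+) \(.+\) "
    r"Reverse Only Surviving: (\d+) \(.+\) Dropped: (\d+) \(.+\)")

example_line = ("Input Read Pairs: 1000000 Both Surviving: 950000 (95.00%) "
                "Forward Only Surviving: 30000 (3.00%) "
                "Reverse Only Surviving: 15000 (1.50%) Dropped: 5000 (0.50%)")

match = TRIMMOMATIC_RE.match(example_line)
if match:
    pairs, survived, fw_only, rv_only, dropped = map(int, match.groups())
    print("survived pairs: {} ({:.0f}%)".format(survived,
                                                100.0 * survived / pairs))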
Example #20
0
def _run_allpaths(global_config, sample_config, sorted_libraries_by_insert):
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "allpaths"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in the allpaths case there is no executable
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure("allpaths", assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    inGroups_file = open("in_groups.csv", "w")
    inLibs_file = open("in_libs.csv", "w")
    inGroups_file.write("group_name, library_name, file_name\n")
    inLibs_file.write(
        "library_name, project_name, organism_name, type, "
        "paired, frag_size, frag_stddev, insert_size, insert_stddev, "
        "read_orientation,genomic_start, genomic_end\n")
    librariesForInLibs = []
    librariesForInLibsDict = {}
    group_name = 1
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        if orientation == "innie":
            path, fqfile = os.path.split(read1)
            if "_1.fastq" in fqfile:
                fqfile = fqfile.replace("_1.fastq", "_?.fastq")
            elif "_R1_" in fqfile:
                fqfile = fqfile.replace("_R1_", "_R?_")
            else:
                print("error file format not supported {}".format(fqfile))
                return sample_config
            inGroups_file.write("PE{}, lib{}, {}\n".format(
                group_name, insert, os.path.join(path, fqfile)))
            group_name += 1
            if insert not in librariesForInLibsDict:
                librariesForInLibsDict[insert] = insert
                librariesForInLibs.append(
                    "lib{}, genome, genome, fragment, 1, "
                    "{}, {}, , , inward, 0, 0\n".format(insert, insert, std))
        elif orientation == "outtie":
            path, fqfile = os.path.split(read1)
            if "_1.fastq" in fqfile:
                fqfile = fqfile.replace("_1.fastq", "_?.fastq")
            elif "_R1_" in fqfile:
                fqfile = fqfile.replace("_R1_", "_R?_")
            else:
                print("error file format not supported {}".format(file))
                return sample_config
            inGroups_file.write("MP{}, lib{}, {}\n".format(
                group_name, insert, os.path.join(path, fqfile)))
            group_name += 1
            if insert not in librariesForInLibsDict:
                librariesForInLibsDict[insert] = insert
                librariesForInLibs.append(
                    "lib{}, genome, genome, fragment, 1, "
                    ", , {}, {}, outward, 0, 0\n".format(insert, insert, std))
        else:
            print("all paths support only innies and outties")
    inGroups_file.close()
    for lib in librariesForInLibs:
        inLibs_file.write(lib)
    inLibs_file.close()
    #NOW RUN ALLPATHS FOR REAL
    program = os.path.join(programBIN, "PrepareAllPathsInputs.pl")
    os.mkdir("data_dir")
    data_dir = os.path.join(assemblyDirectory, "data_dir")
    ploidy = "PLOIDY=1"
    if len(program_options) > 0:
        if len(program_options) > 1:
            print("Running ALlpaths only one parameter accepted as option",
                  "here: PLOIDY=2")
            return sample_config
        if program_options[0] == "PLOIDY=2":
            ploidy = "PLOIDY=2"
        else:
            print("Running ALlpaths only one parameter accepted as option",
                  "here: PLOIDY=2")
            return sample_config

    command = [
        program, "DATA_DIR={}".format(data_dir), ploidy,
        "PICARD_TOOLS_DIR={}".format(global_config["Tools"]["picard"]["bin"]),
        "FORCE_PHRED=True", "PHRED_64=False", "IN_GROUPS_CSV={}".format(
            os.path.join(assemblyDirectory, "in_groups.csv")),
        "IN_LIBS_CSV={}".format(os.path.join(assemblyDirectory, "in_libs.csv"))
    ]
    if common.check_dryrun(sample_config):
        common.print_command(command)
        program = os.path.join(programBIN, "RunAllPathsLG")
        command = [
            program, "PRE={}".format(assemblyDirectory), "REFERENCE_NAME=.",
            "DATA_SUBDIR=data_dir", "RUN=allpaths", "SUBDIR=run"
        ]
        common.print_command(command)
        os.chdir("..")
        return sample_config
    assembler_stdOut = open("allpaths_PrepareAllPathsInputs.stdOut", "w")
    assembler_stdErr = open("allpaths_PrepareAllPathsInputs.stdErr", "w")
    common.print_command(command)
    returnValue = subprocess.call(command,
                                  stdout=assembler_stdOut,
                                  stderr=assembler_stdErr)
    assembler_stdOut.close()
    assembler_stdErr.close()
    flags = sample_config.get("flags", [])
    if returnValue == 0:
        program = os.path.join(programBIN, "RunAllPathsLG")
        command = [
            program, "PRE={}".format(assemblyDirectory), "REFERENCE_NAME=.",
            "DATA_SUBDIR=data_dir", "RUN=allpaths", "SUBDIR=run",
            "HAPLOIDIFY=True"
        ]
        common.print_command(command)
        assembler_stdOut = open("allpaths_RunAllPathsLG.stdOut", "w")
        assembler_stdErr = open("allpaths_RunAllPathsLG.stdErr", "w")
        returnValue = subprocess.call(command,
                                      stdout=assembler_stdOut,
                                      stderr=assembler_stdErr)
        if returnValue != 0:
            print("ALLPATHS RunAllPathsLG terminated with an error. Please",
                  "check running folder for more informations")
            os.chdir("..")
            return sample_config
        else:  # save results
            assembly_dir = os.path.join("data_dir", "allpaths", "ASSEMBLIES",
                                        "run")
            if os.path.exists(
                    os.path.join(assembly_dir, "final.assembly.fasta")):
                exit_code = subprocess.call([
                    "cp",
                    os.path.join(assembly_dir, "final.contigs.fasta"),
                    "{}.ctg.fasta".format(outputName)
                ])
                exit_code += subprocess.call([
                    "cp",
                    os.path.join(assembly_dir, "final.assembly.fasta"),
                    "{}.scf.fasta".format(outputName)
                ])
                if not "keep_tmp_files" in flags and exit_code == 0:
                    subprocess.call(["rm", "-r", "data_dir"])
            else:
                print(
                    "Something went wrong with Allpaths: no contig file was generated")
                os.chdir("..")
                return sample_config
    else:
        print("ALLPATHS PrepareAllPathInputs terminated with an error. "
              "Please check running folder for more informations")
        os.chdir("..")
        return sample_config
    os.chdir("..")
    return sample_config
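
# For illustration: the in_groups.csv / in_libs.csv files written by
# _run_allpaths above would contain rows like the following for a
# hypothetical 350 bp innie library and a 3000 bp outtie library
# (file paths, insert sizes and standard deviations are made up).
EXAMPLE_IN_GROUPS = """\
group_name, library_name, file_name
PE1, lib350, /path/to/sampleA_350_?.fastq
MP2, lib3000, /path/to/sampleA_3000_R?_001.fastq
"""

EXAMPLE_IN_LIBS = """\
library_name, project_name, organism_name, type, paired, frag_size, \
frag_stddev, insert_size, insert_stddev, read_orientation,genomic_start, \
genomic_end
lib350, genome, genome, fragment, 1, 350, 35, , , inward, 0, 0
lib3000, genome, genome, fragment, 1, , , 3000, 300, outward, 0, 0
"""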