Example #1
0
def picard_CGbias(global_config, sample_config, sorted_alignments_by_insert):
    """Run Picard CollectGcBiasMetrics on every alignment in the list.

    For each ``(library, BAMfile, working_dir)`` entry the function changes
    into ``working_dir``, prints the command and, unless the
    ``<bam basename>.collectGcBias.pdf`` chart already exists or this is a
    dry run, executes it. Standard output and error of the tool are written
    to ``collectGcBias.stdOut`` / ``collectGcBias.stdErr``.

    Args:
        global_config: dict; ``global_config["Tools"]["picard"]["bin"]`` is
            used as the jar directory when ``PICARD_HOME`` is not set.
        sample_config: dict providing ``"reference"``; it is also handed to
            ``common.check_dryrun``.
        sorted_alignments_by_insert: iterable of
            ``(library, BAMfile, working_dir)`` entries.

    Returns:
        ``sorted_alignments_by_insert``, unchanged.
    """
    picard = os.environ.get("PICARD_HOME") or ""
    if not picard and "picard" in global_config["Tools"]:
        picard = global_config["Tools"]["picard"]["bin"]
    # The command is executed without a shell, so a literal "$TMPDIR"
    # argument would never be expanded. Resolve the variable here and only
    # pass TMP_DIR when it is actually set.
    tmp_dir = os.environ.get("TMPDIR")
    for library, BAMfile, working_dir in sorted_alignments_by_insert:
        os.chdir(working_dir)
        try:
            output_header = os.path.basename(BAMfile).split(".bam")[0]
            chart = "{}.collectGcBias.pdf".format(output_header)
            command = [
                "java", "-Xmx16g", "-XX:PermSize=2g", "-jar",
                os.path.join(picard, "CollectGcBiasMetrics.jar"),
                "REFERENCE_SEQUENCE={}".format(sample_config["reference"]),
                "INPUT={}".format(BAMfile),
                "OUTPUT={}.collectGcBias.txt".format(output_header),
                "CHART_OUTPUT={}".format(chart),
                "ASSUME_SORTED=true", "VALIDATION_STRINGENCY=LENIENT",
            ]
            if tmp_dir:
                command.append("TMP_DIR={}".format(tmp_dir))
            common.print_command(command)
            # Skip the run when the chart is already there or on dry runs
            if os.path.exists(chart) or common.check_dryrun(sample_config):
                continue
            with open("collectGcBias.stdOut", "w") as stdOut, \
                    open("collectGcBias.stdErr", "w") as stdErr:
                returnValue = subprocess.call(command, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                print("problem running collectGCBias")
        finally:
            # Step back out even when the command could not be started
            os.chdir("..")
    return sorted_alignments_by_insert
Example #2
0
def picard_markDuplicates(global_config, sample_config,
        sorted_alignments_by_insert):
    """Run Picard MarkDuplicates on every alignment in the list.

    For each ``(library, BAMfile, working_dir)`` entry the function changes
    into ``working_dir``, prints the command and, unless the
    ``<bam basename>.markDuplicates.txt`` metrics file already exists or
    this is a dry run, executes it. The tool writes
    ``<bam basename>_noDup.bam``; its standard output and error go to
    ``removeDup.stdOut`` / ``removeDup.stdErr``.

    Args:
        global_config: dict; ``global_config["Tools"]["picard"]["bin"]`` is
            used as the jar directory when ``PICARD_HOME`` is not set.
        sample_config: dict handed to ``common.check_dryrun``.
        sorted_alignments_by_insert: iterable of
            ``(library, BAMfile, working_dir)`` entries.

    Returns:
        ``sorted_alignments_by_insert``, unchanged.
    """
    picard = os.environ.get("PICARD_HOME") or ""
    if not picard and "picard" in global_config["Tools"]:
        picard = global_config["Tools"]["picard"]["bin"]
    # No shell is involved when the command runs, so "$TMPDIR" has to be
    # resolved here; TMP_DIR is only passed when the variable is set.
    tmp_dir = os.environ.get("TMPDIR")
    for library, BAMfile, working_dir in sorted_alignments_by_insert:
        os.chdir(working_dir)
        try:
            output_header = os.path.basename(BAMfile).split(".bam")[0]
            metrics = "{}.markDuplicates.txt".format(output_header)
            command = [
                "java", "-Xmx16g", "-XX:PermSize=3g", "-jar",
                os.path.join(picard, "MarkDuplicates.jar"),
                "INPUT={}".format(BAMfile),
                "OUTPUT={}_noDup.bam".format(output_header),
                "METRICS_FILE={}".format(metrics),
                "ASSUME_SORTED=true", "VALIDATION_STRINGENCY=LENIENT",
            ]
            if tmp_dir:
                command.append("TMP_DIR={}".format(tmp_dir))
            common.print_command(command)
            # Skip the run when the metrics exist already or on dry runs
            if os.path.exists(metrics) or common.check_dryrun(sample_config):
                continue
            with open("removeDup.stdOut", "w") as stdOut, \
                    open("removeDup.stdErr", "w") as stdErr:
                returnValue = subprocess.call(command, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                print("problem running MarkDuplicates")
        finally:
            # Step back out even when the command could not be started
            os.chdir("..")
    return sorted_alignments_by_insert
Example #3
0
def _run_BUSCO(global_config, sample_config, sorted_alignments_by_insert):
    """Run BUSCO on the reference inside a ``BUSCO`` folder under the cwd.

    The folder is created when missing. BUSCO is executed only when this is
    not a dry run and ``run_<output>/short_summary_<output>`` does not exist
    yet; its output is appended to ``BUSCO.stdOut`` / ``BUSCO.stdErr``. The
    working directory in effect at call time is always restored, also when
    the function raises or exits.

    Args:
        global_config: dict providing ``["Tools"]["BUSCO"]["bin"]`` and
            ``["Tools"]["BUSCO"]["options"]`` (a list of extra arguments).
        sample_config: dict providing ``"BUSCODataPath"``, ``"reference"``
            and ``"output"``; ``"threads"`` defaults to 16.
        sorted_alignments_by_insert: not used by this step.

    Raises:
        IOError: if the BUSCO data set path does not exist.
        SystemExit: if BUSCO returns a non-zero exit status.
    """
    program = global_config["Tools"]["BUSCO"]["bin"]
    options = global_config["Tools"]["BUSCO"]["options"]
    main_dir = os.getcwd()
    BUSCOfolder = os.path.join(main_dir, "BUSCO")
    if not os.path.exists(BUSCOfolder):
        os.makedirs(BUSCOfolder)
    os.chdir(BUSCOfolder)
    try:
        BUSCO_data_path = sample_config["BUSCODataPath"]
        if not os.path.exists(BUSCO_data_path):
            raise IOError("Path to the BUSCO data set does not exist!")

        reference = sample_config["reference"]
        output = sample_config["output"]
        threads = sample_config.get("threads", 16)
        command = [program, "-l", BUSCO_data_path,
                   "-in", "{}".format(reference),
                   "-o", "{}".format(output),
                   "-c", "{}".format(threads)]
        command.extend(options)
        common.print_command(command)

        outfile = os.path.join(BUSCOfolder, "run_{}".format(output),
                               "short_summary_{}".format(output))
        if not common.check_dryrun(sample_config) \
                and not os.path.exists(outfile):
            with open("BUSCO.stdOut", "a") as stdOut, \
                    open("BUSCO.stdErr", "a") as stdErr:
                return_value = subprocess.call(command, stdout=stdOut,
                                               stderr=stdErr)
            if return_value != 0:
                sys.exit("Error running BUSCO")
    finally:
        # Leave the caller in the directory it started from
        os.chdir(main_dir)
Example #4
0
def _run_fastqc(global_config, sample_config, sorted_libraries_by_insert):
    """Run FastQC on every library and record the commands that were built.

    A ``fastqc`` folder is created under the current working directory.
    For each library the command ``<bin> <options...> <pair1> [<pair2>]`` is
    printed and appended to ``sample_config["commands"]``. It is executed
    only when this is not a dry run and
    ``fastqc/<pair1 basename without .fastq.gz>_fastqc.zip`` does not exist;
    output is appended to ``fastqc/<library>_fastqc.stdout`` / ``.stderr``.

    Args:
        global_config: dict providing ``["Tools"]["fastqc"]["bin"]`` and
            ``["Tools"]["fastqc"]["options"]``.
        sample_config: dict with a ``"commands"`` string; updated in place.
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            where ``libraryInfo`` has ``"pair1"`` and ``"pair2"`` (the
            latter may be ``None``).

    Returns:
        ``sample_config`` with ``"fastqc"`` set to the output folder.
    """
    FastqcFolder = os.path.join(os.getcwd(), "fastqc")
    if not os.path.exists(FastqcFolder):
        os.makedirs(FastqcFolder)

    program = global_config["Tools"]["fastqc"]["bin"]
    program_options = global_config["Tools"]["fastqc"]["options"]
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        command = [program] + list(program_options) + [read1]
        if read2 is not None:
            command.append(read2)
        common.print_command(command)
        sample_config["commands"] += "\n" + common.get_command_str(command)
        folder_output_name = os.path.join(
            FastqcFolder,
            os.path.basename(read1).split(".fastq.gz")[0])
        if common.check_dryrun(sample_config) or os.path.exists(
                "{}_fastqc.zip".format(folder_output_name)):
            continue
        log_prefix = os.path.join(FastqcFolder, "{}_fastqc".format(library))
        with open("{}.stdout".format(log_prefix), "a") as fastq_stdOut, \
                open("{}.stderr".format(log_prefix), "a") as fastq_stdErr:
            returnValue = subprocess.call(command, stdout=fastq_stdOut,
                                          stderr=fastq_stdErr)
        if returnValue != 0:
            # Report the failure instead of dropping the exit status
            print("problem running fastqc on library {}".format(library))
    sample_config["fastqc"] = FastqcFolder
    return sample_config
Example #5
0
def picard_collectInsertSizeMetrics(global_config, sample_config,
        sorted_alignments_by_insert):
    """Run Picard CollectInsertSizeMetrics on every alignment in the list.

    For each ``(library, BAMfile, working_dir)`` entry the function changes
    into ``working_dir``, prints the command and, unless the
    ``<bam basename>.collectInsertSize.pdf`` histogram already exists or
    this is a dry run, executes it. The histogram width passed to Picard is
    ``library * 2``. Standard output and error of the tool are written to
    ``collectInsertSize.stdOut`` / ``collectInsertSize.stdErr``.

    Args:
        global_config: dict; ``global_config["Tools"]["picard"]["bin"]`` is
            used as the jar directory when ``PICARD_HOME`` is not set.
        sample_config: dict handed to ``common.check_dryrun``.
        sorted_alignments_by_insert: iterable of
            ``(library, BAMfile, working_dir)`` entries.

    Returns:
        ``sorted_alignments_by_insert``, unchanged.
    """
    picard = os.environ.get("PICARD_HOME") or ""
    if not picard and "picard" in global_config["Tools"]:
        picard = global_config["Tools"]["picard"]["bin"]
    # No shell is involved when the command runs, so "$TMPDIR" has to be
    # resolved here; TMP_DIR is only passed when the variable is set.
    tmp_dir = os.environ.get("TMPDIR")
    for library, BAMfile, working_dir in sorted_alignments_by_insert:
        os.chdir(working_dir)
        try:
            output_header = os.path.basename(BAMfile).split(".bam")[0]
            histogram = "{}.collectInsertSize.pdf".format(output_header)
            histWide = library * 2
            command = [
                "java", "-Xmx16g", "-XX:PermSize=2g", "-jar",
                os.path.join(picard, "CollectInsertSizeMetrics.jar"),
                "INPUT={}".format(BAMfile), "MINIMUM_PCT=0",
                "HISTOGRAM_FILE={}".format(histogram),
                "OUTPUT={}.collectInsertSize.txt".format(output_header),
                "HISTOGRAM_WIDTH={}".format(histWide),
                "VALIDATION_STRINGENCY=LENIENT",
            ]
            if tmp_dir:
                command.append("TMP_DIR={}".format(tmp_dir))
            common.print_command(command)
            # Skip the run when the histogram exists already or on dry runs
            if os.path.exists(histogram) or \
                    common.check_dryrun(sample_config):
                continue
            with open("collectInsertSize.stdOut", "w") as stdOut, \
                    open("collectInsertSize.stdErr", "w") as stdErr:
                returnValue = subprocess.call(command, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                print("problem running CollectInsertSizeMetrics")
        finally:
            # Step back out even when the command could not be started
            os.chdir("..")
    return sorted_alignments_by_insert
Example #6
0
def _run_BUSCO(global_config, sample_config, sorted_alignments_by_insert):
    """Assess the reference with BUSCO in a ``BUSCO`` folder under the cwd.

    The folder is created on demand and used as working directory while the
    step runs. The command is always printed; it is executed only when this
    is not a dry run and ``run_<output>/short_summary_<output>`` is missing.
    Tool output is appended to ``BUSCO.stdOut`` / ``BUSCO.stdErr``. The
    directory the caller started in is restored on every exit path.

    Args:
        global_config: dict providing ``["Tools"]["BUSCO"]["bin"]`` and
            ``["Tools"]["BUSCO"]["options"]`` (a list of extra arguments).
        sample_config: dict providing ``"BUSCODataPath"``, ``"reference"``
            and ``"output"``; ``"threads"`` defaults to 16.
        sorted_alignments_by_insert: not used by this step.

    Raises:
        IOError: if the BUSCO data set path does not exist.
        SystemExit: if BUSCO returns a non-zero exit status.
    """
    program = global_config["Tools"]["BUSCO"]["bin"]
    options = global_config["Tools"]["BUSCO"]["options"]
    main_dir = os.getcwd()
    BUSCOfolder = os.path.join(main_dir, "BUSCO")
    if not os.path.exists(BUSCOfolder):
        os.makedirs(BUSCOfolder)
    os.chdir(BUSCOfolder)
    try:
        BUSCO_data_path = sample_config["BUSCODataPath"]
        if not os.path.exists(BUSCO_data_path):
            raise IOError("Path to the BUSCO data set does not exist!")

        reference = sample_config["reference"]
        output = sample_config["output"]
        threads = sample_config.get("threads", 16)
        command = [
            program, "-l", BUSCO_data_path, "-in", "{}".format(reference),
            "-o", "{}".format(output), "-c", "{}".format(threads)
        ]
        command.extend(options)
        common.print_command(command)

        outfile = os.path.join(BUSCOfolder, "run_{}".format(output),
                               "short_summary_{}".format(output))
        already_done = common.check_dryrun(sample_config) \
            or os.path.exists(outfile)
        if not already_done:
            with open("BUSCO.stdOut", "a") as stdOut, \
                    open("BUSCO.stdErr", "a") as stdErr:
                return_value = subprocess.call(command, stdout=stdOut,
                                               stderr=stdErr)
            if return_value != 0:
                sys.exit("Error running BUSCO")
    finally:
        # Also reached on IOError / SystemExit, so the cwd never leaks
        os.chdir(main_dir)
Example #7
0
def _run_fastqc(global_config, sample_config, sorted_libraries_by_insert):
    """Run FastQC once per library, logging into a ``fastqc`` folder.

    The ``fastqc`` folder is created under the current working directory.
    Each library's command (``<bin> <options...> <pair1> [<pair2>]``) is
    printed and appended to ``sample_config["commands"]``; it is executed
    only when this is not a dry run and
    ``fastqc/<pair1 basename without .fastq.gz>_fastqc.zip`` is missing.
    Tool output is appended to ``fastqc/<library>_fastqc.stdout`` and
    ``fastqc/<library>_fastqc.stderr``.

    Args:
        global_config: dict providing ``["Tools"]["fastqc"]["bin"]`` and
            ``["Tools"]["fastqc"]["options"]``.
        sample_config: dict with a ``"commands"`` string; updated in place.
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            where ``libraryInfo`` has ``"pair1"`` and ``"pair2"`` (the
            latter may be ``None``).

    Returns:
        ``sample_config`` with ``"fastqc"`` set to the output folder.
    """
    FastqcFolder = os.path.join(os.getcwd(), "fastqc")
    if not os.path.exists(FastqcFolder):
        os.makedirs(FastqcFolder)

    program = global_config["Tools"]["fastqc"]["bin"]
    program_options = global_config["Tools"]["fastqc"]["options"]
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        command = [program]
        command.extend(program_options)
        command.append(read1)
        if read2 is not None:
            command.append(read2)
        common.print_command(command)
        sample_config["commands"] += "\n" + common.get_command_str(command)
        folder_output_name = os.path.join(
            FastqcFolder, os.path.basename(read1).split(".fastq.gz")[0])
        skip = common.check_dryrun(sample_config) or os.path.exists(
            "{}_fastqc.zip".format(folder_output_name))
        if skip:
            continue
        stdout_path = os.path.join(FastqcFolder,
                                   "{}_fastqc.stdout".format(library))
        stderr_path = os.path.join(FastqcFolder,
                                   "{}_fastqc.stderr".format(library))
        with open(stdout_path, "a") as fastq_stdOut, \
                open(stderr_path, "a") as fastq_stdErr:
            returnValue = subprocess.call(command, stdout=fastq_stdOut,
                                          stderr=fastq_stdErr)
        if returnValue != 0:
            # Report the failure instead of dropping the exit status
            print("problem running fastqc on library {}".format(library))
    sample_config["fastqc"] = FastqcFolder
    return sample_config
Example #8
0
def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
    """Compute a k-mer coverage histogram with ABYSS-P and plot it.

    The work happens in an ``abyss_kmer`` folder under the current working
    directory. The ``mpirun`` command is printed and appended to
    ``sample_config["commands"]``; it is executed through the shell only
    when this is not a dry run and ``histogram.hist`` does not exist yet.
    On success ``preUnitgs.fa`` is removed and the coverage plots are drawn.
    The working directory in effect at call time is always restored.

    Args:
        global_config: dict providing ``["Tools"]["abyss"]["bin"]``; the
            ``ABYSS-P`` binary is looked for in the same directory.
        sample_config: dict providing ``"kmer"`` and ``"commands"``;
            ``"threads"`` defaults to 16. Updated in place.
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            where ``libraryInfo`` has ``"pair1"``, ``"pair2"`` and
            ``"orientation"``.

    Returns:
        ``sample_config`` with ``"abyss"`` set to the ``abyss_kmer`` folder.

    Raises:
        SystemExit: if ``"kmer"`` is missing from ``sample_config``.
    """
    mainDir = os.getcwd()
    ABySS_Kmer_Folder = os.path.join(mainDir, "abyss_kmer")
    if "kmer" not in sample_config:
        # Adjacent literals keep the message free of indentation whitespace
        sys.exit("error in _run_abyss QCcontrol: kmer must be present in "
                 "sample_config.yaml")

    kmer = sample_config["kmer"]
    if not os.path.exists(ABySS_Kmer_Folder):
        os.makedirs(ABySS_Kmer_Folder)

    os.chdir(ABySS_Kmer_Folder)
    try:
        program = global_config["Tools"]["abyss"]["bin"]
        program = os.path.join(os.path.dirname(program), "ABYSS-P")
        # Tool options from the configuration are not part of this command.
        threads = sample_config.get("threads", 16)  # 16: default for UPPMAX

        command = "mpirun -np {} {} ".format(threads, program)
        command += "-k {} ".format(kmer)
        command += "--coverage-hist=histogram.hist -o preUnitgs.fa"
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            if orientation in ("innie", "outtie"):
                command += " {} ".format(read1)
                if read2 is not None:
                    command += " {} ".format(read2)
            elif orientation == "none":
                command += " {} ".format(read1)

        common.print_command(command)
        sample_config["commands"] += "\n" + common.get_command_str(command)

        if not common.check_dryrun(sample_config) and not \
                os.path.exists("histogram.hist"):
            with open("ABySS_Kmer_Folder.stdOut", "a") as ABySS_Kmer_stdOut, \
                    open("ABySS_Kmer_Folder.stdErr", "a") as ABySS_Kmer_stdErr:
                returnValue = subprocess.call(command, shell=True,
                                              stdout=ABySS_Kmer_stdOut,
                                              stderr=ABySS_Kmer_stdErr)
            # Any non-zero status is a failure, including the negative
            # values reported when the child is killed by a signal.
            if returnValue != 0:
                print("ABySS kmer plotting failed: unkwnown reason")
            else:
                if os.path.exists("preUnitgs.fa"):
                    os.remove("preUnitgs.fa")
                _plotKmerFixed(1, 200, kmer, "kmer_coverage_1_200.png")
                _plotKmerFixed(1, 500, kmer, "kmer_coverage_1_500.png")
                _plotKmerFixed(15, 200, kmer, "kmer_coverage_15_200.png")
                _plotKmerFixed(15, 500, kmer, "kmer_coverage_15_500.png")
                _plotKmer(kmer, "kmer_coverage.png")
    finally:
        os.chdir(mainDir)
    sample_config["abyss"] = ABySS_Kmer_Folder
    return sample_config
Example #9
0
def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
    """Build the ABYSS-P k-mer coverage histogram and its plots.

    Everything runs inside an ``abyss_kmer`` folder under the current
    working directory. The ``mpirun`` command line is printed and appended
    to ``sample_config["commands"]``; it is run through the shell only when
    this is not a dry run and ``histogram.hist`` is missing. After a
    successful run ``preUnitgs.fa`` is deleted and the coverage plots are
    produced. The caller's working directory is restored on every exit path.

    Args:
        global_config: dict providing ``["Tools"]["abyss"]["bin"]``; the
            ``ABYSS-P`` binary is looked for in the same directory.
        sample_config: dict providing ``"kmer"`` and ``"commands"``;
            ``"threads"`` defaults to 16. Updated in place.
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            where ``libraryInfo`` has ``"pair1"``, ``"pair2"`` and
            ``"orientation"``.

    Returns:
        ``sample_config`` with ``"abyss"`` set to the ``abyss_kmer`` folder.

    Raises:
        SystemExit: if ``"kmer"`` is missing from ``sample_config``.
    """
    mainDir = os.getcwd()
    ABySS_Kmer_Folder = os.path.join(mainDir, "abyss_kmer")
    if "kmer" not in sample_config:
        # Adjacent literals keep the message free of indentation whitespace
        sys.exit("error in _run_abyss QCcontrol: kmer must be present in "
                 "sample_config.yaml")

    kmer = sample_config["kmer"]
    if not os.path.exists(ABySS_Kmer_Folder):
        os.makedirs(ABySS_Kmer_Folder)

    os.chdir(ABySS_Kmer_Folder)
    try:
        program = os.path.join(
            os.path.dirname(global_config["Tools"]["abyss"]["bin"]),
            "ABYSS-P")
        # Tool options from the configuration are not part of this command.
        threads = sample_config.get("threads", 16)  # 16: default for UPPMAX

        pieces = ["mpirun -np {} {} ".format(threads, program),
                  "-k {} ".format(kmer),
                  "--coverage-hist=histogram.hist -o preUnitgs.fa"]
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            if orientation == "innie" or orientation == "outtie":
                pieces.append(" {} ".format(read1))
                if read2 is not None:
                    pieces.append(" {} ".format(read2))
            elif orientation == "none":
                pieces.append(" {} ".format(read1))
        command = "".join(pieces)
        common.print_command(command)
        sample_config["commands"] += "\n" + common.get_command_str(command)

        if not common.check_dryrun(sample_config) and not \
                os.path.exists("histogram.hist"):
            with open("ABySS_Kmer_Folder.stdOut", "a") as ABySS_Kmer_stdOut, \
                    open("ABySS_Kmer_Folder.stdErr", "a") as ABySS_Kmer_stdErr:
                returnValue = subprocess.call(command, shell=True,
                                              stdout=ABySS_Kmer_stdOut,
                                              stderr=ABySS_Kmer_stdErr)
            # Any non-zero status is a failure, including the negative
            # values reported when the child is killed by a signal.
            if returnValue != 0:
                print("ABySS kmer plotting failed: unkwnown reason")
            else:
                if os.path.exists("preUnitgs.fa"):
                    os.remove("preUnitgs.fa")
                _plotKmerFixed(1, 200, kmer, "kmer_coverage_1_200.png")
                _plotKmerFixed(1, 500, kmer, "kmer_coverage_1_500.png")
                _plotKmerFixed(15, 200, kmer, "kmer_coverage_15_200.png")
                _plotKmerFixed(15, 500, kmer, "kmer_coverage_15_500.png")
                _plotKmer(kmer, "kmer_coverage.png")
    finally:
        os.chdir(mainDir)
    sample_config["abyss"] = ABySS_Kmer_Folder
    return sample_config
Example #10
0
def build_reference_bwa(global_config, sample_config):
    """Make sure a bwa index exists for the sample's reference.

    If ``<reference>.bwt`` already sits next to the reference, the absolute
    reference path is returned untouched. Otherwise a ``bwa`` folder is
    created beside the reference, the reference is soft-linked into it and
    ``bwa index`` is run on the link (skipped on dry runs). Index logs go to
    ``bwa_index.stdOut`` / ``bwa_index.stdErr`` inside that folder. The
    working directory in effect at call time is always restored.

    Args:
        global_config: dict; ``["Tools"]["bwa"]["bin"]`` is used when
            present, otherwise ``bwa`` must be found by ``common.which``.
        sample_config: dict providing ``"reference"``; also handed to
            ``common.check_dryrun``.

    Returns:
        Absolute path of the reference whose index should be used: either
        the original file or the soft link inside the ``bwa`` folder.

    Raises:
        SystemExit: if bwa cannot be located, the reference is missing, the
            soft link cannot be created, or indexing fails.
    """
    reference = sample_config["reference"]
    program = "bwa"
    if "bwa" in global_config["Tools"]:
        program = global_config["Tools"]["bwa"]["bin"]
    elif not common.which("bwa"):
        sys.exit("error while trying to run  bwa index: bwa not present in "
                "the path and not in global config, please make sure to "
                "install bwa properly")
    reference = os.path.abspath(reference)
    path_name, base_name = os.path.split(reference)
    # An index stored next to the reference itself can be used as is
    if os.path.exists("{}.bwt".format(reference)):
        return reference
    # Otherwise the index is built locally, which needs the reference file
    if not os.path.exists(reference):
        sys.exit("error, reference file {} does not exists".format(reference))
    current_dir = os.getcwd()
    bwa_index_folder = os.path.join(path_name, "bwa")
    if not os.path.exists(bwa_index_folder):
        os.makedirs(bwa_index_folder)
    os.chdir(bwa_index_folder)
    try:
        if not os.path.exists(base_name):
            # exists() is False for a dangling link; drop it before linking
            if os.path.lexists(base_name):
                os.remove(base_name)
            try:
                os.symlink(reference, base_name)
            except OSError:
                sys.exit("error while trying to soft link reference sequence")
        # From here on work with the soft-linked copy
        reference = os.path.join(bwa_index_folder, base_name)
        index_file = "{}.bwt".format(reference)
        if not os.path.exists(index_file):
            command = [program, "index", reference]
            common.print_command(command)
            if common.check_dryrun(sample_config):
                # Nothing was executed, so no index can be expected yet
                return reference
            with open("bwa_index.stdOut", "w") as bwa_stdOut, \
                    open("bwa_index.stdErr", "w") as bwa_stdErr:
                returnValue = subprocess.call(command, stdout=bwa_stdOut,
                                              stderr=bwa_stdErr)
            if returnValue != 0:
                sys.exit("error, while indexing reference file {} "
                        "with bwa index".format(reference))
            # Extra control to avoid problems with unexpected return values
            if not os.path.exists(index_file):
                sys.exit("bwa index failed")
    finally:
        os.chdir(current_dir)
    return reference
Example #11
0
def _run_kmergenie(global_config, sample_config, sorted_libraries_by_insert):
    """Runs kmergenie to establish a recommended kmer size for assembly

    The step works inside a ``kmergenie`` folder under the current working
    directory. The command is printed and appended to
    ``sample_config["commands"]``. Unless this is a dry run, the list of
    input fastq files is written to ``<output>kmerinput.txt``, kmergenie is
    executed with its output captured in ``kmergenie.stdOut`` /
    ``kmergenie.stdErr`` and, on success, ``histograms.dat`` is plotted.
    The working directory in effect at call time is always restored.

    Args:
        global_config: dict providing ``["Tools"]["kmergenie"]["bin"]`` and
            ``["Tools"]["kmergenie"]["options"]``.
        sample_config: dict with ``"commands"``; optional ``"output"``,
            ``"threads"`` and ``"kmergenie"`` (options overriding the
            global ones). Updated in place.
        sorted_libraries_by_insert: iterable of ``(library, lib_info)``
            where ``lib_info`` has ``"pair1"`` and ``"pair2"``.

    Returns:
        ``sample_config`` with ``"kmergenie"`` set to the working folder.
    """
    maindir = os.getcwd()
    kmerdir = os.path.join(maindir, "kmergenie")
    if not os.path.exists(kmerdir):
        os.makedirs(kmerdir)
    os.chdir(kmerdir)
    try:
        # File listing the input fastq files for kmergenie
        kmer_input = os.path.join(kmerdir,
                "{}kmerinput.txt".format(sample_config.get("output", "")))

        program = global_config["Tools"]["kmergenie"]["bin"]
        program_options = global_config["Tools"]["kmergenie"]["options"]
        # Could be useful to add --diploid if sample is highly heterozygous
        if "kmergenie" in sample_config:
            program_options = sample_config["kmergenie"]

        # Kmergenie will spawn number_of_cores - 1 threads by default
        threads = sample_config.get("threads", "")

        cmd_list = [program, kmer_input]
        cmd_list.extend(option for option in program_options if option)
        if threads:
            # Flag and value must be separate argv entries: the list is
            # executed without a shell, so "-t N" would arrive as one word.
            cmd_list.extend(["-t", "{}".format(threads)])
        command = " ".join(cmd_list)
        common.print_command(command)
        sample_config["commands"] += "\n" + common.get_command_str(command)

        if not common.check_dryrun(sample_config):
            with open(kmer_input, "w") as f:
                for lib, lib_info in sorted_libraries_by_insert:
                    f.write(lib_info["pair1"] + "\n")
                    # Single-end libraries have no second read file
                    if lib_info["pair2"] is not None:
                        f.write(lib_info["pair2"] + "\n")

            with open("kmergenie.stdOut", "w") as stdOut, \
                    open("kmergenie.stdErr", "w") as stdErr:
                returnValue = subprocess.call(cmd_list, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                print("error while running command: {}".format(command))
            else:
                _kmergenie_plot("histograms.dat")
        sample_config["kmergenie"] = kmerdir
    finally:
        os.chdir(maindir)
    return sample_config
Example #12
0
def _run_kmergenie(global_config, sample_config, sorted_libraries_by_insert):
    """Runs kmergenie to establish a recommended kmer size for assembly

    All files are produced in a ``kmergenie`` folder under the current
    working directory. The command is printed and appended to
    ``sample_config["commands"]``. When this is not a dry run, the input
    fastq list is written to ``<output>kmerinput.txt``, kmergenie is run
    with output captured in ``kmergenie.stdOut`` / ``kmergenie.stdErr`` and
    ``histograms.dat`` is plotted if the run succeeded. The caller's working
    directory is restored on every exit path.

    Args:
        global_config: dict providing ``["Tools"]["kmergenie"]["bin"]`` and
            ``["Tools"]["kmergenie"]["options"]``.
        sample_config: dict with ``"commands"``; optional ``"output"``,
            ``"threads"`` and ``"kmergenie"`` (options overriding the
            global ones). Updated in place.
        sorted_libraries_by_insert: iterable of ``(library, lib_info)``
            where ``lib_info`` has ``"pair1"`` and ``"pair2"``.

    Returns:
        ``sample_config`` with ``"kmergenie"`` set to the working folder.
    """
    maindir = os.getcwd()
    kmerdir = os.path.join(maindir, "kmergenie")
    if not os.path.exists(kmerdir):
        os.makedirs(kmerdir)
    os.chdir(kmerdir)
    try:
        # File listing the input fastq files for kmergenie
        input_name = "{}kmerinput.txt".format(sample_config.get("output", ""))
        kmer_input = os.path.join(kmerdir, input_name)

        program = global_config["Tools"]["kmergenie"]["bin"]
        program_options = global_config["Tools"]["kmergenie"]["options"]
        # Could be useful to add --diploid if sample is highly heterozygous
        if "kmergenie" in sample_config:
            program_options = sample_config["kmergenie"]

        cmd_list = [program, kmer_input]
        cmd_list += [option for option in program_options if option]
        # Kmergenie will spawn number_of_cores - 1 threads by default
        threads = sample_config.get("threads", "")
        if threads:
            # Flag and value must be separate argv entries: the list is
            # executed without a shell, so "-t N" would arrive as one word.
            cmd_list += ["-t", "{}".format(threads)]
        command = " ".join(cmd_list)
        common.print_command(command)
        sample_config["commands"] += "\n" + common.get_command_str(command)

        if not common.check_dryrun(sample_config):
            with open(kmer_input, "w") as f:
                for lib, lib_info in sorted_libraries_by_insert:
                    # Single-end libraries have no second read file
                    reads = (lib_info["pair1"], lib_info["pair2"])
                    for read in reads:
                        if read is not None:
                            f.write(read + "\n")

            with open("kmergenie.stdOut", "w") as stdOut, \
                    open("kmergenie.stdErr", "w") as stdErr:
                returnValue = subprocess.call(cmd_list, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                print("error while running command: {}".format(command))
            else:
                _kmergenie_plot("histograms.dat")
        sample_config["kmergenie"] = kmerdir
    finally:
        os.chdir(maindir)
    return sample_config
Example #13
0
def _run_FRC(global_config, sample_config, sorted_libraries_by_insert):
    """Run FRCurve on the sample's alignments and plot the result.

    The step works inside an ``FRCurve`` folder under the current working
    directory. The first alignment is passed as the paired-end BAM and, when
    present, the second one as the mate-pair BAM; the maximum insert sizes
    are fixed at 5000 and 25000. The command is always printed; it is
    executed only when this is not a dry run and ``<output>_FRC.png`` does
    not exist, with output appended to ``FRC.stdOut`` / ``FRC.stdErr``.
    The working directory in effect at call time is always restored.

    Args:
        global_config: dict providing ``["Tools"]["FRC"]["bin"]``.
        sample_config: dict providing ``"genomeSize"``, ``"output"`` and
            ``"alignments"`` (a sequence whose entries carry the BAM path
            at index 1).
        sorted_libraries_by_insert: not used by this step.

    Returns:
        ``sample_config``.

    Raises:
        SystemExit: if FRCurve returns a non-zero exit status.
    """
    mainDir = os.getcwd()
    FRCurveFolder = os.path.join(mainDir, "FRCurve")
    if not os.path.exists(FRCurveFolder):
        os.makedirs(FRCurveFolder)
    os.chdir(FRCurveFolder)
    try:
        program = global_config["Tools"]["FRC"]["bin"]
        genomeSize = sample_config["genomeSize"]
        output = sample_config["output"]
        alignments = sample_config["alignments"]

        command = [program, "--pe-sam", alignments[0][1],
                   "--pe-max-insert", "5000"]
        if len(alignments) > 1:
            command += ["--mp-sam", alignments[1][1],
                        "--mp-max-insert", "25000"]
        command += ["--genome-size", "{}".format(genomeSize),
                    "--output", output]
        common.print_command(command)
        if not common.check_dryrun(sample_config) and not os.path.exists(
                "{}_FRC.png".format(output)):
            with open("FRC.stdOut", "a") as stdOut, \
                    open("FRC.stdErr", "a") as stdErr:
                returnValue = subprocess.call(command, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                sys.exit("error, while running FRCurve: {}".format(command))
            plotFRCurve(output)
    finally:
        # Also reached on SystemExit, so the cwd never leaks
        os.chdir(mainDir)
    return sample_config
Example #14
0
def _run_FRC(global_config, sample_config, sorted_libraries_by_insert):
    """Compute the feature-response curve with FRCurve and plot it.

    Everything runs inside an ``FRCurve`` folder under the current working
    directory. ``alignments[0]`` supplies the paired-end BAM and, if there
    is one, ``alignments[1]`` the mate-pair BAM; the maximum insert sizes
    are fixed at 5000 and 25000. The command is always printed; it is run
    only when this is not a dry run and ``<output>_FRC.png`` is missing,
    with output appended to ``FRC.stdOut`` / ``FRC.stdErr``. The caller's
    working directory is restored on every exit path.

    Args:
        global_config: dict providing ``["Tools"]["FRC"]["bin"]``.
        sample_config: dict providing ``"genomeSize"``, ``"output"`` and
            ``"alignments"`` (a sequence whose entries carry the BAM path
            at index 1).
        sorted_libraries_by_insert: not used by this step.

    Returns:
        ``sample_config``.

    Raises:
        SystemExit: if FRCurve returns a non-zero exit status.
    """
    mainDir = os.getcwd()
    FRCurveFolder = os.path.join(mainDir, "FRCurve")
    if not os.path.exists(FRCurveFolder):
        os.makedirs(FRCurveFolder)
    os.chdir(FRCurveFolder)
    try:
        program = global_config["Tools"]["FRC"]["bin"]
        genomeSize = sample_config["genomeSize"]
        output = sample_config["output"]
        alignments = sample_config["alignments"]

        peBam = alignments[0][1]
        command = [program, "--pe-sam", peBam, "--pe-max-insert", "5000"]
        if len(alignments) > 1:
            mpBam = alignments[1][1]
            command += ["--mp-sam", mpBam, "--mp-max-insert", "25000"]
        command += ["--genome-size", "{}".format(genomeSize)]
        command += ["--output", output]
        common.print_command(command)

        plot_file = "{}_FRC.png".format(output)
        if not common.check_dryrun(sample_config) \
                and not os.path.exists(plot_file):
            with open("FRC.stdOut", "a") as stdOut, \
                    open("FRC.stdErr", "a") as stdErr:
                returnValue = subprocess.call(command, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                sys.exit("error, while running FRCurve: {}".format(command))
            plotFRCurve(output)
    finally:
        # Also reached on SystemExit, so the cwd never leaks
        os.chdir(mainDir)
    return sample_config
Example #15
0
def _allpaths_wildcard_name(fqfile):
    """Return ``fqfile`` with its read-1 marker replaced by a ``?`` wildcard.

    Only the ``_1.fastq`` and ``_R1_`` naming schemes are recognised;
    ``None`` is returned for any other file name.
    """
    if "_1.fastq" in fqfile:
        return fqfile.replace("_1.fastq", "_?.fastq")
    if "_R1_" in fqfile:
        return fqfile.replace("_R1_", "_R?_")
    return None


def _run_allpaths(global_config, sample_config, sorted_libraries_by_insert):
    """Prepare the ALLPATHS input files and run the assembly.

    Inside an ``allpaths`` folder under the current working directory the
    function writes ``in_groups.csv`` and ``in_libs.csv``, runs
    ``PrepareAllPathsInputs.pl`` and then ``RunAllPathsLG``. On success the
    final contigs and scaffolds are copied to ``<output>.ctg.fasta`` and
    ``<output>.scf.fasta`` and ``data_dir`` is removed unless the
    ``keep_tmp_files`` flag is set. On dry runs both commands are only
    printed. Problems are reported with ``print`` and end the step early.
    The working directory in effect at call time is always restored.

    Args:
        global_config: dict providing ``["Tools"]["allpaths"]["bin"]``,
            ``["Tools"]["allpaths"]["options"]`` (empty or exactly
            ``["PLOIDY=2"]``) and ``["Tools"]["picard"]["bin"]``.
        sample_config: dict providing ``"output"``; optional ``"flags"``.
            The libraries are derived from it via
            ``common._sort_libraries_by_insert``.
        sorted_libraries_by_insert: ignored; the value is recomputed from
            ``sample_config``.

    Returns:
        ``sample_config``.
    """
    assembler = "allpaths"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure("allpaths", assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)
    try:
        # ---- write in_groups.csv and in_libs.csv
        # Per orientation: group-name prefix and in_libs row layout (the
        # frag_* columns are filled for inward pairs, the insert_* columns
        # for outward pairs).
        group_prefix = {"innie": "PE", "outtie": "MP"}
        lib_row = {
            "innie": "lib{0}, genome, genome, fragment, 1, "
                     "{0}, {1}, , , inward, 0, 0\n",
            "outtie": "lib{0}, genome, genome, fragment, 1, "
                      ", , {0}, {1}, outward, 0, 0\n",
        }
        lib_rows = []
        seen_inserts = set()  # one in_libs row per insert size
        group_name = 1
        with open("in_groups.csv", "w") as inGroups_file, \
                open("in_libs.csv", "w") as inLibs_file:
            inGroups_file.write("group_name, library_name, file_name\n")
            inLibs_file.write(
                "library_name, project_name, organism_name, type, "
                "paired, frag_size, frag_stddev, insert_size, insert_stddev, "
                "read_orientation,genomic_start, genomic_end\n")
            for library, libraryInfo in sorted_libraries_by_insert:
                read1 = libraryInfo["pair1"]
                orientation = libraryInfo["orientation"]
                insert = libraryInfo["insert"]
                std = libraryInfo["std"]
                if orientation not in group_prefix:
                    print("all paths support only innies and outties")
                    continue
                path, fqfile = os.path.split(read1)
                wildcard = _allpaths_wildcard_name(fqfile)
                if wildcard is None:
                    print("error file format not supported {}".format(fqfile))
                    return sample_config
                inGroups_file.write("{}{}, lib{}, {}\n".format(
                    group_prefix[orientation], group_name, insert,
                    os.path.join(path, wildcard)))
                group_name += 1
                if insert not in seen_inserts:
                    seen_inserts.add(insert)
                    lib_rows.append(lib_row[orientation].format(insert, std))
            inLibs_file.writelines(lib_rows)

        # ---- prepare the ALLPATHS inputs
        program = os.path.join(programBIN, "PrepareAllPathsInputs.pl")
        os.mkdir("data_dir")
        data_dir = os.path.join(assemblyDirectory, "data_dir")
        ploidy = "PLOIDY=1"
        if len(program_options) > 0:
            if len(program_options) > 1 or program_options[0] != "PLOIDY=2":
                print("Running ALlpaths only one parameter accepted as option",
                      "here: PLOIDY=2")
                return sample_config
            ploidy = "PLOIDY=2"

        command = [
            program, "DATA_DIR={}".format(data_dir), ploidy,
            "PICARD_TOOLS_DIR={}".format(
                global_config["Tools"]["picard"]["bin"]),
            "FORCE_PHRED=True", "PHRED_64=False",
            "IN_GROUPS_CSV={}".format(
                os.path.join(assemblyDirectory, "in_groups.csv")),
            "IN_LIBS_CSV={}".format(
                os.path.join(assemblyDirectory, "in_libs.csv"))
        ]
        common.print_command(command)
        if common.check_dryrun(sample_config):
            # Dry run: only show what the assembly command would look like
            program = os.path.join(programBIN, "RunAllPathsLG")
            command = [
                program, "PRE={}".format(assemblyDirectory),
                "REFERENCE_NAME=.", "DATA_SUBDIR=data_dir", "RUN=allpaths",
                "SUBDIR=run"
            ]
            common.print_command(command)
            return sample_config

        with open("allpaths_PrepareAllPathsInputs.stdOut", "w") as \
                assembler_stdOut, \
                open("allpaths_PrepareAllPathsInputs.stdErr", "w") as \
                assembler_stdErr:
            returnValue = subprocess.call(command,
                                          stdout=assembler_stdOut,
                                          stderr=assembler_stdErr)
        flags = sample_config.get("flags", [])
        if returnValue != 0:
            print("ALLPATHS PrepareAllPathInputs terminated with an error. "
                  "Please check running folder for more informations")
            return sample_config

        # ---- run the assembly
        program = os.path.join(programBIN, "RunAllPathsLG")
        command = [
            program, "PRE={}".format(assemblyDirectory), "REFERENCE_NAME=.",
            "DATA_SUBDIR=data_dir", "RUN=allpaths", "SUBDIR=run",
            "HAPLOIDIFY=True"
        ]
        common.print_command(command)
        with open("allpaths_RunAllPathsLG.stdOut", "w") as assembler_stdOut, \
                open("allpaths_RunAllPathsLG.stdErr", "w") as assembler_stdErr:
            returnValue = subprocess.call(command,
                                          stdout=assembler_stdOut,
                                          stderr=assembler_stdErr)
        if returnValue != 0:
            print("ALLPATHS RunAllPathsLG terminated with an error. Please",
                  "check running folder for more informations")
            return sample_config

        # ---- save the results
        assembly_dir = os.path.join("data_dir", "allpaths", "ASSEMBLIES",
                                    "run")
        if not os.path.exists(
                os.path.join(assembly_dir, "final.assembly.fasta")):
            print(
                "something wrong with Allpaths > no contig file generated")
            return sample_config
        exit_code = subprocess.call([
            "cp",
            os.path.join(assembly_dir, "final.contigs.fasta"),
            "{}.ctg.fasta".format(outputName)
        ])
        exit_code += subprocess.call([
            "cp",
            os.path.join(assembly_dir, "final.assembly.fasta"),
            "{}.scf.fasta".format(outputName)
        ])
        # Only clean up once both result files were copied successfully
        if "keep_tmp_files" not in flags and exit_code == 0:
            subprocess.call(["rm", "-r", "data_dir"])
    finally:
        # Every exit path, early returns included, leaves the assembly folder
        os.chdir(currentDirectory)
    return sample_config
Example #16
0
def _run_qaTools(global_config, sample_config, sorted_libraries_by_insert):
    """Run qaTools on the first alignment and append a GC column to its table.

    The work happens inside a ``QAstats`` folder under the current directory
    (created when missing); the function steps back to the parent directory
    before returning.  Unless this is a dry run or ``<BAM name>.cov`` already
    exists, the qaTools executable is run, its tab-separated table is merged
    with a per-contig ``computeGC`` value taken from the reference FASTA into
    ``<BAM name>.cov.gc``, and that file is passed to ``plotQA``.

    Args:
        global_config: configuration dict; ``["Tools"]["qaTools"]["bin"]`` is
            the executable that gets run.
        sample_config: sample dict; ``"reference"`` (FASTA path) and
            ``"alignments"`` are read.  The BAM path is item 1 of the first
            alignment entry.
        sorted_libraries_by_insert: not used by this function.

    Returns:
        ``sample_config``, unmodified.

    Raises:
        SystemExit: if qaTools exits with a non-zero status, or if a contig
            of the reference is missing from the qaTools table.
    """

    def contig_name(header_line):
        # Text after ">" up to the first space or tab.
        name = header_line.split(" ")[0]
        name = name.split("\t")[0]
        return name.split(">")[1]

    def write_record(out_fd, coverage, name, chunks):
        # One output row: the three qaTools columns plus the GC value.
        GC = computeGC("".join(chunks))
        if name not in coverage:
            sys.exit("error while parsing QAcompute output: probably "
                     "some wired contig name is present in your "
                     "assmebly file")
        columns = coverage[name]
        out_fd.write("{}\t{}\t{}\t{}\t{}\n".format(
            name, columns[0], columns[1], columns[2], GC))

    qaToolsFolder = os.path.join(os.getcwd(), "QAstats")
    if not os.path.exists(qaToolsFolder):
        os.makedirs(qaToolsFolder)
    os.chdir("QAstats")
    program = global_config["Tools"]["qaTools"]["bin"]

    reference = sample_config["reference"]
    BAMfile = sample_config["alignments"][0][1]
    cov_file = "{}.cov".format(os.path.basename(BAMfile))

    command = ["{}".format(program), "-m", "-q", "0", "-i", BAMfile, cov_file]
    common.print_command(command)
    if not common.check_dryrun(sample_config) and not os.path.exists(
            cov_file):
        # The context manager closes both log files once qaTools is done.
        with open("QAtools.stdOut", "a") as stdOut, \
                open("QAtools.stdErr", "a") as stdErr:
            returnValue = subprocess.call(command, stdout=stdOut,
                                          stderr=stdErr)
        if returnValue != 0:
            sys.exit("error, while running QAtools: {}".format(command))

        # Load the qaTools table: contig name -> next three columns.
        QAtools_dict = {}
        with open(cov_file, "r") as QA_csv:
            header = QA_csv.readline().rstrip()
            for line in QA_csv:
                fields = line.strip().split("\t")
                if fields == [""]:
                    # blank line (e.g. at the end of the file): nothing to do
                    continue
                QAtools_dict[fields[0]] = [fields[1], fields[2], fields[3]]

        # now add GC content
        QA_GC_file = "{}.cov.gc".format(os.path.basename(BAMfile))
        with open(QA_GC_file, "w") as QA_GC_fd:
            QA_GC_fd.write("{}\tGCperc\n".format(header))
            with open(reference, "r") as ref_fd:
                fasta_header = contig_name(ref_fd.readline().strip())
                # Sequence lines are collected in a list and joined once per
                # contig instead of growing a string line by line.
                chunks = []
                for line in ref_fd:
                    line = line.strip()
                    if line.startswith(">"):
                        # A new header closes the previous contig.
                        write_record(QA_GC_fd, QAtools_dict, fasta_header,
                                     chunks)
                        chunks = []
                        fasta_header = contig_name(line)
                    else:
                        chunks.append(line)
                # The last contig has no following header to flush it.
                write_record(QA_GC_fd, QAtools_dict, fasta_header, chunks)
        plotQA(QA_GC_file)
    os.chdir("..")
    return sample_config
Example #17
0
def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
    """Build and run an ``abyss-pe`` command for the sample's libraries.

    The work happens inside ``<cwd>/abyss`` (set up through
    ``_prepare_folder_structure``).  Once that folder has been entered, every
    exit path steps back to the parent directory before returning.  After a
    successful run the ``<output>-contigs.fa`` and ``<output>-scaffolds.fa``
    files found in ``runABySS`` are copied to ``<output>.ctg.fasta`` and
    ``<output>.scf.fasta``, and ``runABySS`` is removed unless
    ``"keep_tmp_files"`` is listed in ``sample_config["flags"]``.

    Args:
        global_config: configuration dict; ``["Tools"]["abyss"]["bin"]`` is
            the folder that contains ``abyss-pe``.
        sample_config: sample dict; ``"output"`` is read, ``"threads"``
            (default 8), ``"kmer"`` (default 54) and ``"flags"`` are optional.
        sorted_libraries_by_insert: ignored; the list is recomputed with
            ``common._sort_libraries_by_insert(sample_config)``.

    Returns:
        ``sample_config``, unmodified.
    """

    def describe(kind, groups):
        # Returns the "<kind>='libA libB ' " list of group names and the
        # matching " libA='reads' " definitions, ordered by group name.
        ordered = sorted(groups.items())
        names = "".join("lib{} ".format(name) for name, _ in ordered)
        reads = "".join(
            " lib{}=\'{}\' ".format(name, files) for name, files in ordered)
        return "{}=\'{}\' ".format(kind, names), reads

    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "abyss"
    outputName = sample_config["output"]
    assemblyDirectory = os.path.join(os.getcwd(), assembler)
    # for abyss "bin" is the folder holding abyss-pe, not an executable
    programBIN = global_config["Tools"][assembler]["bin"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)
    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    program = os.path.join(programBIN, "abyss-pe")
    threads = 8  # default for UPPMAX
    if "threads" in sample_config:
        threads = sample_config["threads"]
    kmer = 54
    if "kmer" in sample_config:
        kmer = sample_config["kmer"]
    command = "{} np={} k={} name={} ".format(program, threads, kmer,
                                              outputName)

    single_end = []
    paired_end = {}
    mate_pair = {}
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        if orientation == "innie" or orientation == "none":
            if read2 is None:
                single_end.append(read1)
            else:
                # paired-end groups are keyed by the insert size
                paired_end.setdefault(insert, "")
                paired_end[insert] += "{} {} ".format(read1, read2)
        else:
            libName = format(insert)
            mate_pair.setdefault(libName, "")
            mate_pair[libName] += "{} {} ".format(read1, read2)

    # now create the command
    # NOTE(review): paired-end and mate-pair groups are both named
    # lib<insert>, so a PE and an MP library with the same insert size end up
    # with the same name on the command line -- confirm this cannot happen.
    librariesSE = ""
    librariesPE = ""
    librariesMP = ""
    if single_end:
        # Files are separated by a space; previously several single-end
        # files were glued together into one unusable name.
        librariesSE = "se=\'{}\'".format(" ".join(single_end))
    if paired_end:
        lib, librariesPE = describe("lib", paired_end)
        command += "{} ".format(lib)
    if mate_pair:
        mp, librariesMP = describe("mp", mate_pair)
        command += "{} ".format(mp)

    command += "{} ".format(librariesSE)
    command += "{} ".format(librariesPE)
    command += "{} ".format(librariesMP)

    common.print_command(command)
    if common.check_dryrun(sample_config):
        os.chdir("..")
        return sample_config

    os.makedirs(os.path.join(assemblyDirectory, "runABySS"))
    # The logs live in the assembly folder (opened before entering runABySS)
    # and are closed as soon as abyss-pe has finished.
    with open("abyss.stdOut", "a") as assembler_stdOut, \
            open("abyss.stdErr", "a") as assembler_stdErr:
        os.chdir("runABySS")
        returnValue = subprocess.call(command,
                                      stdout=assembler_stdOut,
                                      stderr=assembler_stdErr,
                                      shell=True)
        os.chdir("..")

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        contigs = os.path.join("runABySS", "{}-contigs.fa".format(outputName))
        scaffolds = os.path.join("runABySS",
                                 "{}-scaffolds.fa".format(outputName))
        if os.path.exists(contigs):
            subprocess.call(["cp", contigs, "{}.ctg.fasta".format(outputName)])
            subprocess.call(
                ["cp", scaffolds, "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runABySS"])
        else:
            # No early return here: fall through so the working directory is
            # restored like on every other path.
            print("something wrong with ABySS -> no contig file generated")
    else:
        print("ABySS terminated with an error. Please check running folder",
              "for more informations")
    os.chdir("..")
    return sample_config
Example #18
0
def _run_spades(global_config, sample_config, sorted_libraries_by_insert):
    """Build and run a SPAdes command line for the sample's libraries.

    The work happens inside ``<cwd>/spades`` (set up through
    ``_prepare_folder_structure``).  Once that folder has been entered, every
    exit path -- dry run included -- steps back to the parent directory.
    After a successful run ``contigs.fasta`` and ``scaffolds.fasta`` from the
    ``<output>`` folder are copied to ``<output>.ctg.fasta`` and
    ``<output>.scf.fasta``, and the ``<output>`` folder is removed unless
    ``"keep_tmp_files"`` is listed in ``sample_config["flags"]``.

    Args:
        global_config: configuration dict; ``["Tools"]["spades"]["bin"]`` is
            used as the command itself and ``["options"]`` is an iterable of
            extra arguments appended right after it.
        sample_config: sample dict; ``"output"`` is read and ``"flags"`` is
            optional.
        sorted_libraries_by_insert: ignored; the list is recomputed with
            ``common._sort_libraries_by_insert(sample_config)``.

    Returns:
        ``sample_config``, unmodified.
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "spades"
    outputName = sample_config["output"]
    assemblyDirectory = os.path.join(os.getcwd(), assembler)
    # for spades "bin" is the command that gets executed
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)
    ########### HERE IT START THE SPECIFIC ASSEMBLER PART

    command = "{} ".format(programBIN)
    command += "".join("{} ".format(option) for option in program_options)

    # creates the command on-the-fly; --pe and --mp libraries are numbered
    # independently, starting from 1
    peLibrary = 1
    mpLibrary = 1
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        if orientation == "innie" or orientation == "none":
            if read2 is None:
                command += "--pe{}-s {} ".format(peLibrary, read1)
            else:
                command += "--pe{}-1 {} --pe{}-2 {} ".format(
                    peLibrary, read1, peLibrary, read2)
            peLibrary += 1
        elif orientation == "outtie":
            command += "--mp{}-1 {} --mp{}-2 {} ".format(
                mpLibrary, read1, mpLibrary, read2)
            mpLibrary += 1
        else:
            # format() is applied to the whole message; it used to be bound
            # to the second print argument only, leaving a literal "{}".
            print("orientation {} not supported.... why the program did "
                  "not failed earlier?".format(orientation))

    command += "-o {} ".format(outputName)
    common.print_command(command)
    if common.check_dryrun(sample_config):
        # leave the assembly folder on a dry run as well
        os.chdir("..")
        return sample_config

    # The context manager closes both logs once SPAdes has finished.
    with open("spades.stdOut", "a") as assembler_stdOut, \
            open("spades.stdErr", "a") as assembler_stdErr:
        returnValue = subprocess.call(command,
                                      stdout=assembler_stdOut,
                                      stderr=assembler_stdErr,
                                      shell=True)

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        contigs = os.path.join(outputName, "contigs.fasta")
        if os.path.exists(contigs):
            subprocess.call(["cp", contigs, "{}.ctg.fasta".format(outputName)])
            subprocess.call([
                "cp",
                os.path.join(outputName, "scaffolds.fasta"),
                "{}.scf.fasta".format(outputName)
            ])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", outputName])
        else:
            print("something wrong with SPADES -> no contig file generated")
    else:
        print("SPADES terminated with an error. Please check running folder",
              "for more informations")

    os.chdir("..")
    return sample_config
Example #19
0
def align_bwa_mem(global_config, read1, read2, reference, threads, dryrun):
    """Align reads with ``bwa mem`` and return the path of the sorted BAM.

    A folder named after the library (the basename of ``read1`` up to
    ``_1.fastq`` for paired input, or ``.fastq`` for single-end input) is
    created under the current directory and used as working directory; the
    function steps back to the parent directory before returning.  Steps
    whose output file already exists are skipped, and when the sorted BAM is
    already there it is returned immediately.

    Args:
        global_config: configuration dict; ``["Tools"]["bwa"]["bin"]`` and
            ``["Tools"]["samtools"]["bin"]`` override the ``bwa`` and
            ``samtools`` commands looked up with ``common.which``.
        read1: first (or only) FASTQ file.
        read2: second FASTQ file of the pair; a falsy value (e.g. ``None``)
            means single-end input.
        reference: reference FASTA handed to ``bwa mem``.
        threads: thread count passed to ``bwa mem -t`` and
            ``samtools sort -@``.
        dryrun: when true, commands are printed but not executed and no log
            file is written.

    Returns:
        Absolute path of ``<library>_to_<reference>.bam`` (the file is not
        guaranteed to exist, e.g. after a dry run).

    Raises:
        SystemExit: if bwa or samtools is neither configured nor found by
            ``common.which``.
    """
    aligner = "bwa"
    if "bwa" in global_config["Tools"]:
        aligner = global_config["Tools"]["bwa"]["bin"]
    elif not common.which("bwa"):
        sys.exit("error while trying to run  bwa mem: bwa not present in the "
                 "path and not in global config, please make sure to install "
                 "bwa properly")

    samtools = "samtools"
    if "samtools" in global_config["Tools"]:
        samtools = global_config["Tools"]["samtools"]["bin"]
    elif not common.which("samtools"):
        # The message now names samtools (it was a copy of the bwa one).
        sys.exit("error while trying to run  samtools: samtools not present "
                 "in the path and not in global config, please make sure to "
                 "install samtools properly")

    # extract base name
    if read2:
        libraryBase = os.path.basename(read1).split("_1.fastq")[0]
    else:
        libraryBase = os.path.basename(read1).split(".fastq")[0]

    if not os.path.exists(libraryBase):
        os.makedirs(libraryBase)
    os.chdir(libraryBase)
    mappingBase = "{}_to_{}".format(
        libraryBase, os.path.basename(reference).split(".fasta")[0])
    BAMsorted = "{}.bam".format(mappingBase)
    BAMunsorted = "{}.unsorted.bam".format(mappingBase)
    if os.path.exists(os.path.abspath(BAMsorted)):
        # Nothing to do: the sorted alignment is already available.
        BAMsorted = os.path.abspath(BAMsorted)
        os.chdir("..")
        return BAMsorted

    bwa_mem_command = [aligner, "mem", "-M", "-t", "{}".format(threads),
                       reference, read1]
    if read2:
        # Only paired input has a second file; joining a None into the
        # command line used to raise TypeError for single-end libraries.
        bwa_mem_command.append(read2)
    samtools_view_command = [samtools, "view", "-b", "-S", "-u", "-"]

    if not os.path.exists(BAMunsorted):
        command = "{} | {} > {}".format(" ".join(bwa_mem_command),
                                        " ".join(samtools_view_command),
                                        BAMunsorted)
        common.print_command(command)
        if not dryrun:
            # Logs are opened only for a real run and closed afterwards.
            with open("bwa.stdOut", "w") as bwa_stdOut, \
                    open("bwa.stdErr", "w") as bwa_stdErr:
                subprocess.call(command, shell=True, stdout=bwa_stdOut,
                                stderr=bwa_stdErr)

    # The output is given as a prefix: the code expects samtools to write
    # "<mappingBase>.bam", i.e. BAMsorted.
    samtools_sort_command = [samtools, "sort", "-@", "{}".format(threads),
                             "-m", "1G", BAMunsorted, mappingBase]
    command = " ".join(samtools_sort_command)
    if not os.path.exists(BAMsorted):
        common.print_command(command)
        if not dryrun:
            with open("sam_sort.stdOut", "w") as stdOut, \
                    open("sam_sort.stdErr", "w") as stdErr:
                subprocess.call(command, shell=True, stdout=stdOut,
                                stderr=stdErr)

    # The unsorted intermediate is dropped only once the sorted file exists.
    if os.path.exists(BAMsorted) and os.path.exists(BAMunsorted):
        subprocess.call(["rm", BAMunsorted])
    BAMsorted = os.path.abspath(BAMsorted)
    os.chdir("..")
    return BAMsorted
Example #20
0
def _run_cabog(global_config, sample_config, sorted_libraries_by_insert):
    """Convert the libraries with ``fastqToCA`` and assemble with ``runCA``.

    The work happens inside ``<cwd>/cabog`` (set up through
    ``_prepare_folder_structure``).  Once that folder has been entered, every
    exit path -- dry run included -- steps back to the parent directory.
    One ``<output>_<n>.frg`` file is produced per library, then ``runCA`` is
    started on all ``*frg`` files.  After a successful run the
    ``<output>.ctg.fasta`` and ``<output>.scf.fasta`` files from
    ``runCABOGfolder/9-terminator`` are copied into the assembly folder and
    ``runCABOGfolder`` is removed unless ``"keep_tmp_files"`` is listed in
    ``sample_config["flags"]``.

    Args:
        global_config: configuration dict; ``["Tools"]["cabog"]["bin"]`` is
            the folder that contains ``fastqToCA`` and ``runCA``.
        sample_config: sample dict; ``"output"`` is read and ``"flags"`` is
            optional.
        sorted_libraries_by_insert: ignored; the list is recomputed with
            ``common._sort_libraries_by_insert(sample_config)``.

    Returns:
        ``sample_config``, unmodified.
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "cabog"
    outputName = sample_config["output"]
    assemblyDirectory = os.path.join(os.getcwd(), assembler)
    # for cabog "bin" is the folder holding the executables
    programBIN = global_config["Tools"][assembler]["bin"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)
    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    # NOTE(review): sys.path is Python's module search path, not the PATH
    # used to locate executables -- confirm this is what was intended.
    sys.path.insert(0, programBIN)
    for index, (library, libraryInfo) in enumerate(
            sorted_libraries_by_insert, 1):
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        parts = [
            os.path.join(programBIN, "fastqToCA"),
            " -libraryname ",
            " {}_{}".format(outputName, index),
            " -insertsize ",
            " {} {} ".format(insert, std),
            " -technology ",
            " illumina ",
            " -type ",
            " illumina ",
        ]
        if orientation == "innie" or orientation == "none":
            parts.append(" -innie ")
            if read2 is None:
                parts.extend([" -reads ", " {} ".format(read1)])
            else:
                parts.extend([" -mates ", " {},{} ".format(read1, read2)])
        elif orientation == "outtie":
            parts.extend(
                [" -outtie ", " -mates ", " {},{} ".format(read1, read2)])
        parts.extend([" > ", " {}_{}.frg ".format(outputName, index)])
        command_fastqToCA = "".join(parts)

        common.print_command(command_fastqToCA)
        if not common.check_dryrun(sample_config):
            # stdout is redirected into the .frg file by the shell, so only
            # stderr is captured in a log.
            with open("cabogfastqToCA.stdErr", "w") as cabog_stdErr:
                subprocess.call(command_fastqToCA, stderr=cabog_stdErr,
                                shell=True)

    command_runCA = os.path.join(programBIN, "runCA")
    command_runCA += "  -d runCABOGfolder -p {} *frg".format(outputName)
    common.print_command(command_runCA)
    if common.check_dryrun(sample_config):
        # leave the assembly folder on a dry run as well
        os.chdir("..")
        return sample_config

    # The context manager closes both logs once runCA has finished.
    with open("cabog_runCA.stdOut", "w") as cabog_stdOut, \
            open("cabog_runCA.stdErr", "w") as cabog_stdErr:
        returnValue = subprocess.call(command_runCA,
                                      stdout=cabog_stdOut,
                                      stderr=cabog_stdErr,
                                      shell=True)

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        # assembly succeeded: save the assembly and remove temporary files
        results = os.path.join("runCABOGfolder", "9-terminator")
        contigs = os.path.join(results, "{}.ctg.fasta".format(outputName))
        scaffolds = os.path.join(results, "{}.scf.fasta".format(outputName))
        if os.path.exists(contigs):
            subprocess.call(["cp", contigs, "{}.ctg.fasta".format(outputName)])
            subprocess.call(
                ["cp", scaffolds, "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runCABOGfolder"])
        else:
            print("something wrong with CABOG -> no contig file generated")
    else:
        print("CABOG terminated with an error. Please check running folder",
              "for more informations")
    os.chdir("..")
    return sample_config
Example #21
0
def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
    """Build and run an ``abyss-pe`` command for the sample's libraries.

    The work happens inside ``<cwd>/abyss`` (set up through
    ``_prepare_folder_structure``).  Once that folder has been entered, every
    exit path steps back to the parent directory before returning.  After a
    successful run the ``<output>-contigs.fa`` and ``<output>-scaffolds.fa``
    files found in ``runABySS`` are copied to ``<output>.ctg.fasta`` and
    ``<output>.scf.fasta``, and ``runABySS`` is removed unless
    ``"keep_tmp_files"`` is listed in ``sample_config["flags"]``.

    Args:
        global_config: configuration dict; ``["Tools"]["abyss"]["bin"]`` is
            the folder that contains ``abyss-pe``.
        sample_config: sample dict; ``"output"`` is read, ``"threads"``
            (default 8), ``"kmer"`` (default 54) and ``"flags"`` are optional.
        sorted_libraries_by_insert: ignored; the list is recomputed with
            ``common._sort_libraries_by_insert(sample_config)``.

    Returns:
        ``sample_config``, unmodified.
    """

    def describe(kind, groups):
        # Returns the "<kind>='libA libB ' " list of group names and the
        # matching " libA='reads' " definitions, ordered by group name.
        ordered = sorted(groups.items())
        names = "".join("lib{} ".format(name) for name, _ in ordered)
        reads = "".join(
            " lib{}=\'{}\' ".format(name, files) for name, files in ordered)
        return "{}=\'{}\' ".format(kind, names), reads

    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "abyss"
    outputName = sample_config["output"]
    assemblyDirectory = os.path.join(os.getcwd(), assembler)
    # for abyss "bin" is the folder holding abyss-pe, not an executable
    programBIN = global_config["Tools"][assembler]["bin"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)
    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    program = os.path.join(programBIN, "abyss-pe")
    threads = 8  # default for UPPMAX
    if "threads" in sample_config:
        threads = sample_config["threads"]
    kmer = 54
    if "kmer" in sample_config:
        kmer = sample_config["kmer"]
    command = "{} np={} k={} name={} ".format(program, threads, kmer,
                                              outputName)

    single_end = []
    paired_end = {}
    mate_pair = {}
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        if orientation == "innie" or orientation == "none":
            if read2 is None:
                single_end.append(read1)
            else:
                # paired-end groups are keyed by the insert size
                paired_end.setdefault(insert, "")
                paired_end[insert] += "{} {} ".format(read1, read2)
        else:
            libName = format(insert)
            mate_pair.setdefault(libName, "")
            mate_pair[libName] += "{} {} ".format(read1, read2)

    # now create the command
    # NOTE(review): paired-end and mate-pair groups are both named
    # lib<insert>, so a PE and an MP library with the same insert size end up
    # with the same name on the command line -- confirm this cannot happen.
    librariesSE = ""
    librariesPE = ""
    librariesMP = ""
    if single_end:
        # Files are separated by a space; previously several single-end
        # files were glued together into one unusable name.
        librariesSE = "se=\'{}\'".format(" ".join(single_end))
    if paired_end:
        lib, librariesPE = describe("lib", paired_end)
        command += "{} ".format(lib)
    if mate_pair:
        mp, librariesMP = describe("mp", mate_pair)
        command += "{} ".format(mp)

    command += "{} ".format(librariesSE)
    command += "{} ".format(librariesPE)
    command += "{} ".format(librariesMP)

    common.print_command(command)
    if common.check_dryrun(sample_config):
        os.chdir("..")
        return sample_config

    os.makedirs(os.path.join(assemblyDirectory, "runABySS"))
    # The logs live in the assembly folder (opened before entering runABySS)
    # and are closed as soon as abyss-pe has finished.
    with open("abyss.stdOut", "a") as assembler_stdOut, \
            open("abyss.stdErr", "a") as assembler_stdErr:
        os.chdir("runABySS")
        returnValue = subprocess.call(command,
                                      stdout=assembler_stdOut,
                                      stderr=assembler_stdErr,
                                      shell=True)
        os.chdir("..")

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        contigs = os.path.join("runABySS", "{}-contigs.fa".format(outputName))
        scaffolds = os.path.join("runABySS",
                                 "{}-scaffolds.fa".format(outputName))
        if os.path.exists(contigs):
            subprocess.call(["cp", contigs, "{}.ctg.fasta".format(outputName)])
            subprocess.call(
                ["cp", scaffolds, "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runABySS"])
        else:
            # No early return here: fall through so the working directory is
            # restored like on every other path.
            print("something wrong with ABySS -> no contig file generated")
    else:
        print("ABySS terminated with an error. Please check running folder",
              "for more informations")
    os.chdir("..")
    return sample_config
Example #22
0
def _run_cabog(global_config, sample_config, sorted_libraries_by_insert):
    """Convert the libraries with ``fastqToCA`` and assemble with ``runCA``.

    The work happens inside ``<cwd>/cabog`` (set up through
    ``_prepare_folder_structure``).  Once that folder has been entered, every
    exit path -- dry run included -- steps back to the parent directory.
    One ``<output>_<n>.frg`` file is produced per library, then ``runCA`` is
    started on all ``*frg`` files.  After a successful run the
    ``<output>.ctg.fasta`` and ``<output>.scf.fasta`` files from
    ``runCABOGfolder/9-terminator`` are copied into the assembly folder and
    ``runCABOGfolder`` is removed unless ``"keep_tmp_files"`` is listed in
    ``sample_config["flags"]``.

    Args:
        global_config: configuration dict; ``["Tools"]["cabog"]["bin"]`` is
            the folder that contains ``fastqToCA`` and ``runCA``.
        sample_config: sample dict; ``"output"`` is read and ``"flags"`` is
            optional.
        sorted_libraries_by_insert: ignored; the list is recomputed with
            ``common._sort_libraries_by_insert(sample_config)``.

    Returns:
        ``sample_config``, unmodified.
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "cabog"
    outputName = sample_config["output"]
    assemblyDirectory = os.path.join(os.getcwd(), assembler)
    # for cabog "bin" is the folder holding the executables
    programBIN = global_config["Tools"][assembler]["bin"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)
    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    # NOTE(review): sys.path is Python's module search path, not the PATH
    # used to locate executables -- confirm this is what was intended.
    sys.path.insert(0, programBIN)
    for index, (library, libraryInfo) in enumerate(
            sorted_libraries_by_insert, 1):
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        parts = [
            os.path.join(programBIN, "fastqToCA"),
            " -libraryname ",
            " {}_{}".format(outputName, index),
            " -insertsize ",
            " {} {} ".format(insert, std),
            " -technology ",
            " illumina ",
            " -type ",
            " illumina ",
        ]
        if orientation == "innie" or orientation == "none":
            parts.append(" -innie ")
            if read2 is None:
                parts.extend([" -reads ", " {} ".format(read1)])
            else:
                parts.extend([" -mates ", " {},{} ".format(read1, read2)])
        elif orientation == "outtie":
            parts.extend(
                [" -outtie ", " -mates ", " {},{} ".format(read1, read2)])
        parts.extend([" > ", " {}_{}.frg ".format(outputName, index)])
        command_fastqToCA = "".join(parts)

        common.print_command(command_fastqToCA)
        if not common.check_dryrun(sample_config):
            # stdout is redirected into the .frg file by the shell, so only
            # stderr is captured in a log.
            with open("cabogfastqToCA.stdErr", "w") as cabog_stdErr:
                subprocess.call(command_fastqToCA, stderr=cabog_stdErr,
                                shell=True)

    command_runCA = os.path.join(programBIN, "runCA")
    command_runCA += "  -d runCABOGfolder -p {} *frg".format(outputName)
    common.print_command(command_runCA)
    if common.check_dryrun(sample_config):
        # leave the assembly folder on a dry run as well
        os.chdir("..")
        return sample_config

    # The context manager closes both logs once runCA has finished.
    with open("cabog_runCA.stdOut", "w") as cabog_stdOut, \
            open("cabog_runCA.stdErr", "w") as cabog_stdErr:
        returnValue = subprocess.call(command_runCA,
                                      stdout=cabog_stdOut,
                                      stderr=cabog_stdErr,
                                      shell=True)

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        # assembly succeeded: save the assembly and remove temporary files
        results = os.path.join("runCABOGfolder", "9-terminator")
        contigs = os.path.join(results, "{}.ctg.fasta".format(outputName))
        scaffolds = os.path.join(results, "{}.scf.fasta".format(outputName))
        if os.path.exists(contigs):
            subprocess.call(["cp", contigs, "{}.ctg.fasta".format(outputName)])
            subprocess.call(
                ["cp", scaffolds, "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runCABOGfolder"])
        else:
            print("something wrong with CABOG -> no contig file generated")
    else:
        print("CABOG terminated with an error. Please check running folder",
              "for more informations")
    os.chdir("..")
    return sample_config
Example #23
0
def _run_qaTools(global_config, sample_config, sorted_libraries_by_insert):
    """Run qaTools on the first alignment and append a GC column to its table.

    The work happens inside a ``QAstats`` folder under the current directory
    (created when missing); the function steps back to the parent directory
    before returning.  Unless this is a dry run or ``<BAM name>.cov`` already
    exists, the qaTools executable is run, its tab-separated table is merged
    with a per-contig ``computeGC`` value taken from the reference FASTA into
    ``<BAM name>.cov.gc``, and that file is passed to ``plotQA``.

    Args:
        global_config: configuration dict; ``["Tools"]["qaTools"]["bin"]`` is
            the executable that gets run.
        sample_config: sample dict; ``"reference"`` (FASTA path) and
            ``"alignments"`` are read.  The BAM path is item 1 of the first
            alignment entry.
        sorted_libraries_by_insert: not used by this function.

    Returns:
        ``sample_config``, unmodified.

    Raises:
        SystemExit: if qaTools exits with a non-zero status, or if a contig
            of the reference is missing from the qaTools table.
    """

    def contig_name(header_line):
        # Text after ">" up to the first space or tab.
        name = header_line.split(" ")[0]
        name = name.split("\t")[0]
        return name.split(">")[1]

    def write_record(out_fd, coverage, name, chunks):
        # One output row: the three qaTools columns plus the GC value.
        GC = computeGC("".join(chunks))
        if name not in coverage:
            sys.exit("error while parsing QAcompute output: probably "
                     "some wired contig name is present in your "
                     "assmebly file")
        columns = coverage[name]
        out_fd.write("{}\t{}\t{}\t{}\t{}\n".format(
            name, columns[0], columns[1], columns[2], GC))

    qaToolsFolder = os.path.join(os.getcwd(), "QAstats")
    if not os.path.exists(qaToolsFolder):
        os.makedirs(qaToolsFolder)
    os.chdir("QAstats")
    program = global_config["Tools"]["qaTools"]["bin"]

    reference = sample_config["reference"]
    BAMfile = sample_config["alignments"][0][1]
    cov_file = "{}.cov".format(os.path.basename(BAMfile))

    command = ["{}".format(program), "-m", "-q", "0", "-i", BAMfile, cov_file]
    common.print_command(command)
    if not common.check_dryrun(sample_config) and not os.path.exists(
            cov_file):
        # The context manager closes both log files once qaTools is done.
        with open("QAtools.stdOut", "a") as stdOut, \
                open("QAtools.stdErr", "a") as stdErr:
            returnValue = subprocess.call(command, stdout=stdOut,
                                          stderr=stdErr)
        if returnValue != 0:
            sys.exit("error, while running QAtools: {}".format(command))

        # Load the qaTools table: contig name -> next three columns.
        QAtools_dict = {}
        with open(cov_file, "r") as QA_csv:
            header = QA_csv.readline().rstrip()
            for line in QA_csv:
                fields = line.strip().split("\t")
                if fields == [""]:
                    # blank line (e.g. at the end of the file): nothing to do
                    continue
                QAtools_dict[fields[0]] = [fields[1], fields[2], fields[3]]

        # now add GC content
        QA_GC_file = "{}.cov.gc".format(os.path.basename(BAMfile))
        with open(QA_GC_file, "w") as QA_GC_fd:
            QA_GC_fd.write("{}\tGCperc\n".format(header))
            with open(reference, "r") as ref_fd:
                fasta_header = contig_name(ref_fd.readline().strip())
                # Sequence lines are collected in a list and joined once per
                # contig instead of growing a string line by line.
                chunks = []
                for line in ref_fd:
                    line = line.strip()
                    if line.startswith(">"):
                        # A new header closes the previous contig.
                        write_record(QA_GC_fd, QAtools_dict, fasta_header,
                                     chunks)
                        chunks = []
                        fasta_header = contig_name(line)
                    else:
                        chunks.append(line)
                # The last contig has no following header to flush it.
                write_record(QA_GC_fd, QAtools_dict, fasta_header, chunks)
        plotQA(QA_GC_file)
    os.chdir("..")
    return sample_config
Example #24
0
def _write_masurca_config(sample_config, sorted_libraries_by_insert,
                          config_path="configuration.txt"):
    """Write the MaSuRCA DATA and PARAMETERS sections to ``config_path``."""
    # string.ascii_lowercase exists on both Python 2 and 3, whereas
    # string.lowercase was removed in Python 3.
    allTheLetters = string.ascii_lowercase
    libraryPE = "p"
    libraryPEnum = 0
    libraryMP = "m"
    libraryMPnum = 0
    with open(config_path, "w") as masurca_config_file:
        masurca_config_file.write("DATA\n")
        #TODO: single ended reads
        #TODO: only 26 letters are available per library type; a 27th
        #      library of the same type raises IndexError
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            insert = libraryInfo["insert"]
            std = libraryInfo["std"]
            if orientation == "innie":
                # innie libraries without a second read are skipped
                if read2 is not None:
                    masurca_config_file.write(
                        "PE = {}{} {} {} {} {}\n".format(
                            libraryPE, allTheLetters[libraryPEnum], insert,
                            std, read1, read2))
                    libraryPEnum += 1
            elif orientation == "outtie":
                masurca_config_file.write(
                    "JUMP = {}{} {} {} {} {}\n".format(
                        libraryMP, allTheLetters[libraryMPnum], insert, std,
                        read1, read2))
                libraryMPnum += 1
        masurca_config_file.write("END\n")
        masurca_config_file.write("\n")

        masurca_config_file.write("PARAMETERS\n")
        #this is k-mer size for deBruijn graph values between 25 and 101 are
        #supported, auto will compute the optimal size based on the read data
        #and GC content
        masurca_config_file.write("GRAPH_KMER_SIZE=auto\n")
        #set this to 1 for Illumina-only assemblies and to 0 if you have 2x or
        #more long (Sanger, 454) reads
        masurca_config_file.write("USE_LINKING_MATES=1\n")
        #this parameter is useful if you have too many jumping library mates.
        #See manual for explanation about settings based on genome length
        if sample_config["genomeSize"] > 10000000:
            masurca_config_file.write("LIMIT_JUMP_COVERAGE = 1000\n")
        else:
            masurca_config_file.write("LIMIT_JUMP_COVERAGE = 60\n")
        #these are the additional parameters to Celera Assembler.  do not
        #worry about performance, number or processors or batch sizes -- these
        #are computed automatically. for mammals do not set cgwErrorRate
        #above 0.15!!!
        if sample_config["genomeSize"] > 1500000000:
            cgwErrorRate = "0.15"
        else:
            cgwErrorRate = "0.25"
        # Built with format() rather than a backslash continuation inside the
        # literal, which used to embed the source indentation in the file.
        masurca_config_file.write(
            "CA_PARAMETERS = ovlMerSize=30 cgwErrorRate={} "
            "ovlMemory=4GB\n".format(cgwErrorRate))
        #number of cpus to use: 8 is the default for UPPMAX
        threads = sample_config.get("threads", 8)
        masurca_config_file.write("NUM_THREADS= {}\n".format(threads))
        #this is mandatory jellyfish hash size ---- jellyfish hash size,
        #set this to about 10x the genome size.
        JF_SIZE = sample_config["genomeSize"] * 11
        masurca_config_file.write("JF_SIZE={}\n".format(JF_SIZE))
        #this specifies if we do (1) or do not (0) want to trim long runs of
        #homopolymers (e.g. GGGGGGGG) from 3' read ends, use it for high GC
        #genomes
        masurca_config_file.write("DO_HOMOPOLYMER_TRIM=0\n")
        masurca_config_file.write("END\n")
        masurca_config_file.write("\n")


def _run_masurca(global_config, sample_config, sorted_libraries_by_insert):
    """Write a MaSuRCA configuration file and run the assembler.

    A ``masurca`` folder is prepared under the current working directory and
    ``configuration.txt`` is written into it.  Unless this is a dry run, the
    ``masurca`` executable and then the ``assemble.sh`` script it is expected
    to generate are run inside a ``runMASURCA`` sub-folder.  On success the
    contig and scaffold FASTA files are copied into the ``masurca`` folder as
    ``<output>.ctg.fasta`` and ``<output>.scf.fasta``.

    The working directory that was current on entry is restored on every
    exit path once the assembly folder has been entered.

    Args:
        global_config: dict; ``global_config["Tools"]["masurca"]`` must
            provide ``bin`` and ``options``.
        sample_config: dict; ``output`` and ``genomeSize`` are read, and
            ``threads`` and ``flags`` are honoured when present.
        sorted_libraries_by_insert: ignored; the libraries are re-derived
            from ``sample_config`` via ``common._sort_libraries_by_insert``.

    Returns:
        The ``sample_config`` object that was passed in.
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "masurca"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    # Not used below; the lookup is kept so that a missing "options" entry
    # still fails early with KeyError.
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)
    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    try:
        _write_masurca_config(sample_config, sorted_libraries_by_insert)

        if common.check_dryrun(sample_config):
            return sample_config

        with open("masurca.stdOut", "w") as masurca_stdOut, \
                open("masurca.stdErr", "w") as masurca_stdErr:
            os.mkdir("runMASURCA")
            os.chdir("runMASURCA")
            command = [os.path.join(programBIN, "bin/masurca"),
                       "../configuration.txt"]
            common.print_command(command)
            # the first step is only expected to generate assemble.sh
            subprocess.call(command, stdout=masurca_stdOut,
                            stderr=masurca_stdErr)
            if not os.path.exists("assemble.sh"):
                print("MaSuRCA: assemble.sh not created. Unknown failure")
                return sample_config
            command = ["./assemble.sh"]
            common.print_command(command)
            returnValue = subprocess.call(command,
                                          stdout=masurca_stdOut,
                                          stderr=masurca_stdErr)
        os.chdir(assemblyDirectory)

        if returnValue != 0:
            print("MaSuRCA terminated with an error. Please check running folder",
                  "for more informations")
            return sample_config

        flags = sample_config.get("flags", [])
        scaffolds = os.path.join("runMASURCA",
                                 "CA/10-gapclose/genome.scf.fasta")
        contigs = os.path.join("runMASURCA",
                               "CA/10-gapclose/genome.ctg.fasta")
        if os.path.exists(scaffolds):
            subprocess.call(["cp", contigs,
                             "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp", scaffolds,
                             "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runMASURCA"])
        else:
            print("something wrong with MaSuRCA -> no contig file generated")
    finally:
        # Every exit path, including the early returns above, must leave the
        # caller in the directory it started from.
        os.chdir(currentDirectory)
    return sample_config
Example #25
0
def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert):
    program = global_config["Tools"]["trimmomatic"]["bin"]
    program_folder = os.path.dirname(program)
    if "adapters" not in sample_config:
        sys.exit("running MP pipeline, adapters file to be used in trimming"
                 "are needed for Trimmomatic. Please specify them"
                 "in the sample configuration file and rerun")
    adapterFile = sample_config["adapters"]
    if not os.path.exists(adapterFile):
        sys.exit("Trimmomatic cannot be run as adapter file is not specified"
                 "or points to unknown position: {}".format(adapterFile))

    mainDirectory = os.getcwd()
    trimmomaticDir = os.path.join(mainDirectory, "Trimmomatic")
    if not os.path.exists(trimmomaticDir):
        os.makedirs(trimmomaticDir)
    os.chdir(trimmomaticDir)
    #now I am in running dir, I need to process one by one the libraries
    threads = 8
    if "threads" in sample_config:
        threads = sample_config["threads"]

    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        if read2 is not None:
            read1_baseName = os.path.split(read1)[1].split(".")[0]
            read2_baseName = os.path.split(read2)[1].split(".")[0]
            output_read1_pair = os.path.join(
                trimmomaticDir, "{}.fastq.gz".format(read1_baseName))
            output_read1_sing = os.path.join(
                trimmomaticDir, "{}_u.fastq.gz".format(read1_baseName))
            output_read2_pair = os.path.join(
                trimmomaticDir, "{}.fastq.gz".format(read2_baseName))
            output_read2_sing = os.path.join(
                trimmomaticDir, "{}_u.fastq.gz".format(read2_baseName))
            command = [
                "java", "-jar", program, "PE", "-threads",
                "{}".format(threads), "-phred33", read1, read2,
                output_read1_pair, output_read1_sing, output_read2_pair,
                output_read2_sing,
                "ILLUMINACLIP:{}:2:30:10".format(adapterFile), "LEADING:3",
                "TRAILING:3", "SLIDINGWINDOW:4:15", "MINLEN:30"
            ]
            common.print_command(command)
            sample_config["commands"] += "\n" + common.get_command_str(command)

            # do not execute is files have been already gennerated
            if not common.check_dryrun(sample_config) and not \
                    os.path.exists(output_read1_pair):
                stdOut = open("{}_trimmomatic.stdOut".format(read1_baseName),
                              "w")
                stdErr = open("{}_trimmomatic.stdErr".format(read1_baseName),
                              "w")
                returnValue = subprocess.call(command,
                                              stdout=stdOut,
                                              stderr=stdErr)  # run the program
                if returnValue != 0:
                    print("error while running command: {}".format(command))
            libraryInfo["pair1"] = output_read1_pair
            libraryInfo["pair2"] = output_read2_pair
            libraryInfo["trimmomatic"] = os.path.join(
                trimmomaticDir, "{}_trimmomatic.stdErr".format(read1_baseName))
    os.chdir(mainDirectory)
    return sample_config
Example #26
0
def _write_masurca_config(sample_config, sorted_libraries_by_insert,
                          config_path="configuration.txt"):
    """Write the MaSuRCA DATA and PARAMETERS sections to ``config_path``."""
    # string.ascii_lowercase exists on both Python 2 and 3, whereas
    # string.lowercase was removed in Python 3.
    allTheLetters = string.ascii_lowercase
    libraryPE = "p"
    libraryPEnum = 0
    libraryMP = "m"
    libraryMPnum = 0
    with open(config_path, "w") as masurca_config_file:
        masurca_config_file.write("DATA\n")
        #TODO: single ended reads
        #TODO: only 26 letters are available per library type; a 27th
        #      library of the same type raises IndexError
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            insert = libraryInfo["insert"]
            std = libraryInfo["std"]
            if orientation == "innie":
                # innie libraries without a second read are skipped
                if read2 is not None:
                    masurca_config_file.write(
                        "PE = {}{} {} {} {} {}\n".format(
                            libraryPE, allTheLetters[libraryPEnum], insert,
                            std, read1, read2))
                    libraryPEnum += 1
            elif orientation == "outtie":
                masurca_config_file.write(
                    "JUMP = {}{} {} {} {} {}\n".format(
                        libraryMP, allTheLetters[libraryMPnum], insert, std,
                        read1, read2))
                libraryMPnum += 1
        masurca_config_file.write("END\n")
        masurca_config_file.write("\n")

        masurca_config_file.write("PARAMETERS\n")
        #this is k-mer size for deBruijn graph values between 25 and 101 are
        #supported, auto will compute the optimal size based on the read data
        #and GC content
        masurca_config_file.write("GRAPH_KMER_SIZE=auto\n")
        #set this to 1 for Illumina-only assemblies and to 0 if you have 2x or
        #more long (Sanger, 454) reads
        masurca_config_file.write("USE_LINKING_MATES=1\n")
        #this parameter is useful if you have too many jumping library mates.
        #See manual for explanation about settings based on genome length
        if sample_config["genomeSize"] > 10000000:
            masurca_config_file.write("LIMIT_JUMP_COVERAGE = 1000\n")
        else:
            masurca_config_file.write("LIMIT_JUMP_COVERAGE = 60\n")
        #these are the additional parameters to Celera Assembler.  do not
        #worry about performance, number or processors or batch sizes -- these
        #are computed automatically. for mammals do not set cgwErrorRate
        #above 0.15!!!
        if sample_config["genomeSize"] > 1500000000:
            cgwErrorRate = "0.15"
        else:
            cgwErrorRate = "0.25"
        # Built with format() rather than a backslash continuation inside the
        # literal, which used to embed the source indentation in the file.
        masurca_config_file.write(
            "CA_PARAMETERS = ovlMerSize=30 cgwErrorRate={} "
            "ovlMemory=4GB\n".format(cgwErrorRate))
        #number of cpus to use: 8 is the default for UPPMAX
        threads = sample_config.get("threads", 8)
        masurca_config_file.write("NUM_THREADS= {}\n".format(threads))
        #this is mandatory jellyfish hash size ---- jellyfish hash size,
        #set this to about 10x the genome size.
        JF_SIZE = sample_config["genomeSize"] * 11
        masurca_config_file.write("JF_SIZE={}\n".format(JF_SIZE))
        #this specifies if we do (1) or do not (0) want to trim long runs of
        #homopolymers (e.g. GGGGGGGG) from 3' read ends, use it for high GC
        #genomes
        masurca_config_file.write("DO_HOMOPOLYMER_TRIM=0\n")
        masurca_config_file.write("END\n")
        masurca_config_file.write("\n")


def _run_masurca(global_config, sample_config,sorted_libraries_by_insert):
    """Write a MaSuRCA configuration file and run the assembler.

    A ``masurca`` folder is prepared under the current working directory and
    ``configuration.txt`` is written into it.  Unless this is a dry run, the
    ``masurca`` executable and then the ``assemble.sh`` script it is expected
    to generate are run inside a ``runMASURCA`` sub-folder.  On success the
    contig and scaffold FASTA files are copied into the ``masurca`` folder as
    ``<output>.ctg.fasta`` and ``<output>.scf.fasta``.

    The working directory that was current on entry is restored on every
    exit path once the assembly folder has been entered.

    Args:
        global_config: dict; ``global_config["Tools"]["masurca"]`` must
            provide ``bin`` and ``options``.
        sample_config: dict; ``output`` and ``genomeSize`` are read, and
            ``threads`` and ``flags`` are honoured when present.
        sorted_libraries_by_insert: ignored; the libraries are re-derived
            from ``sample_config`` via ``common._sort_libraries_by_insert``.

    Returns:
        The ``sample_config`` object that was passed in.
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "masurca"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    # Not used below; the lookup is kept so that a missing "options" entry
    # still fails early with KeyError.
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)
    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    try:
        _write_masurca_config(sample_config, sorted_libraries_by_insert)

        if common.check_dryrun(sample_config):
            return sample_config

        with open("masurca.stdOut", "w") as masurca_stdOut, \
                open("masurca.stdErr", "w") as masurca_stdErr:
            os.mkdir("runMASURCA")
            os.chdir("runMASURCA")
            command = [os.path.join(programBIN, "bin/masurca"),
                       "../configuration.txt"]
            common.print_command(command)
            # the first step is only expected to generate assemble.sh
            subprocess.call(command, stdout=masurca_stdOut,
                            stderr=masurca_stdErr)
            if not os.path.exists("assemble.sh"):
                print("MaSuRCA: assemble.sh not created. Unknown failure")
                return sample_config
            command = ["./assemble.sh"]
            common.print_command(command)
            returnValue = subprocess.call(command, stdout=masurca_stdOut,
                                          stderr=masurca_stdErr)
        os.chdir(assemblyDirectory)

        if returnValue != 0:
            print("MaSuRCA terminated with an error. Please check running folder",
                    "for more informations")
            return sample_config

        flags = sample_config.get("flags", [])
        scaffolds = os.path.join("runMASURCA",
                                 "CA/10-gapclose/genome.scf.fasta")
        contigs = os.path.join("runMASURCA",
                               "CA/10-gapclose/genome.ctg.fasta")
        if os.path.exists(scaffolds):
            subprocess.call(["cp", contigs,
                             "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp", scaffolds,
                             "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runMASURCA"])
        else:
            print("something wrong with MaSuRCA -> no contig file generated")
    finally:
        # Every exit path, including the early returns above, must leave the
        # caller in the directory it started from.
        os.chdir(currentDirectory)
    return sample_config
Example #27
0
def _merge_bam_files(global_config, sample_config, sorted_libraries_by_insert):
    BAMfiles = {};
    reference = sample_config["reference"]

    samtools = "samtools"
    if "samtools" in global_config["Tools"]:
        samtools = global_config["Tools"]["samtools"]["bin"]
    elif not common.which("samtools"):
        sys.exit("error while trying to run  samtools: bwa not present in the "
                "path and not in global config, please make sure to install "
                "bwa properly")

    numInserts = 0
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        alignment = libraryInfo["alignment"]
        if insert not in BAMfiles:
            BAMfiles[insert] = [alignment]
            numInserts += 1
        else:
            BAMfiles[insert].append(alignment)

    BAMfilesMerged = {}
    for insert, insertGroup in BAMfiles.items():
        dir_insert = "lib_{}".format(insert)
        if numInserts == 1:
            dir_insert = sample_config["output"]
        if not os.path.exists(dir_insert):
            os.makedirs(dir_insert)
        os.chdir(dir_insert)
        #check if file is already present
        bamMerged = "lib_{}.bam".format(insert)
        if numInserts == 1:
            bamMerged = "{}.bam".format(sample_config["output"])

        if os.path.exists(bamMerged):
            BAMfilesMerged[insert] = [os.path.abspath(bamMerged), dir_insert]
            os.chdir("..")
            continue # nothiing to be done for this insert

        if len(insertGroup) == 1: # only one sample file for this insert length
            cl = ["ln", "-s", insertGroup[0], bamMerged]
            returnValue = subprocess.call(cl)
            if  not returnValue == 0:
                sys.exit("error, while soft linking {}".format(insertGroup[0]))
        else:
            command = [samtools, "merge",bamMerged]
            for bamfile in insertGroup:
                command.append(bamfile)

            common.print_command(command)
            returnValue = 0
            if not common.check_dryrun(sample_config):
                returnValue = subprocess.call(command)
                if  not returnValue == 0:
                    sys.exit("error, while merging files {}".format(
                        insertGroup))
        BAMfilesMerged[insert] = [os.path.abspath(bamMerged), dir_insert]
        os.chdir("..")

    sorted_alignments_by_insert = []
    for key in sorted(BAMfilesMerged.keys()):
        sorted_alignments_by_insert.append([key, BAMfilesMerged[key][0],
            BAMfilesMerged[key][1]]) # memorise insert length, bam file, folder
    return sorted_alignments_by_insert
Example #28
0
def _run_soapdenovo(global_config, sample_config, sorted_libraries_by_insert):
    """Write a SOAPdenovo configuration file and run ``<bin> all`` on it.

    A ``soapdenovo`` folder is prepared under the current working directory,
    ``configuration.txt`` is written there and the assembler is launched from
    a ``runSOAP`` sub-folder.  On success the scaffold and contig files are
    copied into the ``soapdenovo`` folder as ``<output>.scf.fasta`` and
    ``<output>.ctg.fasta``.  On a dry run the command is only printed.

    Args:
        global_config: dict; ``global_config["Tools"]["soapdenovo"]`` must
            provide ``bin`` and ``options``.
        sample_config: dict; ``output`` is read, and ``kmer`` (default 54),
            ``threads`` (default 8) and ``flags`` are honoured when present.
        sorted_libraries_by_insert: ignored; the libraries are re-derived
            from ``sample_config`` via ``common._sort_libraries_by_insert``.

    Returns:
        The ``sample_config`` object that was passed in.
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "soapdenovo"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    # Not used below; the lookup is kept so that a missing "options" entry
    # still fails early with KeyError.
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)
    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    try:
        kmer = sample_config.get("kmer", 54)
        threads = sample_config.get("threads", 8)  # 8: default for UPPMAX

        with open("configuration.txt", "w") as soap_config_file:
            #TODO make this a parameter in the options
            soap_config_file.write("max_rd_len=150\n")
            rank = 1
            for library, libraryInfo in sorted_libraries_by_insert:
                read1 = libraryInfo["pair1"]
                read2 = libraryInfo["pair2"]
                orientation = libraryInfo["orientation"]
                insert = libraryInfo["insert"]
                soap_config_file.write("[LIB]\n")
                soap_config_file.write("avg_ins={}\n".format(insert))
                soap_config_file.write("rank={}\n".format(rank))
                rank += 1
                soap_config_file.write("map_len=30\n")
                if orientation == "innie" or orientation == "none":
                    soap_config_file.write("asm_flags=3\n")
                    soap_config_file.write("pair_num_cutoff=3\n")
                    soap_config_file.write("reverse_seq=0\n")
                    if read2 is None:
                        soap_config_file.write("q={}\n".format(read1))
                    else:
                        soap_config_file.write("q1={}\n".format(read1))
                        soap_config_file.write("q2={}\n".format(read2))
                elif orientation == "outtie":
                    soap_config_file.write("asm_flags=2\n")
                    soap_config_file.write("pair_num_cutoff=5\n")
                    soap_config_file.write("reverse_seq=1\n")
                    soap_config_file.write("q1={}\n".format(read1))
                    soap_config_file.write("q2={}\n".format(read2))

        runDirectory = os.path.join(assemblyDirectory, "runSOAP")
        with open("soap.stdOut", "w") as assembler_stdOut, \
                open("soap.stdErr", "w") as assembler_stdErr:
            os.makedirs(runDirectory)
            os.chdir(runDirectory)
            #TODO : lots of missing options
            command = [programBIN, "all", "-s",
                       os.path.join(assemblyDirectory, "configuration.txt"),
                       "-K", "{}".format(kmer), "-L", "500",
                       "-o", "soapAssembly", "-p", "{}".format(threads)]
            common.print_command(command)
            if common.check_dryrun(sample_config):
                return sample_config
            # Keep the exit status: it used to be discarded, which made the
            # error branch below unreachable.
            returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                          stderr=assembler_stdErr)
        os.chdir(assemblyDirectory)

        if returnValue != 0:
            print("SOAPdenovo terminated with an error. Please check running",
                    "folder for more informations")
            return sample_config

        flags = sample_config.get("flags", [])
        if os.path.exists(os.path.join("runSOAP", "soapAssembly.scafSeq")):
            subprocess.call(["cp",
                             os.path.join("runSOAP", "soapAssembly.scafSeq"),
                             "{}.scf.fasta".format(outputName)])
            subprocess.call(["cp",
                             os.path.join("runSOAP", "soapAssembly.contig"),
                             "{}.ctg.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runSOAP"])
        else:
            print("something wrong with SOAPdenovo -> no contig file generated")
    finally:
        # leave the caller in the directory it started from on every path
        os.chdir(currentDirectory)
    return sample_config
Example #29
0
def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert):
    program        = global_config["Tools"]["trimmomatic"]["bin"]
    program_folder = os.path.dirname(program)
    if "adapters" not in sample_config:
        sys.exit("running MP pipeline, adapters file to be used in trimming"
                "are needed for Trimmomatic. Please specify them"
                "in the sample configuration file and rerun")
    adapterFile    = sample_config["adapters"]
    if not os.path.exists(adapterFile):
        sys.exit("Trimmomatic cannot be run as adapter file is not specified"
                "or points to unknown position: {}".format(adapterFile))

    mainDirectory = os.getcwd()
    trimmomaticDir = os.path.join(mainDirectory, "Trimmomatic")
    if not os.path.exists(trimmomaticDir):
        os.makedirs(trimmomaticDir)
    os.chdir(trimmomaticDir)
    #now I am in running dir, I need to process one by one the libraries
    threads = 8
    if "threads" in sample_config:
        threads = sample_config["threads"]

    for library, libraryInfo in sorted_libraries_by_insert:
        read1=libraryInfo["pair1"]
        read2=libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        if read2 is not None:
            read1_baseName = os.path.split(read1)[1].split(".")[0]
            read2_baseName = os.path.split(read2)[1].split(".")[0]
            output_read1_pair = os.path.join(trimmomaticDir,
                    "{}.fastq.gz".format(read1_baseName))
            output_read1_sing = os.path.join(trimmomaticDir,
                    "{}_u.fastq.gz".format(read1_baseName))
            output_read2_pair = os.path.join(trimmomaticDir,
                    "{}.fastq.gz".format(read2_baseName))
            output_read2_sing = os.path.join(trimmomaticDir,
                    "{}_u.fastq.gz".format(read2_baseName))
            command = ["java",  "-jar", program, "PE", "-threads",
                    "{}".format(threads),  "-phred33",  read1, read2,
                    output_read1_pair, output_read1_sing, output_read2_pair,
                    output_read2_sing,
                    "ILLUMINACLIP:{}:2:30:10".format(adapterFile),
                    "LEADING:3", "TRAILING:3", "SLIDINGWINDOW:4:15",
                    "MINLEN:30"]
            common.print_command(command)
            sample_config["commands"] += "\n" + common.get_command_str(command)

            # do not execute is files have been already gennerated
            if not common.check_dryrun(sample_config) and not \
                    os.path.exists(output_read1_pair):
                stdOut = open("{}_trimmomatic.stdOut".format(read1_baseName),
                        "w")
                stdErr = open("{}_trimmomatic.stdErr".format(read1_baseName),
                        "w")
                returnValue = subprocess.call(command, stdout=stdOut,
                        stderr=stdErr) # run the program
                if returnValue != 0:
                    print("error while running command: {}".format(command))
            libraryInfo["pair1"] = output_read1_pair
            libraryInfo["pair2"] = output_read2_pair
            libraryInfo["trimmomatic"] = os.path.join(trimmomaticDir,
                    "{}_trimmomatic.stdErr".format(read1_baseName))
    os.chdir(mainDirectory)
    return sample_config
Example #30
0
def _run_spades(global_config, sample_config, sorted_libraries_by_insert):
    """Build a SPAdes command line from the sample libraries and run it.

    A ``spades`` folder is prepared under the current working directory and
    the command is executed there through the shell, writing its output to
    ``spades.stdOut`` / ``spades.stdErr`` (opened in append mode).  On
    success ``contigs.fasta`` and ``scaffolds.fasta`` are copied out of the
    SPAdes output folder as ``<output>.ctg.fasta`` and
    ``<output>.scf.fasta``.  On a dry run the command is only printed.

    The working directory that was current on entry is restored on every
    exit path once the assembly folder has been entered.

    Args:
        global_config: dict; ``global_config["Tools"]["spades"]`` must
            provide ``bin`` and ``options`` (an iterable of option strings).
        sample_config: dict; ``output`` is read and ``flags`` is honoured
            when present.
        sorted_libraries_by_insert: ignored; the libraries are re-derived
            from ``sample_config`` via ``common._sort_libraries_by_insert``.

    Returns:
        The ``sample_config`` object that was passed in.
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "spades"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)
    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    try:
        #creates the command on-the-fly
        parts = ["{}".format(programBIN)]
        parts.extend("{}".format(option) for option in program_options)
        peLibrary = 1
        mpLibrary = 1
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            if orientation == "innie" or orientation == "none":
                if read2 is None:
                    parts.append("--pe{}-s {}".format(peLibrary, read1))
                else:
                    parts.append("--pe{}-1 {} --pe{}-2 {}".format(
                        peLibrary, read1, peLibrary, read2))
                peLibrary += 1
            elif orientation == "outtie":
                parts.append("--mp{}-1 {} --mp{}-2 {}".format(
                    mpLibrary, read1, mpLibrary, read2))
                mpLibrary += 1
            else:
                # format the whole message: format() used to be applied to
                # the second print argument only, leaving "{}" unfilled
                print("orientation {} not supported.... why the program did "
                      "not failed earlier?".format(orientation))
        parts.append("-o {}".format(outputName))
        # every fragment, including the last, is followed by one space
        command = " ".join(parts) + " "

        common.print_command(command)
        if common.check_dryrun(sample_config):
            return sample_config

        # NOTE(review): the command is a single string run with shell=True,
        # so paths containing spaces or shell metacharacters are not safe.
        with open("spades.stdOut", "a") as assembler_stdOut, \
                open("spades.stdErr", "a") as assembler_stdErr:
            returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                          stderr=assembler_stdErr, shell=True)

        flags = sample_config.get("flags", [])
        if returnValue == 0:
            if os.path.exists(os.path.join(outputName, "contigs.fasta")):
                subprocess.call(["cp",
                                 os.path.join(outputName, "contigs.fasta"),
                                 "{}.ctg.fasta".format(outputName)])
                subprocess.call(["cp",
                                 os.path.join(outputName, "scaffolds.fasta"),
                                 "{}.scf.fasta".format(outputName)])
                if "keep_tmp_files" not in flags:
                    subprocess.call(["rm", "-r", outputName])
            else:
                print("something wrong with SPADES -> no contig file generated")
        else:
            print("SPADES terminated with an error. Please check running folder",
                    "for more informations")
    finally:
        # also covers the dry-run return, which used to leave the process
        # inside the assembly folder
        os.chdir(currentDirectory)
    return sample_config
Example #31
0
def _allpaths_read_pattern(fqfile):
    """Return the wildcard file name covering both mates of a read-1 file.

    ``_1.fastq`` becomes ``_?.fastq`` and ``_R1_`` becomes ``_R?_``. Returns
    None when the name follows neither convention.
    """
    if "_1.fastq" in fqfile:
        return fqfile.replace("_1.fastq", "_?.fastq")
    if "_R1_" in fqfile:
        return fqfile.replace("_R1_", "_R?_")
    return None


def _allpaths_write_input_csvs(sorted_libraries_by_insert):
    """Write in_groups.csv and in_libs.csv into the current directory.

    One in_groups.csv row is written per "innie" (PE) or "outtie" (MP)
    library; libraries with any other orientation are reported and skipped.
    in_libs.csv gets one row per distinct insert size (first library wins).

    Returns True on success, False if a read file name does not follow a
    supported naming convention (both files are closed in either case).
    """
    lib_lines = []
    seen_inserts = set()
    group_name = 1  # shared counter for PE and MP groups
    with open("in_groups.csv", "w") as inGroups_file, \
            open("in_libs.csv", "w") as inLibs_file:
        inGroups_file.write("group_name, library_name, file_name\n")
        inLibs_file.write("library_name, project_name, organism_name, type, "
                "paired, frag_size, frag_stddev, insert_size, insert_stddev, "
                "read_orientation,genomic_start, genomic_end\n")
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            orientation = libraryInfo["orientation"]
            insert = libraryInfo["insert"]
            std = libraryInfo["std"]
            if orientation == "innie":
                group_prefix = "PE"
                lib_line = ("lib{}, genome, genome, fragment, 1, "
                        "{}, {}, , , inward, 0, 0\n".format(insert, insert, std))
            elif orientation == "outtie":
                group_prefix = "MP"
                # NOTE(review): the type column is still "fragment" for
                # outward-facing libraries, as in the original -- confirm
                # against the ALLPATHS-LG manual whether "jumping" is expected.
                lib_line = ("lib{}, genome, genome, fragment, 1, "
                        ", , {}, {}, outward, 0, 0\n".format(insert, insert, std))
            else:
                print("all paths support only innies and outties")
                continue
            path, fqfile = os.path.split(read1)
            pattern = _allpaths_read_pattern(fqfile)
            if pattern is None:
                print("error file format not supported {}".format(fqfile))
                return False
            inGroups_file.write("{}{}, lib{}, {}\n".format(group_prefix,
                group_name, insert, os.path.join(path, pattern)))
            group_name += 1
            if insert not in seen_inserts:
                seen_inserts.add(insert)
                lib_lines.append(lib_line)
        inLibs_file.writelines(lib_lines)
    return True


def _run_allpaths(global_config, sample_config, sorted_libraries_by_insert):
    """Run ALLPATHS-LG (PrepareAllPathsInputs.pl, then RunAllPathsLG).

    Works inside an ``allpaths`` sub-folder of the current directory and
    always restores the original working directory before returning, also on
    errors. On success the final contigs and scaffolds are copied to
    ``<output>.ctg.fasta`` and ``<output>.scf.fasta`` in that sub-folder, and
    ``data_dir`` is removed unless the ``keep_tmp_files`` flag is set.

    Args:
        global_config: dict; reads ``Tools/allpaths/{bin,options}`` and
            ``Tools/picard/bin``. The only accepted option is ``PLOIDY=2``.
        sample_config: dict; reads ``output`` and the optional ``flags``.
        sorted_libraries_by_insert: ignored; the libraries are recomputed
            from ``sample_config``.

    Returns:
        ``sample_config``, unchanged. Failures are reported with ``print``
        only.
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "allpaths"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # for allpaths "bin" is the directory holding the ALLPATHS executables
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
    if _prepare_folder_structure("allpaths", assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)
    try:
        if not _allpaths_write_input_csvs(sorted_libraries_by_insert):
            return sample_config
        #NOW RUN ALLPATHS FOR REAL
        os.mkdir("data_dir")
        data_dir = os.path.join(assemblyDirectory, "data_dir")
        ploidy = "PLOIDY=1"
        if program_options:
            if len(program_options) > 1 or program_options[0] != "PLOIDY=2":
                print("Running ALlpaths only one parameter accepted as option",
                        "here: PLOIDY=2")
                return sample_config
            ploidy = "PLOIDY=2"

        prepare_command = [
            os.path.join(programBIN, "PrepareAllPathsInputs.pl"),
            "DATA_DIR={}".format(data_dir), ploidy,
            "PICARD_TOOLS_DIR={}".format(
                global_config["Tools"]["picard"]["bin"]),
            "FORCE_PHRED=True", "PHRED_64=False",
            "IN_GROUPS_CSV={}".format(
                os.path.join(assemblyDirectory, "in_groups.csv")),
            "IN_LIBS_CSV={}".format(
                os.path.join(assemblyDirectory, "in_libs.csv"))]
        # Built once so that the dry-run prints exactly what a real run executes.
        run_command = [
            os.path.join(programBIN, "RunAllPathsLG"),
            "PRE={}".format(assemblyDirectory), "REFERENCE_NAME=.",
            "DATA_SUBDIR=data_dir", "RUN=allpaths", "SUBDIR=run",
            "HAPLOIDIFY=True"]

        common.print_command(prepare_command)
        if common.check_dryrun(sample_config):
            common.print_command(run_command)
            return sample_config

        with open("allpaths_PrepareAllPathsInputs.stdOut", "w") as std_out, \
                open("allpaths_PrepareAllPathsInputs.stdErr", "w") as std_err:
            returnValue = subprocess.call(prepare_command, stdout=std_out,
                    stderr=std_err)
        if returnValue != 0:
            print("ALLPATHS PrepareAllPathInputs terminated with an error. "
                    "Please check running folder for more informations")
            return sample_config

        common.print_command(run_command)
        with open("allpaths_RunAllPathsLG.stdOut", "w") as std_out, \
                open("allpaths_RunAllPathsLG.stdErr", "w") as std_err:
            returnValue = subprocess.call(run_command, stdout=std_out,
                    stderr=std_err)
        if returnValue != 0:
            print("ALLPATHS RunAllPathsLG terminated with an error. Please",
                    "check running folder for more informations")
            return sample_config

        # save results
        assembly_dir = os.path.join("data_dir", "allpaths", "ASSEMBLIES",
                "run")
        if not os.path.exists(os.path.join(assembly_dir,
                "final.assembly.fasta")):
            print("something wrong with Allpaths > no contig file generated")
            return sample_config
        copy_codes = [
            subprocess.call(["cp", os.path.join(assembly_dir,
                "final.contigs.fasta"), "{}.ctg.fasta".format(outputName)]),
            subprocess.call(["cp", os.path.join(assembly_dir,
                "final.assembly.fasta"), "{}.scf.fasta".format(outputName)])]
        flags = sample_config.get("flags", [])
        # Only discard the working data once both results are safely copied.
        if "keep_tmp_files" not in flags and all(c == 0 for c in copy_codes):
            subprocess.call(["rm", "-r", "data_dir"])
    finally:
        # Every exit path (including exceptions) leaves the assembly folder.
        os.chdir(currentDirectory)
    return sample_config
Example #32
0
def _run_soapdenovo(global_config, sample_config, sorted_libraries_by_insert):
    """Run ``SOAPdenovo all`` on the sample's libraries.

    Works inside a ``soapdenovo`` sub-folder of the current directory: writes
    ``configuration.txt`` there, runs the assembler in ``runSOAP`` and, on
    success, copies the scaffolds and contigs to ``<output>.scf.fasta`` and
    ``<output>.ctg.fasta``. ``runSOAP`` is removed unless the
    ``keep_tmp_files`` flag is set. The original working directory is always
    restored before returning, also on errors.

    Args:
        global_config: dict; reads ``Tools/soapdenovo/{bin,options}`` (the
            options are currently not passed to the program).
        sample_config: dict; reads ``output`` and the optional ``kmer``
            (default 54), ``threads`` (default 8) and ``flags``.
        sorted_libraries_by_insert: ignored; the libraries are recomputed
            from ``sample_config``.

    Returns:
        ``sample_config``, unchanged. Failures are reported with ``print``
        only.
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "soapdenovo"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # for soapdenovo "bin" is the executable itself
    programBIN = global_config["Tools"][assembler]["bin"]
    #TODO : program_options is read but not yet forwarded to SOAPdenovo
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)
    try:
        ########### HERE IT START THE SPECIFIC ASSEMBLER PART
        kmer = sample_config["kmer"] if "kmer" in sample_config else 54
        # 8 threads is the default for UPPMAX
        thread_count = (sample_config["threads"]
                        if "threads" in sample_config else 8)
        with open("configuration.txt", "w") as soap_config_file:
            #TODO make this a parameter in the options
            soap_config_file.write("max_rd_len=150\n")
            for rank, (library, libraryInfo) in enumerate(
                    sorted_libraries_by_insert, 1):
                read1 = libraryInfo["pair1"]
                read2 = libraryInfo["pair2"]
                orientation = libraryInfo["orientation"]
                insert = libraryInfo["insert"]
                soap_config_file.write("[LIB]\n")
                soap_config_file.write("avg_ins={}\n".format(insert))
                soap_config_file.write("rank={}\n".format(rank))
                soap_config_file.write("map_len=30\n")
                if orientation == "innie" or orientation == "none":
                    soap_config_file.write("asm_flags=3\n")
                    soap_config_file.write("pair_num_cutoff=3\n")
                    soap_config_file.write("reverse_seq=0\n")
                    if read2 is None:
                        soap_config_file.write("q={}\n".format(read1))
                    else:
                        soap_config_file.write("q1={}\n".format(read1))
                        soap_config_file.write("q2={}\n".format(read2))
                elif orientation == "outtie":
                    soap_config_file.write("asm_flags=2\n")
                    soap_config_file.write("pair_num_cutoff=5\n")
                    soap_config_file.write("reverse_seq=1\n")
                    soap_config_file.write("q1={}\n".format(read1))
                    soap_config_file.write("q2={}\n".format(read2))

        os.makedirs(os.path.join(assemblyDirectory, "runSOAP"))
        #TODO : lots of missing options
        command = [
            programBIN, "all", "-s",
            "{}".format(os.path.join(assemblyDirectory, "configuration.txt")),
            "-K", "{}".format(kmer), "-L", "500", "-o", "soapAssembly",
            "-p", "{}".format(thread_count)
        ]
        common.print_command(command)
        if common.check_dryrun(sample_config):
            return sample_config

        # Logs live in the assembly folder; the program itself runs in runSOAP.
        with open("soap.stdOut", "w") as assembler_stdOut, \
                open("soap.stdErr", "w") as assembler_stdErr:
            os.chdir("runSOAP")
            try:
                # Keep the exit status: it decides whether results are saved.
                returnValue = subprocess.call(command,
                                              stdout=assembler_stdOut,
                                              stderr=assembler_stdErr)
            finally:
                os.chdir(assemblyDirectory)

        if returnValue != 0:
            print("SOAPdenovo terminated with an error. Please check running",
                  "folder for more informations")
            return sample_config

        scaffolds = os.path.join("runSOAP", "soapAssembly.scafSeq")
        if not os.path.exists(scaffolds):
            print(
                "something wrong with SOAPdenovo -> no contig file generated")
            return sample_config
        subprocess.call(["cp", scaffolds, "{}.scf.fasta".format(outputName)])
        subprocess.call([
            "cp",
            os.path.join("runSOAP", "soapAssembly.contig"),
            "{}.ctg.fasta".format(outputName)
        ])
        flags = sample_config.get("flags", [])
        if "keep_tmp_files" not in flags:
            subprocess.call(["rm", "-r", "runSOAP"])
    finally:
        # Every exit path (including exceptions) leaves the assembly folder.
        os.chdir(currentDirectory)
    return sample_config