Beispiel #1
0
def _run_qaTools(global_config, sample_config, sorted_libraries_by_insert):
    """Run qaTools on the first alignment and append a GC column to its table.

    Work happens inside a ``QAstats`` sub-folder of the current directory
    (created on demand); the starting directory is always restored, even
    when an error is raised.  The command is always printed through
    ``common.print_command``.  It is executed only when
    ``common.check_dryrun(sample_config)`` is false and the
    ``<BAM basename>.cov`` file does not exist yet.

    Args:
        global_config: dict; ``["Tools"]["qaTools"]["bin"]`` is the
            executable to run.
        sample_config: dict; ``"reference"`` (path of a FASTA file) and
            ``"alignments"`` are read.  Only ``alignments[0][1]`` (the BAM
            path of the first entry) is used.
        sorted_libraries_by_insert: not used by this function.

    Returns:
        ``sample_config``, unmodified.

    Raises:
        SystemExit: if qaTools returns a non-zero status, or if a contig
            found in the reference is missing from the coverage table.
    """
    mainDir = os.getcwd()
    qaToolsFolder = os.path.join(mainDir, "QAstats")
    if not os.path.exists(qaToolsFolder):
        os.makedirs(qaToolsFolder)
    os.chdir(qaToolsFolder)
    try:
        program = global_config["Tools"]["qaTools"]["bin"]
        reference = sample_config["reference"]
        BAMfile = sample_config["alignments"][0][1]
        cov_file = "{}.cov".format(os.path.basename(BAMfile))

        command = ["{}".format(program), "-m", "-q", "0", "-i", BAMfile,
                   cov_file]
        common.print_command(command)
        if common.check_dryrun(sample_config) or os.path.exists(cov_file):
            return sample_config

        # Context managers make sure the log files are flushed and closed.
        with open("QAtools.stdOut", "a") as stdOut, \
                open("QAtools.stdErr", "a") as stdErr:
            returnValue = subprocess.call(command, stdout=stdOut,
                                          stderr=stdErr)
        if returnValue != 0:
            sys.exit("error, while running QAtools: {}".format(command))

        # Load the coverage table: first column is the contig name, the
        # next three columns are copied verbatim into the new table.
        QAtools_dict = {}
        with open(cov_file, "r") as QA_csv:
            header = QA_csv.readline().rstrip()
            for line in QA_csv:
                fields = line.strip().split("\t")
                if fields == [""]:
                    continue  # tolerate blank lines in the table
                QAtools_dict[fields[0]] = [fields[1], fields[2], fields[3]]

        def contig_name(header_line):
            # ">name description" -> "name" (cut at first blank or tab)
            return header_line.split(" ")[0].split("\t")[0].split(">")[1]

        QA_GC_file = "{}.gc".format(cov_file)
        with open(QA_GC_file, "w") as QA_GC_fd:

            def write_row(name, sequence):
                # One output row: the three coverage columns plus GC.
                GC = computeGC(sequence)
                if name not in QAtools_dict:
                    sys.exit("error while parsing QAcompute output: probably some wired contig name is present in your assmebly file")
                stats = QAtools_dict[name]
                QA_GC_fd.write("{}\t{}\t{}\t{}\t{}\n".format(
                    name, stats[0], stats[1], stats[2], GC))

            QA_GC_fd.write("{}\tGCperc\n".format(header))
            with open(reference, "r") as ref_fd:
                fasta_header = contig_name(ref_fd.readline().strip())
                chunks = []  # sequence lines of the current contig
                for line in ref_fd:
                    line = line.strip()
                    if line.startswith(">"):
                        # A new record starts: flush the previous contig.
                        write_row(fasta_header, "".join(chunks))
                        chunks = []
                        fasta_header = contig_name(line)
                    else:
                        chunks.append(line)
                # Flush the last contig of the file.
                write_row(fasta_header, "".join(chunks))
        plotQA(QA_GC_file)
    finally:
        os.chdir(mainDir)
    return sample_config
Beispiel #2
0
def _run_fastqc(global_config, sample_config, sorted_libraries_by_insert):
    """Run FastQC once per library and record the output folder.

    A ``fastqc`` folder is created in the current directory if missing.
    For every ``(library, libraryInfo)`` pair the command
    ``[bin] + options + [pair1] (+ [pair2] when it is not None)`` is printed
    through ``common.print_command``.  It is executed unless
    ``common.check_dryrun(sample_config)`` is true or the expected
    ``*_fastqc.zip`` already exists in the ``fastqc`` folder.

    Args:
        global_config: dict; ``["Tools"]["fastqc"]["bin"]`` and
            ``["Tools"]["fastqc"]["options"]`` are read.
        sample_config: dict; receives the key ``"fastqc"``.
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            pairs; ``libraryInfo["pair1"]`` and ``libraryInfo["pair2"]``
            are read.

    Returns:
        ``sample_config`` with ``"fastqc"`` set to the folder path.
    """
    FastqcFolder = os.path.join(os.getcwd(), "fastqc")
    if not os.path.exists(FastqcFolder):
        os.makedirs(FastqcFolder)

    program = global_config["Tools"]["fastqc"]["bin"]
    program_options = global_config["Tools"]["fastqc"]["options"]
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        command = [program] + list(program_options) + [read1]
        if read2 is not None:
            command.append(read2)
        common.print_command(command)

        # NOTE(review): the report is looked up inside the fastqc folder;
        # this only matches if the configured options send FastQC output
        # there -- confirm against the tool configuration.
        report_zip = "{}_fastqc.zip".format(os.path.join(
            FastqcFolder, os.path.basename(read1).split(".fastq.gz")[0]))
        if common.check_dryrun(sample_config) or os.path.exists(report_zip):
            continue

        stdout_path = os.path.join(FastqcFolder,
                                   "{}_fastqc.stdout".format(library))
        stderr_path = os.path.join(FastqcFolder,
                                   "{}_fastqc.stderr".format(library))
        # Context managers close the log files after every library.
        with open(stdout_path, "a") as fastq_stdOut, \
                open(stderr_path, "a") as fastq_stdErr:
            returnValue = subprocess.call(command, stdout=fastq_stdOut,
                                          stderr=fastq_stdErr)
        if returnValue != 0:
            # Best effort: report the failure and go on with the next one.
            print("error while running command: {}".format(command))
    sample_config["fastqc"] = FastqcFolder
    return sample_config
Beispiel #3
0
def _run_fastqc(global_config, sample_config, sorted_libraries_by_insert):
    """Run FastQC once per library and record the output folder.

    A ``fastqc`` folder is created in the current directory if missing.
    For every ``(library, libraryInfo)`` pair the command
    ``[bin] + options + [pair1] (+ [pair2] when it is not None)`` is printed
    through ``common.print_command``.  It is executed unless
    ``common.check_dryrun(sample_config)`` is true or the expected
    ``*_fastqc.zip`` already exists in the ``fastqc`` folder.

    Args:
        global_config: dict; ``["Tools"]["fastqc"]["bin"]`` and
            ``["Tools"]["fastqc"]["options"]`` are read.
        sample_config: dict; receives the key ``"fastqc"``.
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            pairs; ``libraryInfo["pair1"]`` and ``libraryInfo["pair2"]``
            are read.

    Returns:
        ``sample_config`` with ``"fastqc"`` set to the folder path.
    """
    FastqcFolder = os.path.join(os.getcwd(), "fastqc")
    if not os.path.exists(FastqcFolder):
        os.makedirs(FastqcFolder)

    program = global_config["Tools"]["fastqc"]["bin"]
    program_options = global_config["Tools"]["fastqc"]["options"]
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        command = [program] + list(program_options) + [read1]
        if read2 is not None:
            command.append(read2)
        common.print_command(command)

        # NOTE(review): the report is looked up inside the fastqc folder;
        # this only matches if the configured options send FastQC output
        # there -- confirm against the tool configuration.
        folder_output_name = os.path.join(
            FastqcFolder,
            os.path.basename(read1).split(".fastq.gz")[0])
        report_zip = "{}_fastqc.zip".format(folder_output_name)
        if common.check_dryrun(sample_config) or os.path.exists(report_zip):
            continue

        stdout_path = os.path.join(FastqcFolder,
                                   "{}_fastqc.stdout".format(library))
        stderr_path = os.path.join(FastqcFolder,
                                   "{}_fastqc.stderr".format(library))
        # Context managers close the log files after every library.
        with open(stdout_path, "a") as fastq_stdOut, \
                open(stderr_path, "a") as fastq_stdErr:
            returnValue = subprocess.call(command,
                                          stdout=fastq_stdOut,
                                          stderr=fastq_stdErr)
        if returnValue != 0:
            # Best effort: report the failure and go on with the next one.
            print("error while running command: {}".format(command))
    sample_config["fastqc"] = FastqcFolder
    return sample_config
Beispiel #4
0
def _run_FRC(global_config, sample_config, sorted_libraries_by_insert):
    """Run the FRC tool on the alignments and plot the resulting curve.

    Work happens inside a ``FRCurve`` sub-folder of the current directory
    (created on demand); the starting directory is always restored.  The
    command is printed through ``common.print_command`` and executed only
    when ``common.check_dryrun(sample_config)`` is false and
    ``<output>_FRC.png`` does not exist yet.

    Args:
        global_config: dict; ``["Tools"]["FRC"]["bin"]`` is the executable.
        sample_config: dict; ``"genomeSize"``, ``"output"`` and
            ``"alignments"`` are read.  Each alignment entry is indexed as
            ``[0]`` = insert size and ``[1]`` = BAM path; entry 0 is passed
            as the ``--pe-*`` library and entry 1, when present, as the
            ``--mp-*`` library.
        sorted_libraries_by_insert: not used by this function.

    Returns:
        ``sample_config``, unmodified.

    Raises:
        SystemExit: if the tool returns a non-zero status.
    """
    mainDir = os.getcwd()
    FRCurveFolder = os.path.join(mainDir, "FRCurve")
    if not os.path.exists(FRCurveFolder):
        os.makedirs(FRCurveFolder)
    os.chdir(FRCurveFolder)
    try:
        program = global_config["Tools"]["FRC"]["bin"]
        genomeSize = sample_config["genomeSize"]
        output = sample_config["output"]
        alignments = sample_config["alignments"]

        # First library: accepted insert range is insert size +/- 60%.
        peInsert = alignments[0][0]
        peBam = alignments[0][1]
        peMinInsert = int(peInsert - peInsert * 0.60)
        peMaxInsert = int(peInsert + peInsert * 0.60)
        command = [
            program,
            "--pe-sam", peBam,
            "--pe-min-insert", "{}".format(peMinInsert),
            "--pe-max-insert", "{}".format(peMaxInsert),
            "--CEstats-PE-min", "-4",
            "--CEstats-PE-max", "4",
        ]
        if len(alignments) > 1:
            # Second library: accepted insert range is insert size +/- 50%.
            mpInsert = alignments[1][0]
            mpBam = alignments[1][1]
            mpMinInsert = int(mpInsert - mpInsert * 0.50)
            mpMaxInsert = int(mpInsert + mpInsert * 0.50)
            command += [
                "--mp-sam", mpBam,
                "--mp-min-insert", "{}".format(mpMinInsert),
                "--mp-max-insert", "{}".format(mpMaxInsert),
            ]
        command += ["--genome-size", "{}".format(genomeSize),
                    "--output", output]
        common.print_command(command)

        already_done = os.path.exists("{}_FRC.png".format(output))
        if not common.check_dryrun(sample_config) and not already_done:
            # Context managers close the log files once the tool is done.
            with open("FRC.stdOut", "a") as stdOut, \
                    open("FRC.stdErr", "a") as stdErr:
                returnValue = subprocess.call(command, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                sys.exit("error, while running FRCurve: {}".format(command))
            plotFRCurve(output)
    finally:
        os.chdir(mainDir)
    return sample_config
Beispiel #5
0
def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
    """Run ABYSS-P through mpirun to get a k-mer histogram, then plot it.

    Work happens inside an ``abyss_kmer`` sub-folder of the current
    directory (created on demand); the starting directory is always
    restored.  The command is printed through ``common.print_command`` and
    executed only when ``common.check_dryrun(sample_config)`` is false and
    ``histogram.hist`` does not exist yet.  A failing run is reported with
    a message and does not abort.

    Args:
        global_config: dict; the ``ABYSS-P`` executable is looked up in the
            directory of ``["Tools"]["abyss"]["bin"]``.
        sample_config: dict; ``"kmer"`` is mandatory, ``"threads"`` is
            optional (default 16).  The key ``"abyss"`` is overwritten with
            the working folder path.
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            pairs; ``"pair1"``, ``"pair2"`` and ``"orientation"`` are read.

    Returns:
        ``sample_config`` with ``"abyss"`` set to the working folder.

    Raises:
        SystemExit: if ``"kmer"`` is missing from ``sample_config``.
    """
    mainDir = os.getcwd()
    ABySS_Kmer_Folder = os.path.join(mainDir, "abyss_kmer")
    if "kmer" not in sample_config:
        sys.exit(
            "error in _run_abyss QCcontrol: kmer must be present in sample_config.yaml"
        )

    kmer = sample_config["kmer"]
    if not os.path.exists(ABySS_Kmer_Folder):
        os.makedirs(ABySS_Kmer_Folder)

    os.chdir(ABySS_Kmer_Folder)
    try:
        program = os.path.join(
            os.path.dirname(global_config["Tools"]["abyss"]["bin"]),
            "ABYSS-P")

        threads = 16  # default for UPPMAX
        if "threads" in sample_config:
            threads = sample_config["threads"]

        parts = [
            "mpirun -np {} {}".format(threads, program),
            "-k {}".format(kmer),
            "--coverage-hist=histogram.hist -o preUnitgs.fa",
        ]
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            if orientation in ("innie", "outtie"):
                parts.append("{}".format(read1))
                if read2 is not None:
                    parts.append("{}".format(read2))
            elif orientation == "none":
                parts.append("{}".format(read1))
        # NOTE(review): the command is a single string run through the
        # shell, so paths containing blanks or shell metacharacters are
        # not protected.
        command = " ".join(parts)

        common.print_command(command)
        if not common.check_dryrun(sample_config) and not os.path.exists(
                "histogram.hist"):
            # Context managers close the log files once the run is over.
            with open("ABySS_Kmer_Folder.stdOut", "a") as ABySS_Kmer_stdOut, \
                    open("ABySS_Kmer_Folder.stdErr", "a") as ABySS_Kmer_stdErr:
                returnValue = subprocess.call(command,
                                              shell=True,
                                              stdout=ABySS_Kmer_stdOut,
                                              stderr=ABySS_Kmer_stdErr)
            # Any non-zero status is a failure, including the negative
            # values reported when the process is killed by a signal.
            if returnValue != 0:
                print("ABySS kmer plotting failed: unkwnown reason")
            else:
                # Only the histogram is needed; drop the assembly output.
                if os.path.exists("preUnitgs.fa"):
                    os.remove("preUnitgs.fa")
                _plotKmerPlot(1, 200, kmer, "kmer_coverage_1_200.png")
                _plotKmerPlot(1, 500, kmer, "kmer_coverage_1_500.png")
                _plotKmerPlot(15, 200, kmer, "kmer_coverage_15_200.png")
                _plotKmerPlot(15, 500, kmer, "kmer_coverage_15_500.png")
    finally:
        os.chdir(mainDir)

    sample_config["abyss"] = ABySS_Kmer_Folder
    return sample_config
Beispiel #6
0
def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
    """Run ABYSS-P through mpirun to get a k-mer histogram, then plot it.

    Work happens inside an ``abyss_kmer`` sub-folder of the current
    directory (created on demand); the starting directory is always
    restored.  The command is printed through ``common.print_command`` and
    executed only when ``common.check_dryrun(sample_config)`` is false and
    ``histogram.hist`` does not exist yet.  A failing run is reported with
    a message and does not abort.

    Args:
        global_config: dict; the ``ABYSS-P`` executable is looked up in the
            directory of ``["Tools"]["abyss"]["bin"]``.
        sample_config: dict; ``"kmer"`` is mandatory, ``"threads"`` is
            optional (default 16).  The key ``"abyss"`` is overwritten with
            the working folder path.
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            pairs; ``"pair1"``, ``"pair2"`` and ``"orientation"`` are read.

    Returns:
        ``sample_config`` with ``"abyss"`` set to the working folder.

    Raises:
        SystemExit: if ``"kmer"`` is missing from ``sample_config``.
    """
    mainDir = os.getcwd()
    ABySS_Kmer_Folder = os.path.join(mainDir, "abyss_kmer")
    if "kmer" not in sample_config:
        sys.exit("error in _run_abyss QCcontrol: kmer must be present in sample_config.yaml")

    kmer = sample_config["kmer"]
    if not os.path.exists(ABySS_Kmer_Folder):
        os.makedirs(ABySS_Kmer_Folder)

    os.chdir(ABySS_Kmer_Folder)
    try:
        program = os.path.join(os.path.dirname(global_config["Tools"]["abyss"]["bin"]), "ABYSS-P")

        threads = 16 # default for UPPMAX
        if "threads" in sample_config:
            threads = sample_config["threads"]

        parts = [
            "mpirun -np {} {}".format(threads, program),
            "-k {}".format(kmer),
            "--coverage-hist=histogram.hist -o preUnitgs.fa",
        ]
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            if orientation in ("innie", "outtie"):
                parts.append("{}".format(read1))
                if read2 is not None:
                    parts.append("{}".format(read2))
            elif orientation == "none":
                parts.append("{}".format(read1))
        # NOTE(review): the command is a single string run through the
        # shell, so paths containing blanks or shell metacharacters are
        # not protected.
        command = " ".join(parts)

        common.print_command(command)
        if not common.check_dryrun(sample_config) and not os.path.exists("histogram.hist"):
            # Context managers close the log files once the run is over.
            with open("ABySS_Kmer_Folder.stdOut", "a") as ABySS_Kmer_stdOut, \
                    open("ABySS_Kmer_Folder.stdErr", "a") as ABySS_Kmer_stdErr:
                returnValue = subprocess.call(command, shell=True, stdout=ABySS_Kmer_stdOut, stderr=ABySS_Kmer_stdErr)
            # Any non-zero status is a failure, including the negative
            # values reported when the process is killed by a signal.
            if returnValue != 0:
                print("ABySS kmer plotting failed: unkwnown reason")
            else:
                # Only the histogram is needed; drop the assembly output.
                if os.path.exists("preUnitgs.fa"):
                    os.remove("preUnitgs.fa")
                _plotKmerPlot(1, 200, kmer, "kmer_coverage_1_200.png")
                _plotKmerPlot(1, 500, kmer, "kmer_coverage_1_500.png")
                _plotKmerPlot(15, 200, kmer, "kmer_coverage_15_200.png")
                _plotKmerPlot(15, 500, kmer, "kmer_coverage_15_500.png")
    finally:
        os.chdir(mainDir)

    sample_config["abyss"] = ABySS_Kmer_Folder
    return sample_config
Beispiel #7
0
def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert):
    """Trim every paired library with Trimmomatic and repoint it to the output.

    Work happens inside a ``Trimmomatic`` sub-folder of the current
    directory (created on demand); the starting directory is always
    restored.  Libraries whose ``pair2`` is ``None`` are left untouched.
    For the others the command is printed through ``common.print_command``
    and executed unless ``common.check_dryrun(sample_config)`` is true or
    the paired output of read 1 already exists.  A failing run is reported
    with a message and does not abort.

    Args:
        global_config: dict; ``["Tools"]["trimmomatic"]["bin"]`` is the jar
            handed to ``java -jar``.
        sample_config: dict; ``"adapters"`` (path of an existing file) is
            mandatory, ``"threads"`` is optional (default 8).
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            pairs.  For each processed library ``libraryInfo["pair1"]`` and
            ``["pair2"]`` are replaced by the trimmed paired outputs and
            ``["trimmomatic"]`` is set to the path of the stderr log.

    Returns:
        ``sample_config``, unmodified.

    Raises:
        SystemExit: if ``"adapters"`` is missing or does not exist on disk.
    """
    program = global_config["Tools"]["trimmomatic"]["bin"]
    if "adapters" not in sample_config:
        # Adjacent literals instead of a backslash continuation, which
        # leaked the source indentation into the message.
        sys.exit("running MP pipeline, adapters file to be used in trimming "
                 "are needed for Trimmomatic. Please specify them "
                 "in the sample configuration file and rerun")
    adapterFile = sample_config["adapters"]
    if not os.path.exists(adapterFile):
        sys.exit("Trimmomatic cannot be run as adapter file is not specified or points to unknown position: {}".format(adapterFile))

    mainDirectory = os.getcwd()
    trimmomaticDir = os.path.join(mainDirectory, "Trimmomatic")
    if not os.path.exists(trimmomaticDir):
        os.makedirs(trimmomaticDir)
    os.chdir(trimmomaticDir)
    try:
        threads = 8
        if "threads" in sample_config:
            threads = sample_config["threads"]

        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            if read2 is None:
                continue  # only paired libraries are trimmed

            # Output names reuse the input file name up to its first dot.
            read1_baseName = os.path.split(read1)[1].split(".")[0]
            read2_baseName = os.path.split(read2)[1].split(".")[0]
            output_read1_pair = os.path.join(trimmomaticDir, "{}.fastq.gz".format(read1_baseName))
            output_read1_sing = os.path.join(trimmomaticDir, "{}_u.fastq.gz".format(read1_baseName))
            output_read2_pair = os.path.join(trimmomaticDir, "{}.fastq.gz".format(read2_baseName))
            output_read2_sing = os.path.join(trimmomaticDir, "{}_u.fastq.gz".format(read2_baseName))
            stdErr_name = "{}_trimmomatic.stdErr".format(read1_baseName)
            command = [
                "java", "-jar", program, "PE",
                "-threads", "{}".format(threads), "-phred33",
                read1, read2,
                output_read1_pair, output_read1_sing,
                output_read2_pair, output_read2_sing,
                "ILLUMINACLIP:{}:2:30:10".format(adapterFile),
                "LEADING:3", "TRAILING:3", "SLIDINGWINDOW:4:15", "MINLEN:30",
            ]
            common.print_command(command)
            # Do not execute if the files have already been generated.
            if not common.check_dryrun(sample_config) and not os.path.exists(output_read1_pair):
                with open("{}_trimmomatic.stdOut".format(read1_baseName), "w") as stdOut, \
                        open(stdErr_name, "w") as stdErr:
                    returnValue = subprocess.call(command, stdout=stdOut, stderr=stdErr)
                if returnValue != 0:
                    print("error while running command: {}".format(command))
            libraryInfo["pair1"] = output_read1_pair
            libraryInfo["pair2"] = output_read2_pair
            libraryInfo["trimmomatic"] = os.path.join(trimmomaticDir, stdErr_name)
    finally:
        os.chdir(mainDirectory)
    return sample_config
Beispiel #8
0
def _run_FRC(global_config, sample_config, sorted_libraries_by_insert):
    """Run the FRC tool on the alignments and plot the resulting curve.

    Work happens inside a ``FRCurve`` sub-folder of the current directory
    (created on demand); the starting directory is always restored.  The
    command is printed through ``common.print_command`` and executed only
    when ``common.check_dryrun(sample_config)`` is false and
    ``<output>_FRC.png`` does not exist yet.

    Args:
        global_config: dict; ``["Tools"]["FRC"]["bin"]`` is the executable.
        sample_config: dict; ``"genomeSize"``, ``"output"`` and
            ``"alignments"`` are read.  Element ``[1]`` of each alignment
            entry is the BAM path; entry 0 is passed as ``--pe-sam`` and
            entry 1, when present, as ``--mp-sam``.
        sorted_libraries_by_insert: not used by this function.

    Returns:
        ``sample_config``, unmodified.

    Raises:
        SystemExit: if the tool returns a non-zero status.
    """
    mainDir = os.getcwd()
    FRCurveFolder = os.path.join(mainDir, "FRCurve")
    if not os.path.exists(FRCurveFolder):
        os.makedirs(FRCurveFolder)
    os.chdir(FRCurveFolder)
    try:
        program = global_config["Tools"]["FRC"]["bin"]
        genomeSize = sample_config["genomeSize"]
        output = sample_config["output"]
        alignments = sample_config["alignments"]

        # NOTE: the maximum insert sizes are fixed values here; they are
        # not derived from the insert size stored in the alignment entry.
        command = [program, "--pe-sam", alignments[0][1],
                   "--pe-max-insert", "5000"]
        if len(alignments) > 1:
            command += ["--mp-sam", alignments[1][1],
                        "--mp-max-insert", "25000"]
        command += ["--genome-size", "{}".format(genomeSize),
                    "--output", output]
        common.print_command(command)

        already_done = os.path.exists("{}_FRC.png".format(output))
        if not common.check_dryrun(sample_config) and not already_done:
            # Context managers close the log files once the tool is done.
            with open("FRC.stdOut", "a") as stdOut, \
                    open("FRC.stdErr", "a") as stdErr:
                returnValue = subprocess.call(command,
                                              stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                sys.exit("error, while running FRCurve: {}".format(command))
            plotFRCurve(output)
    finally:
        os.chdir(mainDir)
    return sample_config
Beispiel #9
0
def _run_qaTools(global_config, sample_config, sorted_libraries_by_insert):
    """Run qaTools on the first alignment and append a GC column to its table.

    Work happens inside a ``QAstats`` sub-folder of the current directory
    (created on demand); the starting directory is always restored, even
    when an error is raised.  The command is always printed through
    ``common.print_command``.  It is executed only when
    ``common.check_dryrun(sample_config)`` is false and the
    ``<BAM basename>.cov`` file does not exist yet.

    Args:
        global_config: dict; ``["Tools"]["qaTools"]["bin"]`` is the
            executable to run.
        sample_config: dict; ``"reference"`` (path of a FASTA file) and
            ``"alignments"`` are read.  Only ``alignments[0][1]`` (the BAM
            path of the first entry) is used.
        sorted_libraries_by_insert: not used by this function.

    Returns:
        ``sample_config``, unmodified.

    Raises:
        SystemExit: if qaTools returns a non-zero status, or if a contig
            found in the reference is missing from the coverage table.
    """
    mainDir = os.getcwd()
    qaToolsFolder = os.path.join(mainDir, "QAstats")
    if not os.path.exists(qaToolsFolder):
        os.makedirs(qaToolsFolder)
    os.chdir(qaToolsFolder)
    try:
        program = global_config["Tools"]["qaTools"]["bin"]
        reference = sample_config["reference"]
        BAMfile = sample_config["alignments"][0][1]
        cov_file = "{}.cov".format(os.path.basename(BAMfile))

        command = [
            "{}".format(program), "-m", "-q", "0", "-i", BAMfile, cov_file
        ]
        common.print_command(command)
        if common.check_dryrun(sample_config) or os.path.exists(cov_file):
            return sample_config

        # Context managers make sure the log files are flushed and closed.
        with open("QAtools.stdOut", "a") as stdOut, \
                open("QAtools.stdErr", "a") as stdErr:
            returnValue = subprocess.call(command,
                                          stdout=stdOut,
                                          stderr=stdErr)
        if returnValue != 0:
            sys.exit("error, while running QAtools: {}".format(command))

        # Load the coverage table: first column is the contig name, the
        # next three columns are copied verbatim into the new table.
        QAtools_dict = {}
        with open(cov_file, "r") as QA_csv:
            header = QA_csv.readline().rstrip()
            for line in QA_csv:
                fields = line.strip().split("\t")
                if fields == [""]:
                    continue  # tolerate blank lines in the table
                QAtools_dict[fields[0]] = [fields[1], fields[2], fields[3]]

        def contig_name(header_line):
            # ">name description" -> "name" (cut at first blank or tab)
            return header_line.split(" ")[0].split("\t")[0].split(">")[1]

        QA_GC_file = "{}.gc".format(cov_file)
        with open(QA_GC_file, "w") as QA_GC_fd:

            def write_row(name, sequence):
                # One output row: the three coverage columns plus GC.
                GC = computeGC(sequence)
                if name not in QAtools_dict:
                    sys.exit(
                        "error while parsing QAcompute output: probably some wired contig name is present in your assmebly file"
                    )
                stats = QAtools_dict[name]
                QA_GC_fd.write("{}\t{}\t{}\t{}\t{}\n".format(
                    name, stats[0], stats[1], stats[2], GC))

            QA_GC_fd.write("{}\tGCperc\n".format(header))
            with open(reference, "r") as ref_fd:
                fasta_header = contig_name(ref_fd.readline().strip())
                chunks = []  # sequence lines of the current contig
                for line in ref_fd:
                    line = line.strip()
                    if line.startswith(">"):
                        # A new record starts: flush the previous contig.
                        write_row(fasta_header, "".join(chunks))
                        chunks = []
                        fasta_header = contig_name(line)
                    else:
                        chunks.append(line)
                # Flush the last contig of the file.
                write_row(fasta_header, "".join(chunks))
        plotQA(QA_GC_file)
    finally:
        os.chdir(mainDir)
    return sample_config
Beispiel #10
0
def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert):
    """Trim every paired library with Trimmomatic and repoint it to the output.

    Work happens inside a ``Trimmomatic`` sub-folder of the current
    directory (created on demand); the starting directory is always
    restored.  Libraries whose ``pair2`` is ``None`` are left untouched.
    For the others the command is printed through ``common.print_command``
    and executed unless ``common.check_dryrun(sample_config)`` is true or
    the paired output of read 1 already exists.  A failing run is reported
    with a message and does not abort.

    Args:
        global_config: dict; ``["Tools"]["trimmomatic"]["bin"]`` is the jar
            handed to ``java -jar``.
        sample_config: dict; ``"adapters"`` (path of an existing file) is
            mandatory, ``"threads"`` is optional (default 8).
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            pairs.  For each processed library ``libraryInfo["pair1"]`` and
            ``["pair2"]`` are replaced by the trimmed paired outputs and
            ``["trimmomatic"]`` is set to the path of the stderr log.

    Returns:
        ``sample_config``, unmodified.

    Raises:
        SystemExit: if ``"adapters"`` is missing or does not exist on disk.
    """
    program = global_config["Tools"]["trimmomatic"]["bin"]
    if "adapters" not in sample_config:
        # Adjacent literals instead of a backslash continuation, which
        # leaked the source indentation into the message.
        sys.exit(
            "running MP pipeline, adapters file to be used in trimming "
            "are needed for Trimmomatic. Please specify them "
            "in the sample configuration file and rerun")
    adapterFile = sample_config["adapters"]
    if not os.path.exists(adapterFile):
        sys.exit(
            "Trimmomatic cannot be run as adapter file is not specified or points to unknown position: {}"
            .format(adapterFile))

    mainDirectory = os.getcwd()
    trimmomaticDir = os.path.join(mainDirectory, "Trimmomatic")
    if not os.path.exists(trimmomaticDir):
        os.makedirs(trimmomaticDir)
    os.chdir(trimmomaticDir)
    try:
        threads = 8
        if "threads" in sample_config:
            threads = sample_config["threads"]

        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            if read2 is None:
                continue  # only paired libraries are trimmed

            # Output names reuse the input file name up to its first dot.
            read1_baseName = os.path.split(read1)[1].split(".")[0]
            read2_baseName = os.path.split(read2)[1].split(".")[0]
            output_read1_pair = os.path.join(
                trimmomaticDir, "{}.fastq.gz".format(read1_baseName))
            output_read1_sing = os.path.join(
                trimmomaticDir, "{}_u.fastq.gz".format(read1_baseName))
            output_read2_pair = os.path.join(
                trimmomaticDir, "{}.fastq.gz".format(read2_baseName))
            output_read2_sing = os.path.join(
                trimmomaticDir, "{}_u.fastq.gz".format(read2_baseName))
            stdErr_name = "{}_trimmomatic.stdErr".format(read1_baseName)
            command = [
                "java", "-jar", program, "PE", "-threads",
                "{}".format(threads), "-phred33", read1, read2,
                output_read1_pair, output_read1_sing, output_read2_pair,
                output_read2_sing,
                "ILLUMINACLIP:{}:2:30:10".format(adapterFile), "LEADING:3",
                "TRAILING:3", "SLIDINGWINDOW:4:15", "MINLEN:30"
            ]
            common.print_command(command)
            # Do not execute if the files have already been generated.  The
            # absolute output path is checked so the test does not depend
            # on the current working directory.
            if not common.check_dryrun(sample_config) and not os.path.exists(
                    output_read1_pair):
                with open("{}_trimmomatic.stdOut".format(read1_baseName),
                          "w") as stdOut, \
                        open(stdErr_name, "w") as stdErr:
                    returnValue = subprocess.call(command,
                                                  stdout=stdOut,
                                                  stderr=stdErr)
                if returnValue != 0:
                    print("error while running command: {}".format(command))
            libraryInfo["pair1"] = output_read1_pair
            libraryInfo["pair2"] = output_read2_pair
            libraryInfo["trimmomatic"] = os.path.join(
                trimmomaticDir, stdErr_name)
    finally:
        os.chdir(mainDirectory)
    return sample_config