def _run_fastqc(global_config, sample_config, sorted_libraries_by_insert):
    """Run FastQC on every library and record the output folder.

    A ``fastqc`` directory is created under the current working directory.
    For each library the command line is assembled from the configured
    binary, the configured options and the read file(s), printed through
    ``common.print_command`` and executed unless this is a dry run or the
    ``<read1 basename>_fastqc.zip`` report already exists in that folder.

    Args:
        global_config: dict providing ``["Tools"]["fastqc"]["bin"]`` and
            ``["Tools"]["fastqc"]["options"]`` (an iterable of arguments).
        sample_config: sample dict; its ``"fastqc"`` key is set to the
            output folder.
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            pairs; ``libraryInfo`` must provide ``"pair1"`` and ``"pair2"``
            (``"pair2"`` may be ``None``).

    Returns:
        The updated ``sample_config``.
    """
    fastqc_folder = os.path.join(os.getcwd(), "fastqc")
    if not os.path.exists(fastqc_folder):
        os.makedirs(fastqc_folder)
    program = global_config["Tools"]["fastqc"]["bin"]
    program_options = global_config["Tools"]["fastqc"]["options"]
    for library, libraryInfo in sorted_libraries_by_insert:
        command = [program]
        command.extend(program_options)
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        command.append(read1)
        if read2 is not None:
            command.append(read2)
        common.print_command(command)
        report_prefix = os.path.join(
            fastqc_folder, os.path.basename(read1).split(".fastq.gz")[0])
        if common.check_dryrun(sample_config):
            continue
        if os.path.exists("{}_fastqc.zip".format(report_prefix)):
            # Report already produced by a previous run: do not redo it.
            continue
        stdout_path = os.path.join(
            fastqc_folder, "{}_fastqc.stdout".format(library))
        stderr_path = os.path.join(
            fastqc_folder, "{}_fastqc.stderr".format(library))
        # Context managers guarantee the log files are closed per library.
        with open(stdout_path, "a") as fastq_stdOut, \
                open(stderr_path, "a") as fastq_stdErr:
            # The exit status is deliberately ignored (best effort).
            subprocess.call(command, stdout=fastq_stdOut, stderr=fastq_stdErr)
    sample_config["fastqc"] = fastqc_folder
    return sample_config
def _iter_fasta_records(fasta_fd):
    """Yield ``(name, sequence)`` for each record of an open FASTA file.

    The name is the header text after ``>`` up to the first space or tab.
    An empty file yields nothing.
    """
    name = None
    chunks = []
    for line in fasta_fd:
        line = line.strip()
        if line.startswith(">"):
            if name is not None:
                yield name, "".join(chunks)
            name = line.split(" ")[0].split("\t")[0].split(">")[1]
            chunks = []
        elif name is not None:
            chunks.append(line)
        elif line:
            sys.exit("error while parsing reference: sequence data found "
                     "before the first FASTA header")
    if name is not None:
        yield name, "".join(chunks)


def _run_qaTools(global_config, sample_config, sorted_libraries_by_insert):
    """Run qaTools on the first alignment and add a GC column to its table.

    Work happens inside a ``QAstats`` directory created under the current
    working directory. Unless this is a dry run or ``<BAM basename>.cov``
    already exists there, qaTools is executed, its table is rewritten to
    ``<BAM basename>.cov.gc`` with an extra ``GCperc`` column computed by
    ``computeGC`` for every contig of the reference, and ``plotQA`` is
    called on the new table.

    Args:
        global_config: dict providing ``["Tools"]["qaTools"]["bin"]``.
        sample_config: dict providing ``"reference"`` (FASTA path) and
            ``"alignments"``; the BAM is ``sample_config["alignments"][0][1]``.
        sorted_libraries_by_insert: unused.

    Returns:
        ``sample_config`` (not modified).

    Raises:
        SystemExit: if qaTools returns a non-zero status, or a reference
            contig is missing from the qaTools table.
    """
    qa_folder = os.path.join(os.getcwd(), "QAstats")
    if not os.path.exists(qa_folder):
        os.makedirs(qa_folder)
    os.chdir("QAstats")
    program = global_config["Tools"]["qaTools"]["bin"]
    reference = sample_config["reference"]
    BAMfile = sample_config["alignments"][0][1]
    cov_file = "{}.cov".format(os.path.basename(BAMfile))
    command = ["{}".format(program), "-m", "-q", "0", "-i", BAMfile, cov_file]
    common.print_command(command)
    if not common.check_dryrun(sample_config) and not os.path.exists(cov_file):
        # Context managers make sure the log handles are closed.
        with open("QAtools.stdOut", "a") as stdOut, \
                open("QAtools.stdErr", "a") as stdErr:
            returnValue = subprocess.call(command, stdout=stdOut, stderr=stdErr)
        if returnValue != 0:
            sys.exit("error, while running QAtools: {}".format(command))
        # Now add GC content: index the qaTools rows by contig name.
        QAtools_dict = {}
        with open(cov_file, "r") as QA_csv:
            header = QA_csv.readline().rstrip()
            for line in QA_csv:
                fields = line.strip().split("\t")
                QAtools_dict[fields[0]] = [fields[1], fields[2], fields[3]]
        QA_GC_file = "{}.cov.gc".format(os.path.basename(BAMfile))
        with open(QA_GC_file, "w") as QA_GC_fd:
            QA_GC_fd.write("{}\tGCperc\n".format(header))
            with open(reference, "r") as ref_fd:
                for fasta_header, sequence in _iter_fasta_records(ref_fd):
                    GC = computeGC(sequence)
                    if fasta_header not in QAtools_dict:
                        sys.exit("error while parsing QAcompute output: probably some wired contig name is present in your assmebly file")
                    stats = QAtools_dict[fasta_header]
                    QA_GC_fd.write("{}\t{}\t{}\t{}\t{}\n".format(
                        fasta_header, stats[0], stats[1], stats[2], GC))
        plotQA(QA_GC_file)
    os.chdir("..")
    return sample_config
def _run_fastqc(global_config, sample_config, sorted_libraries_by_insert):
    """Run FastQC once per library and store the report folder.

    Creates ``fastqc`` under the current working directory. Each library's
    command (binary, configured options, read1 and, when present, read2) is
    printed with ``common.print_command``; it is executed only when this is
    not a dry run and ``<read1 basename>_fastqc.zip`` is not yet present in
    the report folder.

    Args:
        global_config: dict providing ``["Tools"]["fastqc"]["bin"]`` and
            ``["Tools"]["fastqc"]["options"]`` (an iterable of arguments).
        sample_config: sample dict; ``sample_config["fastqc"]`` is set to
            the report folder.
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            pairs, where ``libraryInfo`` has ``"pair1"`` and ``"pair2"``
            (``"pair2"`` may be ``None``).

    Returns:
        The updated ``sample_config``.
    """
    FastqcFolder = os.path.join(os.getcwd(), "fastqc")
    if not os.path.exists(FastqcFolder):
        os.makedirs(FastqcFolder)
    program = global_config["Tools"]["fastqc"]["bin"]
    program_options = global_config["Tools"]["fastqc"]["options"]
    for library, libraryInfo in sorted_libraries_by_insert:
        command = [program]
        command.extend(program_options)
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        command.append(read1)
        if read2 is not None:
            command.append(read2)
        common.print_command(command)
        folder_output_name = os.path.join(
            FastqcFolder, os.path.basename(read1).split(".fastq.gz")[0])
        already_done = "{}_fastqc.zip".format(folder_output_name)
        if not common.check_dryrun(sample_config) \
                and not os.path.exists(already_done):
            log_prefix = os.path.join(FastqcFolder, "{}_fastqc".format(library))
            # "with" closes both log files even if the call raises.
            with open("{}.stdout".format(log_prefix), "a") as fastq_stdOut, \
                    open("{}.stderr".format(log_prefix), "a") as fastq_stdErr:
                # FastQC's exit status is intentionally not checked.
                subprocess.call(
                    command, stdout=fastq_stdOut, stderr=fastq_stdErr)
    sample_config["fastqc"] = FastqcFolder
    return sample_config
def _run_FRC(global_config, sample_config, sorted_libraries_by_insert):
    """Run FRCurve on the sample alignments and plot the result.

    Work happens inside a ``FRCurve`` directory created under the current
    working directory. The first alignment is passed as the paired-end
    library with an insert window of +/-60% around its insert size; when a
    second alignment exists it is passed as the mate-pair library with a
    +/-50% window. The command is always printed; it is executed (and
    ``plotFRCurve`` called) only when this is not a dry run and
    ``<output>_FRC.png`` does not exist yet.

    Args:
        global_config: dict providing ``["Tools"]["FRC"]["bin"]``.
        sample_config: dict providing ``"genomeSize"``, ``"output"`` and
            ``"alignments"``, a sequence whose items hold the insert size
            at index 0 and the BAM path at index 1.
        sorted_libraries_by_insert: unused.

    Returns:
        ``sample_config`` (not modified).

    Raises:
        SystemExit: if FRCurve returns a non-zero status.
    """
    FRCurveFolder = os.path.join(os.getcwd(), "FRCurve")
    if not os.path.exists(FRCurveFolder):
        os.makedirs(FRCurveFolder)
    os.chdir("FRCurve")
    program = global_config["Tools"]["FRC"]["bin"]
    genomeSize = sample_config["genomeSize"]
    output = sample_config["output"]
    alignments = sample_config["alignments"]

    peInsert = alignments[0][0]
    peBam = alignments[0][1]
    peMinInsert = int(peInsert - peInsert * 0.60)
    peMaxInsert = int(peInsert + peInsert * 0.60)
    command = [
        program,
        "--pe-sam", peBam,
        "--pe-min-insert", "{}".format(peMinInsert),
        "--pe-max-insert", "{}".format(peMaxInsert),
        "--CEstats-PE-min", "-4",
        "--CEstats-PE-max", "4",
    ]
    if len(alignments) > 1:
        mpInsert = alignments[1][0]
        mpBam = alignments[1][1]
        mpMinInsert = int(mpInsert - mpInsert * 0.50)
        mpMaxInsert = int(mpInsert + mpInsert * 0.50)
        command += [
            "--mp-sam", mpBam,
            "--mp-min-insert", "{}".format(mpMinInsert),
            "--mp-max-insert", "{}".format(mpMaxInsert),
        ]
    command += ["--genome-size", "{}".format(genomeSize), "--output", output]
    common.print_command(command)
    if not common.check_dryrun(sample_config) \
            and not os.path.exists("{}_FRC.png".format(output)):
        # Context managers make sure the log handles are closed.
        with open("FRC.stdOut", "a") as stdOut, \
                open("FRC.stdErr", "a") as stdErr:
            returnValue = subprocess.call(command, stdout=stdOut, stderr=stdErr)
        if returnValue != 0:
            sys.exit("error, while running FRCurve: {}".format(command))
        plotFRCurve(output)
    os.chdir("..")
    return sample_config
def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
    """Run ABYSS-P to build a k-mer coverage histogram and plot it.

    Work happens inside an ``abyss_kmer`` directory created under the
    current working directory. ``ABYSS-P`` is looked up in the directory of
    the configured abyss binary and launched through ``mpirun``. The
    command is always printed; it is executed only when this is not a dry
    run and ``histogram.hist`` does not exist yet. After a successful run
    the intermediate ``preUnitgs.fa`` is removed and four k-mer coverage
    plots are produced with ``_plotKmerPlot``.

    Args:
        global_config: dict providing ``["Tools"]["abyss"]["bin"]``.
        sample_config: dict that must contain ``"kmer"``; ``"threads"`` is
            optional (default 16). Its ``"abyss"`` key is overwritten with
            the working folder.
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            pairs; ``libraryInfo`` provides ``"pair1"``, ``"pair2"`` and
            ``"orientation"`` (``"innie"``, ``"outtie"`` or ``"none"``).

    Returns:
        The updated ``sample_config``.

    Raises:
        SystemExit: if ``"kmer"`` is missing from ``sample_config``.
    """
    ABySS_Kmer_Folder = os.path.join(os.getcwd(), "abyss_kmer")
    if "kmer" not in sample_config:
        sys.exit("error in _run_abyss QCcontrol: kmer must be present in sample_config.yaml")
    kmer = sample_config["kmer"]
    if not os.path.exists(ABySS_Kmer_Folder):
        os.makedirs(ABySS_Kmer_Folder)
    os.chdir(ABySS_Kmer_Folder)

    program = os.path.join(
        os.path.dirname(global_config["Tools"]["abyss"]["bin"]), "ABYSS-P")
    # NOTE(review): the configured ABySS options
    # (global_config["Tools"]["abyss"]["options"] / sample_config["abyss"])
    # are not passed on the command line -- confirm whether they should be.
    threads = sample_config.get("threads", 16)  # 16: default for UPPMAX

    # Argument list (no shell): read paths containing spaces or shell
    # metacharacters are passed to ABYSS-P verbatim.
    command = [
        "mpirun", "-np", "{}".format(threads), program,
        "-k", "{}".format(kmer),
        "--coverage-hist=histogram.hist",
        "-o", "preUnitgs.fa",
    ]
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        if orientation in ("innie", "outtie"):
            command.append(read1)
            if read2 is not None:
                command.append(read2)
        elif orientation == "none":
            command.append(read1)
    common.print_command(command)

    if not common.check_dryrun(sample_config) \
            and not os.path.exists("histogram.hist"):
        with open("ABySS_Kmer_Folder.stdOut", "a") as ABySS_Kmer_stdOut, \
                open("ABySS_Kmer_Folder.stdErr", "a") as ABySS_Kmer_stdErr:
            returnValue = subprocess.call(
                command, stdout=ABySS_Kmer_stdOut, stderr=ABySS_Kmer_stdErr)
        # Any non-zero status is a failure, including negative values
        # reported when the process is killed by a signal.
        if returnValue != 0:
            print("ABySS kmer plotting failed: unkwnown reason")
        else:
            if os.path.exists("preUnitgs.fa"):
                os.remove("preUnitgs.fa")
            _plotKmerPlot(1, 200, kmer, "kmer_coverage_1_200.png")
            _plotKmerPlot(1, 500, kmer, "kmer_coverage_1_500.png")
            _plotKmerPlot(15, 200, kmer, "kmer_coverage_15_200.png")
            _plotKmerPlot(15, 500, kmer, "kmer_coverage_15_500.png")
    os.chdir("..")
    sample_config["abyss"] = ABySS_Kmer_Folder
    return sample_config
def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
    """Compute a k-mer coverage histogram with ABYSS-P and plot it.

    An ``abyss_kmer`` directory is created under the current working
    directory and used as the working directory. ``ABYSS-P`` (taken from
    the directory of the configured abyss binary) is started via
    ``mpirun`` on the reads of every library. The command is printed in all
    cases and executed only when this is not a dry run and
    ``histogram.hist`` is absent. On success ``preUnitgs.fa`` is deleted
    and ``_plotKmerPlot`` is called for four coverage windows.

    Args:
        global_config: dict providing ``["Tools"]["abyss"]["bin"]``.
        sample_config: dict with a mandatory ``"kmer"`` entry and an
            optional ``"threads"`` entry (default 16); its ``"abyss"`` key
            is set to the working folder before returning.
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            pairs; ``libraryInfo`` provides ``"pair1"``, ``"pair2"`` and
            ``"orientation"`` (``"innie"``, ``"outtie"`` or ``"none"``).

    Returns:
        The updated ``sample_config``.

    Raises:
        SystemExit: if ``"kmer"`` is missing from ``sample_config``.
    """
    ABySS_Kmer_Folder = os.path.join(os.getcwd(), "abyss_kmer")
    if "kmer" not in sample_config:
        sys.exit("error in _run_abyss QCcontrol: kmer must be present in sample_config.yaml")
    kmer = sample_config["kmer"]
    if not os.path.exists(ABySS_Kmer_Folder):
        os.makedirs(ABySS_Kmer_Folder)
    os.chdir(ABySS_Kmer_Folder)

    abyss_bin = global_config["Tools"]["abyss"]["bin"]
    program = os.path.join(os.path.dirname(abyss_bin), "ABYSS-P")
    # NOTE(review): options configured for abyss (global "options" entry or
    # sample_config["abyss"]) are not added to the command -- confirm intent.
    threads = 16  # default for UPPMAX
    if "threads" in sample_config:
        threads = sample_config["threads"]

    # Built as an argument list and run without a shell so that file names
    # are never re-parsed (spaces, globs, quotes).
    command = ["mpirun", "-np", "{}".format(threads), program]
    command += ["-k", "{}".format(kmer)]
    command += ["--coverage-hist=histogram.hist", "-o", "preUnitgs.fa"]
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        if orientation == "innie" or orientation == "outtie":
            command.append(read1)
            if read2 is not None:
                command.append(read2)
        if orientation == "none":
            command.append(read1)
    common.print_command(command)

    must_run = (not common.check_dryrun(sample_config)
                and not os.path.exists("histogram.hist"))
    if must_run:
        with open("ABySS_Kmer_Folder.stdOut", "a") as ABySS_Kmer_stdOut, \
                open("ABySS_Kmer_Folder.stdErr", "a") as ABySS_Kmer_stdErr:
            returnValue = subprocess.call(
                command, stdout=ABySS_Kmer_stdOut, stderr=ABySS_Kmer_stdErr)
        if returnValue != 0:
            # Covers negative statuses (process killed by a signal) too.
            print("ABySS kmer plotting failed: unkwnown reason")
        else:
            if os.path.exists("preUnitgs.fa"):
                os.remove("preUnitgs.fa")
            for min_cov, max_cov in ((1, 200), (1, 500), (15, 200), (15, 500)):
                _plotKmerPlot(
                    min_cov, max_cov, kmer,
                    "kmer_coverage_{}_{}.png".format(min_cov, max_cov))
    os.chdir("..")
    sample_config["abyss"] = ABySS_Kmer_Folder
    return sample_config
def main(args):
    """Load both configuration files and run the requested pipeline.

    Args:
        args: object with ``global_config`` and ``sample_config``
            attributes, each the path of a YAML file.

    Returns:
        0 when the pipeline has been run.

    Raises:
        SystemExit: if ``sample_config["pipeline"]`` is not listed in
            ``global_config["Pipelines"]``.
    """
    # safe_load instead of yaml.load: a configuration file only needs plain
    # YAML, and yaml.load without an explicit Loader can build arbitrary
    # Python objects (recent PyYAML releases refuse to run it that way).
    with open(args.global_config) as in_handle:
        global_config = yaml.safe_load(in_handle)
    with open(args.sample_config) as sample_config_handle:
        sample_config = yaml.safe_load(sample_config_handle)

    check_consistency(global_config, sample_config)
    if common.check_dryrun(sample_config):
        print("Option dryrun idenitfied: commands will only be printed, not executed")
    if sample_config["pipeline"] not in global_config["Pipelines"]:
        sys.exit("Error: pipeline {} is not one of the supported ones:{}".format(
            sample_config["pipeline"], global_config["Pipelines"]))
    run_analys(global_config, sample_config)
    return 0
def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert):
    """Trim every paired library with Trimmomatic and repoint its reads.

    Work happens inside a ``Trimmomatic`` directory created under the
    current working directory; the original working directory is restored
    before returning. For each library with a second read file the PE
    command is printed and, unless this is a dry run or the trimmed read1
    file already exists, executed. The library's ``"pair1"``/``"pair2"``
    entries are then replaced by the trimmed paired outputs and
    ``"trimmomatic"`` is set to the path of the stderr log. Libraries
    whose ``"pair2"`` is ``None`` are left untouched.

    Args:
        global_config: dict providing ``["Tools"]["trimmomatic"]["bin"]``
            (the jar passed to ``java -jar``).
        sample_config: dict that must contain ``"adapters"`` (path of an
            existing adapter file); ``"threads"`` is optional (default 8).
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            pairs; ``libraryInfo`` provides ``"pair1"`` and ``"pair2"``.

    Returns:
        ``sample_config``.

    Raises:
        SystemExit: if the adapter file is not configured or does not exist.
    """
    program = global_config["Tools"]["trimmomatic"]["bin"]
    if "adapters" not in sample_config:
        # Adjacent literals instead of a backslash continuation inside the
        # string, which would embed the source indentation in the message.
        sys.exit("running MP pipeline, adapters file to be used in trimming "
                 "are needed for Trimmomatic. Please specify them "
                 "in the sample configuration file and rerun")
    adapterFile = sample_config["adapters"]
    if not os.path.exists(adapterFile):
        sys.exit("Trimmomatic cannot be run as adapter file is not specified or points to unknown position: {}".format(adapterFile))

    mainDirectory = os.getcwd()
    trimmomaticDir = os.path.join(mainDirectory, "Trimmomatic")
    if not os.path.exists(trimmomaticDir):
        os.makedirs(trimmomaticDir)
    os.chdir(trimmomaticDir)
    # Now in the running directory: process the libraries one by one.
    threads = sample_config.get("threads", 8)
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        if read2 is None:
            continue
        read1_baseName = os.path.split(read1)[1].split(".")[0]
        read2_baseName = os.path.split(read2)[1].split(".")[0]
        output_read1_pair = os.path.join(
            trimmomaticDir, "{}.fastq.gz".format(read1_baseName))
        output_read1_sing = os.path.join(
            trimmomaticDir, "{}_u.fastq.gz".format(read1_baseName))
        output_read2_pair = os.path.join(
            trimmomaticDir, "{}.fastq.gz".format(read2_baseName))
        output_read2_sing = os.path.join(
            trimmomaticDir, "{}_u.fastq.gz".format(read2_baseName))
        command = [
            "java", "-jar", program, "PE",
            "-threads", "{}".format(threads), "-phred33",
            read1, read2,
            output_read1_pair, output_read1_sing,
            output_read2_pair, output_read2_sing,
            "ILLUMINACLIP:{}:2:30:10".format(adapterFile),
            "LEADING:3", "TRAILING:3", "SLIDINGWINDOW:4:15", "MINLEN:30",
        ]
        common.print_command(command)
        stderr_log = "{}_trimmomatic.stdErr".format(read1_baseName)
        # Do not execute if the output has already been generated.
        if not common.check_dryrun(sample_config) \
                and not os.path.exists(output_read1_pair):
            with open("{}_trimmomatic.stdOut".format(read1_baseName), "w") as stdOut, \
                    open(stderr_log, "w") as stdErr:
                returnValue = subprocess.call(
                    command, stdout=stdOut, stderr=stdErr)
            if returnValue != 0:
                # Reported only; the library is still repointed below.
                print("error while running command: {}".format(command))
        libraryInfo["pair1"] = output_read1_pair
        libraryInfo["pair2"] = output_read2_pair
        libraryInfo["trimmomatic"] = os.path.join(trimmomaticDir, stderr_log)
    os.chdir(mainDirectory)
    return sample_config
def _run_FRC(global_config, sample_config, sorted_libraries_by_insert):
    """Run FRCurve on the sample alignments and plot the result.

    Work happens inside a ``FRCurve`` directory created under the current
    working directory. The first alignment is passed as the paired-end
    library and, when present, the second one as the mate-pair library.
    The maximum insert sizes are fixed (5000 for paired-end, 25000 for
    mate-pair) rather than derived from the libraries' insert sizes. The
    command is always printed; it is executed (and ``plotFRCurve`` called)
    only when this is not a dry run and ``<output>_FRC.png`` is absent.

    Args:
        global_config: dict providing ``["Tools"]["FRC"]["bin"]``.
        sample_config: dict providing ``"genomeSize"``, ``"output"`` and
            ``"alignments"``, a sequence whose items hold the BAM path at
            index 1.
        sorted_libraries_by_insert: unused.

    Returns:
        ``sample_config`` (not modified).

    Raises:
        SystemExit: if FRCurve returns a non-zero status.
    """
    FRCurveFolder = os.path.join(os.getcwd(), "FRCurve")
    if not os.path.exists(FRCurveFolder):
        os.makedirs(FRCurveFolder)
    os.chdir("FRCurve")
    program = global_config["Tools"]["FRC"]["bin"]
    genomeSize = sample_config["genomeSize"]
    output = sample_config["output"]
    alignments = sample_config["alignments"]

    peBam = alignments[0][1]
    command = [program, "--pe-sam", peBam, "--pe-max-insert", "5000"]
    if len(alignments) > 1:
        mpBam = alignments[1][1]
        command += ["--mp-sam", mpBam, "--mp-max-insert", "25000"]
    command += ["--genome-size", "{}".format(genomeSize), "--output", output]
    common.print_command(command)
    if not common.check_dryrun(sample_config) \
            and not os.path.exists("{}_FRC.png".format(output)):
        # Context managers make sure the log handles are closed.
        with open("FRC.stdOut", "a") as stdOut, \
                open("FRC.stdErr", "a") as stdErr:
            returnValue = subprocess.call(command, stdout=stdOut, stderr=stdErr)
        if returnValue != 0:
            sys.exit("error, while running FRCurve: {}".format(command))
        plotFRCurve(output)
    os.chdir("..")
    return sample_config
def _iter_fasta_records(fasta_fd):
    """Yield ``(name, sequence)`` for each record of an open FASTA file.

    The name is the header text after ``>`` up to the first space or tab.
    An empty file yields nothing.
    """
    name = None
    chunks = []
    for line in fasta_fd:
        line = line.strip()
        if line.startswith(">"):
            if name is not None:
                yield name, "".join(chunks)
            name = line.split(" ")[0].split("\t")[0].split(">")[1]
            chunks = []
        elif name is not None:
            chunks.append(line)
        elif line:
            sys.exit("error while parsing reference: sequence data found "
                     "before the first FASTA header")
    if name is not None:
        yield name, "".join(chunks)


def _run_qaTools(global_config, sample_config, sorted_libraries_by_insert):
    """Compute per-contig coverage with qaTools and append GC content.

    A ``QAstats`` directory is created under the current working directory
    and used as the working directory. The qaTools command is always
    printed. When this is not a dry run and ``<BAM basename>.cov`` is
    absent, qaTools is run, every contig of the reference is looked up in
    its table, ``<BAM basename>.cov.gc`` is written with an additional
    ``GCperc`` column (from ``computeGC``) and ``plotQA`` is called on it.

    Args:
        global_config: dict providing ``["Tools"]["qaTools"]["bin"]``.
        sample_config: dict providing ``"reference"`` (FASTA path) and
            ``"alignments"``; the BAM is ``sample_config["alignments"][0][1]``.
        sorted_libraries_by_insert: unused.

    Returns:
        ``sample_config`` (not modified).

    Raises:
        SystemExit: if qaTools returns a non-zero status, or a reference
            contig is missing from the qaTools table.
    """
    qaToolsFolder = os.path.join(os.getcwd(), "QAstats")
    if not os.path.exists(qaToolsFolder):
        os.makedirs(qaToolsFolder)
    os.chdir("QAstats")
    program = global_config["Tools"]["qaTools"]["bin"]
    reference = sample_config["reference"]
    BAMfile = sample_config["alignments"][0][1]
    coverage_table = "{}.cov".format(os.path.basename(BAMfile))
    command = [
        "{}".format(program), "-m", "-q", "0", "-i", BAMfile, coverage_table
    ]
    common.print_command(command)
    if not common.check_dryrun(sample_config) \
            and not os.path.exists(coverage_table):
        # "with" closes both log files once qaTools has finished.
        with open("QAtools.stdOut", "a") as stdOut, \
                open("QAtools.stdErr", "a") as stdErr:
            returnValue = subprocess.call(command, stdout=stdOut, stderr=stdErr)
        if returnValue != 0:
            sys.exit("error, while running QAtools: {}".format(command))
        # Now add GC content: first index the qaTools rows by contig name.
        QAtools_dict = {}
        with open(coverage_table, "r") as QA_csv:
            header = QA_csv.readline().rstrip()
            for line in QA_csv:
                columns = line.strip().split("\t")
                QAtools_dict[columns[0]] = [columns[1], columns[2], columns[3]]
        QA_GC_file = "{}.cov.gc".format(os.path.basename(BAMfile))
        with open(QA_GC_file, "w") as QA_GC_fd:
            QA_GC_fd.write("{}\tGCperc\n".format(header))
            with open(reference, "r") as ref_fd:
                for fasta_header, sequence in _iter_fasta_records(ref_fd):
                    GC = computeGC(sequence)
                    if fasta_header not in QAtools_dict:
                        sys.exit("error while parsing QAcompute output: probably some wired contig name is present in your assmebly file")
                    row = QAtools_dict[fasta_header]
                    QA_GC_fd.write("{}\t{}\t{}\t{}\t{}\n".format(
                        fasta_header, row[0], row[1], row[2], GC))
        plotQA(QA_GC_file)
    os.chdir("..")
    return sample_config
def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert):
    """Run Trimmomatic (PE mode) on each paired library.

    A ``Trimmomatic`` directory is created under the current working
    directory and used while processing; the starting directory is
    restored at the end. For every library that has a second read file the
    command is printed and, when this is not a dry run and the trimmed
    read1 file is not already there, executed. Afterwards the library's
    ``"pair1"`` and ``"pair2"`` point to the trimmed paired files and
    ``"trimmomatic"`` to the stderr log path. Libraries with ``"pair2"``
    set to ``None`` are skipped.

    Args:
        global_config: dict providing ``["Tools"]["trimmomatic"]["bin"]``
            (the jar passed to ``java -jar``).
        sample_config: dict that must contain ``"adapters"`` (path of an
            existing adapter file); ``"threads"`` is optional (default 8).
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            pairs; ``libraryInfo`` provides ``"pair1"`` and ``"pair2"``.

    Returns:
        ``sample_config``.

    Raises:
        SystemExit: if the adapter file is not configured or does not exist.
    """
    program = global_config["Tools"]["trimmomatic"]["bin"]
    if "adapters" not in sample_config:
        # Split into adjacent literals: a backslash continuation inside the
        # string would embed the source indentation in the message.
        sys.exit("running MP pipeline, adapters file to be used in trimming "
                 "are needed for Trimmomatic. Please specify them "
                 "in the sample configuration file and rerun")
    adapterFile = sample_config["adapters"]
    if not os.path.exists(adapterFile):
        sys.exit("Trimmomatic cannot be run as adapter file is not specified or points to unknown position: {}".format(adapterFile))

    mainDirectory = os.getcwd()
    trimmomaticDir = os.path.join(mainDirectory, "Trimmomatic")
    if not os.path.exists(trimmomaticDir):
        os.makedirs(trimmomaticDir)
    os.chdir(trimmomaticDir)
    # Now in the running directory: process the libraries one by one.
    threads = 8
    if "threads" in sample_config:
        threads = sample_config["threads"]
    trim_steps = [
        "ILLUMINACLIP:{}:2:30:10".format(adapterFile),
        "LEADING:3", "TRAILING:3", "SLIDINGWINDOW:4:15", "MINLEN:30",
    ]
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        if read2 is not None:
            read1_baseName = os.path.split(read1)[1].split(".")[0]
            read2_baseName = os.path.split(read2)[1].split(".")[0]
            output_read1_pair = os.path.join(
                trimmomaticDir, "{}.fastq.gz".format(read1_baseName))
            output_read1_sing = os.path.join(
                trimmomaticDir, "{}_u.fastq.gz".format(read1_baseName))
            output_read2_pair = os.path.join(
                trimmomaticDir, "{}.fastq.gz".format(read2_baseName))
            output_read2_sing = os.path.join(
                trimmomaticDir, "{}_u.fastq.gz".format(read2_baseName))
            command = [
                "java", "-jar", program, "PE",
                "-threads", "{}".format(threads), "-phred33",
                read1, read2,
                output_read1_pair, output_read1_sing,
                output_read2_pair, output_read2_sing,
            ] + trim_steps
            common.print_command(command)
            stdout_log = "{}_trimmomatic.stdOut".format(read1_baseName)
            stderr_log = "{}_trimmomatic.stdErr".format(read1_baseName)
            # Do not execute if the output has already been generated.
            if not common.check_dryrun(sample_config) \
                    and not os.path.exists(output_read1_pair):
                with open(stdout_log, "w") as stdOut, \
                        open(stderr_log, "w") as stdErr:
                    returnValue = subprocess.call(
                        command, stdout=stdOut, stderr=stdErr)
                if returnValue != 0:
                    # Reported only; the library is still repointed below.
                    print("error while running command: {}".format(command))
            libraryInfo["pair1"] = output_read1_pair
            libraryInfo["pair2"] = output_read2_pair
            libraryInfo["trimmomatic"] = os.path.join(
                trimmomaticDir, stderr_log)
    os.chdir(mainDirectory)
    return sample_config