# Read configuration files
config = util.readConfigurationFiles()
header = config.getboolean("server", "PBS_header")

# Get samples and conditions
samples = util.getMergedsamples()

# Create scripts directory, if it does not exist yet, and cd to it.
util.makeDirectory(scriptsDirectory)
os.chdir(scriptsDirectory)

# Create output directory, if it does not exist yet.
util.makeDirectory(outputDirectory)

for sample in samples:
    # Create script
    scriptName = "deduplicatebismark_" + sample + ".sh"
    script = open(scriptName, 'w')
    if header:
        util.writeHeader(script, config, "deduplicate_bismark")
    script.write("deduplicate_bismark " + "\\\n")
    script.write("--paired " + "\\\n")
    inputFile = glob.glob(inputDirectory + "/" + sample + "/*" + args.extension)[0]
    script.write(inputFile + " \\\n")
    script.write("&> " + scriptName + ".log")
    script.close()
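The `glob.glob(...)[0]` lookup above assumes there is exactly one file matching the extension in each sample directory. A minimal, hypothetical guard (reusing `inputDirectory`, `sample`, and `args.extension` from the loop above; not part of the original script) could make that assumption explicit:

import glob
import sys

# Hypothetical guard: fail loudly if the per-sample glob does not match exactly one file.
matches = glob.glob(inputDirectory + "/" + sample + "/*" + args.extension)
if len(matches) != 1:
    sys.exit("Expected exactly one input file for sample " + sample +
             ", found " + str(len(matches)))
inputFile = matches[0]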
if not os.path.exists(outputDirectory):
    os.mkdir(outputDirectory)

# Cycle through all the samples and write the bsmap scripts.
for line in samplesFile:
    sample = line.split()[2]
    if multipleLanes:
        lane = line.split()[3]
        sample = sample + "_" + lane
    # Create output directory for the sample.
    if not os.path.exists(outputDirectory + "/" + sample):
        os.mkdir(outputDirectory + "/" + sample)
    file_R1 = line.split()[0]
    file_R2 = line.split()[1]
    # Create script file.
    scriptName = 'bsmap_' + sample + '.sh'
    script = open(scriptName, 'w')
    util.writeHeader(script, config, "bsmap")
    script.write("bsmap -r 0 -s 16 -n 1" + " \\\n")
    script.write("-a " + inputDirectory + "/" + file_R1 + " \\\n")
    script.write("-b " + inputDirectory + "/" + file_R2 + " \\\n")
    script.write("-d " + genomeFile + " \\\n")
    script.write("-p " + processors + " \\\n")
    script.write("-o " + outputDirectory + "/" + sample + ".sam" + " \\\n")
    script.write("&> " + scriptName + ".log")
    script.close()

if (args.submitJobsToQueue.lower() == "yes") or (args.submitJobsToQueue.lower() == "y"):
    subprocess.call("submitJobs.py", shell=True)
if not os.path.exists(scriptsDirectory):
    os.mkdir(scriptsDirectory)
os.chdir(scriptsDirectory)

# Create output directory, if it does not exist yet.
if not os.path.exists(outputDirectory):
    os.makedirs(outputDirectory)

########################
# stringtie.sh scripts #
########################
for index, row in samplesFile.iterrows():
    sample = row["sample"]
    if not os.path.exists(os.path.join(outputDirectory, sample)):
        os.makedirs(os.path.join(outputDirectory, sample))
    scriptName = "stringtie_" + sample + ".sh"
    script = open(scriptName, "w")
    if header:
        util.writeHeader(script, config, "stringtie")
    script.write("stringtie \\\n")
    script.write(os.path.relpath(os.path.join(inputDirectory, sample, sample + "Aligned.sortedByCoord.out.bam")) + " \\\n")
    script.write("-G " + gtfFile + " \\\n")
    script.write("-o " + os.path.relpath(os.path.join(outputDirectory, sample, sample + ".gtf")) + " \\\n")
    script.write("-A " + os.path.relpath(os.path.join(outputDirectory, sample, sample + "_gene_abund.tab")) + " \\\n")
    script.write("-C " + os.path.relpath(os.path.join(outputDirectory, sample, sample + "_cov_refs.gtf")) + " \\\n")
    script.write("-p " + processors + " \\\n")
    script.write("-B" + " \\\n")
    script.write("&> " + scriptName + ".log")
    script.close()

if (args.submitJobsToQueue.lower() == "yes") or (args.submitJobsToQueue.lower() == "y"):
    subprocess.call("submitJobs.py", shell=True)
sample = row["sample"] if "Lane" in samplesFile.columns: sample = sample + "_lane_" + str(row["Lane"]) # Create output directories if not os.path.exists(outputDirectory + "/" + sample): os.mkdir(outputDirectory + "/" + sample) file_R1 = row["File_R1"] # If trimmed with trim_galore, extensions have changed. file_R1 = file_R1.replace("R1.fastq.gz", "R1_val_1.fq.gz") if "File_R2" in samplesFile.columns: file_R2 = row["File_R2"] # If trimmed with trim_galore, extensions have changed. file_R2 = file_R2.replace("R2.fastq.gz", "R2_val_2.fq.gz") # Create script file. scriptName = 'bismark_' + sample + '.sh' script = open(scriptName, 'w') if header: util.writeHeader(script, config, "bismark") script.write("bismark" + " \\\n") script.write("--bowtie2" + " \\\n") script.write("--basename " + sample + " \\\n") script.write("-output_dir " + outputDirectory + "/" + sample + " \\\n") script.write(bisulfiteGenomeFolder + " \\\n") script.write("-1 " + inputDirectory + "/" + file_R1 + " \\\n") script.write("-2 " + inputDirectory + "/" + file_R2 + " \\\n") script.write("&> " + scriptName + ".log") if (args.submitJobsToQueue.lower() == "yes") | (args.submitJobsToQueue.lower() == "y"): subprocess.call("submitJobs.py", shell=True)
genomeFile = config.get(genome, "genomeFile")
samples = util.getMergedsamples()

# Create script and output directories, if they do not exist yet.
util.makeDirectory(outputDirectory)
util.makeDirectory(scriptsDirectory)

# CD to scripts directory
os.chdir(scriptsDirectory)

# Write scripts
for sample in samples:
    scriptName = "calculatehsmetrics_" + sample + ".sh"
    script = open(scriptName, "w")
    if header:
        util.writeHeader(script, config, "calculatehsmetrics")
    # Reorder
    script.write("java -Xmx4g -Xms4g -jar " + os.path.join(picard_folder, "CalculateHsMetrics.jar") + " \\\n")
    script.write("BAIT_INTERVALS=" + os.path.join("../../results/bait_intervals", sample + "_design_bait_intervals.txt") + " \\\n")
    script.write("TARGET_INTERVALS=" + os.path.join("../../results/target_intervals", sample + "_design_target_intervals.txt") + " \\\n")
    script.write("INPUT=" + os.path.join(inputDirectory, sample + ".filtered.bam") + " \\\n")
    script.write("OUTPUT=" + os.path.join(outputDirectory, sample + "_picard_hs_metrics.txt") + " \\\n")
    script.write("METRIC_ACCUMULATION_LEVEL=ALL_READS " + "\\\n")
    script.write("REFERENCE_SEQUENCE=" + genomeFile + " \\\n")
    script.write("VALIDATION_STRINGENCY=LENIENT " + "\\\n")
    script.write("&> " + scriptName + ".log")
    script.close()
samplesFile = util.readsamplesFile()
samples = samplesFile["sample"].tolist()

# Create scripts directory, if it does not exist yet, and cd to it.
if not os.path.exists(scriptsDirectory):
    os.mkdir(scriptsDirectory)
os.chdir(scriptsDirectory)

# Create output directory, if it does not exist yet.
if not os.path.exists(outputDirectory):
    os.makedirs(outputDirectory)

# Cycle through all the samples and write the bedtools_coverage scripts.
for sample in samples:
    # Create script file.
    scriptName = "bedtools_coverage_" + sample + ".sh"
    script = open(scriptName, 'w')
    if header:
        util.writeHeader(script, config, "bedtools_coverage")
    script.write("bedtools coverage" + " \\\n")
    script.write("-a " + os.path.relpath(a) + " \\\n")
    script.write("-b " + os.path.relpath(os.path.join(inputDirectory, sample, sample + ".bam")) + " \\\n")
    script.write("-g " + os.path.relpath(chromSizes) + " \\\n")
    script.write("-hist" + " \\\n")
    script.write("-sorted" + " \\\n")
    script.write("1> " + os.path.relpath(os.path.join(outputDirectory, sample + ".txt")) + " \\\n")
    script.write("2> " + scriptName + ".log")
    script.close()

if (args.submitJobsToQueue.lower() == "yes") or (args.submitJobsToQueue.lower() == "y"):
    subprocess.call("submitJobs.py", shell=True)
# Store the list of files with the extensions fastq or fastq.gz
files = glob.glob(inputDirectory + "/*.fastq") + glob.glob(inputDirectory + "/*.fastq.gz")
files.sort()

# Write the script(s)
# Cycle through all the files, 2 by 2.
for i in range(0, len(files), 2):
    fileR1 = os.path.basename(files[i])
    fileR2 = os.path.basename(files[i + 1])
    # Create script file.
    scriptName = 'trimgalore_' + fileR1.replace("_R1", "") + '.sh'
    script = open(scriptName, 'w')
    if header:
        util.writeHeader(script, config, "trimmomatic")
    script.write("trim_galore" + " \\\n")
    script.write("--fastqc" + " \\\n")
    script.write('--fastqc_args "--outdir ' + fastqcoutputDirectory + '"' + " \\\n")
    script.write("--paired" + " \\\n")
    script.write("--length 50" + " \\\n")
    script.write("--output_dir " + outputDirectory + " \\\n")
    script.write(os.path.join(inputDirectory, fileR1) + " \\\n")
    script.write(os.path.join(inputDirectory, fileR2) + " \\\n")
    script.write("&> " + scriptName + ".log")
    script.close()

if (args.submitJobsToQueue.lower() == "yes") or (args.submitJobsToQueue.lower() == "y"):
    subprocess.call("submitJobs.py", shell=True)
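The loop above pairs R1/R2 files purely by their position in the sorted list, so it assumes every R1 file is immediately followed by its R2 mate. A small hypothetical sanity check (reusing `files` and `os` from the snippet above; not part of the original script) could verify that assumption before any script is written:

import os

# Hypothetical check: each even-indexed file must be an R1 whose R2 mate follows it directly.
for i in range(0, len(files), 2):
    r1 = os.path.basename(files[i])
    r2 = os.path.basename(files[i + 1])
    if "_R1" not in r1 or r2 != r1.replace("_R1", "_R2"):
        raise ValueError("Unpaired FASTQ files: " + r1 + " / " + r2)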
        if not os.path.exists(os.path.join(outputDirectory, comparison, event)):
            os.mkdir(os.path.join(outputDirectory, comparison, event))
        # Create script for each count type.
        for countType in countTypes:
            # Create countType subdirectory in event directory, if it does not exist yet.
            if not os.path.exists(os.path.join(outputDirectory, comparison, event, countType.lower())):
                os.mkdir(os.path.join(outputDirectory, comparison, event, countType.lower()))
            scriptName = "rmats2sashimiplots_" + comparison + "_" + event + "_" + countType.lower() + ".sh"
            script = open(scriptName, 'w')
            if header:
                util.writeHeader(script, config, "rmats2sashimiplots")
            script.write("# Deactivate Python 3 virtual environment, and activate Python 2 virtual environment" + "\n")
            script.write("source " + os.path.join(toolsFolder, "python_environments/python2.7/bin/activate") + "\n")
            script.write("\n")
            script.write("rmats2sashimiplot" + " \\\n")
            script.write("-b1 ")
            for sample in samples1[:-1]:
                script.write(os.path.relpath(os.path.join(bamDirectory, sample, sample + ".bam")) + ",")
            script.write(
# Create output directory, if it does not exist yet.
if not os.path.exists(outputDirectory):
    os.makedirs(outputDirectory)

# Write assembly_GTF_list.txt
assembly_GTF_list = open("assembly_GTF_list.txt", "w")
for sample in samples:
    assembly_GTF_list.write(os.path.abspath(os.path.join("../../results/cufflinks", sample, "transcripts.gtf")) + "\n")
assembly_GTF_list.close()

# Write the cuffmerge script.
scriptName = "cuffmerge.sh"
script = open(scriptName, 'w')
if header:
    util.writeHeader(script, config, "cuffmerge")
script.write("cuffmerge" + " \\\n")
script.write("--ref-gtf " + gtfFile + " \\\n")
script.write("--num-threads " + processors + " \\\n")
script.write("--ref-sequence " + genomeFile + " \\\n")
script.write("-o " + outputDirectory + " \\\n")
script.write("assembly_GTF_list.txt" + " \\\n")
script.write("&> " + scriptName + ".log")
script.close()

if (args.submitJobsToQueue.lower() == "yes") or (args.submitJobsToQueue.lower() == "y"):
    subprocess.call("submitJobs.py", shell=True)
# Create output directory, if it does not exist yet.
if not os.path.exists(outputDirectory):
    os.mkdir(outputDirectory)

# Create scripts directory, if it does not exist yet, and cd to it.
if not os.path.exists(scriptsDirectory):
    os.mkdir(scriptsDirectory)
os.chdir(scriptsDirectory)

# Write the scripts
for sample in samples:
    # Write the script
    scriptName = "haplotypecaller_" + sample + ".sh"
    script = open(scriptName, "w")
    if header:
        util.writeHeader(script, config, "haplotypecaller")
    script.write("java -Xmx" + xmx + " \\\n")
    script.write("-jar " + os.path.join(toolsFolder, "GenomeAnalysisTK.jar") + " \\\n")
    script.write("--analysis_type HaplotypeCaller" + " \\\n")
    script.write("--emitRefConfidence GVCF" + " \\\n")
    script.write("--variant_index_type LINEAR" + " \\\n")
    script.write("--variant_index_parameter 128000" + " \\\n")
    script.write("--reference_sequence " + genomeFile + " \\\n")
    script.write("--input_file " + os.path.join(inputDirectory, sample, sample + "_realigned_reads.bam") + " \\\n")
    script.write("--out " + os.path.join(outputDirectory, sample + ".vcf") + " \\\n")
    script.write("&> " + scriptName + ".log")
# Create scripts directory, if it does not exist yet, and cd to it.
if not os.path.exists(scriptsDirectory):
    os.mkdir(scriptsDirectory)
os.chdir(scriptsDirectory)

# Create output directory, if it does not exist yet.
if not os.path.exists(outputDirectory):
    os.makedirs(outputDirectory)

# Cycle through all the samples and write the scripts.
for index, row in samplesFile.iterrows():
    sample = row["sample"]
    # Create script file.
    scriptName = 'ensemblbedgraphtoucscbedgraph_' + sample + '.sh'
    script = open(scriptName, 'w')
    util.writeHeader(script, config, "ensemblbedgraphtoucscbedgraph")
    script.write("ensemblbedgraphtoucscbedgraph.py " + "\\\n")
    script.write("--ensembl_bedgraph " + os.path.join(inputDirectory, sample + ".bedgraph") + " \\\n")
    script.write("--ucsc_bedgraph " + os.path.join(outputDirectory, sample + ".bedgraph") + " \\\n")
    script.write("--dictionary " + dictionary + " \\\n")
    script.write("&> " + scriptName + ".log")
    script.close()
    if stranded:
        # Create positive strand script file.
        scriptName = 'ensemblbedgraphtoucscbedgraph_' + sample + '_positive.sh'
        script = open(scriptName, 'w')
        util.writeHeader(script, config, "ensemblbedgraphtoucscbedgraph")
        script.write("ensemblbedgraphtoucscbedgraph.py " + "\\\n")
        script.write("--ensembl_bedgraph " +
    os.mkdir(scriptsDirectory)
os.chdir(scriptsDirectory)

# Create output directories, if they do not exist yet.
if not os.path.exists(outputDirectory):
    os.makedirs(outputDirectory)

# Cycle through all the samples and write the markduplicates scripts.
for index, row in samplesFile.iterrows():
    sample = row["sample"]
    if "Lane" in samplesFile.columns:
        sample = sample + "_lane_" + str(row["Lane"])
    scriptName = "markduplicates_" + sample + ".sh"
    script = open(scriptName, "w")
    if header:
        util.writeHeader(script, config, "markduplicates")
    # Reorder
    script.write("java -jar " + picard_folder + "/MarkDuplicates.jar " + "\\\n")
    inputFile = glob.glob(inputDirectory + "/" + sample + "/*" + extension)[0]
    script.write("INPUT=" + inputFile + "\\\n")
    #script.write("OUTPUT=" + outputDirectory + "/" + sample + "/" + sample + "_deduplicated.bam " + "\\\n")
    script.write("OUTPUT=" + outputDirectory + "/" + sample + "_deduplicated.bam " + "\\\n")
    #script.write("METRICS_FILE=" + outputDirectory + "/" + sample + "/" + sample + "_deduplication_metrics.txt " + "\\\n")
    script.write("METRICS_FILE=" + outputDirectory + "/" + sample + "_deduplication_metrics.txt " + "\\\n")
    if remove_duplicates:
        script.write("REMOVE_DUPLICATES=true " + "\\\n")
    else:
        script.write("REMOVE_DUPLICATES=false " + "\\\n")
# Create a symbolic link to accepted_hits.bam named sampleName.bam. Useful for TopHat output, which is named accepted_hits.bam by default.
#def createSymbolicLinks():
#    for sample in samples:
#        command = "ln -fs " + os.path.join(inputDirectory, sample, "accepted_hits.bam") + " " + os.path.join(inputDirectory, sample, sample + ".bam")
#        print(command)
#        subprocess.call(command, shell=True)

#if (args.symbolicLinks.lower() == "yes") | (args.symbolicLinks.lower() == "y"):
#    createSymbolicLinks()

strands = ["positive", "negative"]

# Write script for each sample and each strand
for sample in samples:
    for strand in strands:
        # Create script file.
        scriptName = "separatebamsbystrand_" + sample + "_" + strand + ".sh"
        script = open(scriptName, "w")
        if header:
            util.writeHeader(script, config, "separatebambystrand")
        # BAM to bedgraph
        script.write("separatebambystrand.py" + " \\\n")
        script.write("--input_bam " + os.path.relpath(os.path.join(inputDirectory, sample, sample + ".bam")) + " \\\n")
        script.write("--strand " + strand + " \\\n")
        script.write("&> " + scriptName + ".log")
        script.close()

if (args.submitJobsToQueue.lower() == "yes") or (args.submitJobsToQueue.lower() == "y"):
    subprocess.call("submitJobs.py", shell=True)
lane = "" file_r1 = os.path.splitext( os.path.basename(file_r1))[0] + lane + os.path.splitext( os.path.basename(file_r1))[1] file_r2 = os.path.splitext( os.path.basename(file_r2))[0] + lane + os.path.splitext( os.path.basename(file_r2))[1] sample += lane # Create output directories if not os.path.exists(outputDirectory + "/" + sample): os.mkdir(outputDirectory + "/" + sample) # Create script file. scriptName = "star_" + sample + ".sh" script = open(scriptName, 'w') if header: util.writeHeader(script, config, "star") script.write("STAR " + "\\\n") script.write("--runMode alignReads" + " \\\n") script.write("--runThreadN " + runThreadN + " \\\n") script.write("--genomeDir " + starIndex + " \\\n") script.write("--sjdbOverhang " + str(int(readLength) - 1) + " \\\n") if not sjdbGTFfile == "None": script.write("--sjdbGTFfile " + sjdbGTFfile + " \\\n") script.write("--readFilesIn " + "\\\n") script.write( os.path.relpath(os.path.join(inputDirectory, file_r1)) + " \\\n") script.write( os.path.relpath(os.path.join(inputDirectory, file_r2)) + " \\\n") if readFilesCommand == "zcat" or readFilesCommand == "gunzip -c" or readFilesCommand == "bunzip -c": script.write("--readFilesCommand " + readFilesCommand + " \\\n") script.write(
    os.makedirs(outputDirectory)

if stranded:
    strands = ["", "_positive", "_negative"]
else:
    strands = [""]

# Cycle through all the conditions and write the meanbedgraphs scripts.
for condition in unique_conditions:
    samples = samplesFile[samplesFile.condition == condition]["sample"].tolist()
    # Create script file.
    for strand in strands:
        scriptName = 'meanbedgraphs_' + condition + strand + '.sh'
        script = open(scriptName, 'w')
        util.writeHeader(script, config, "meanbedgraphs")
        if sort:
            for sample in samples:
                script.write("sort -k 1,1 -k2,2n " + "\\\n")
                script.write(os.path.relpath(os.path.join(inputDirectory, sample + strand + ".bedgraph")) + " \\\n")
                script.write("--output " + os.path.relpath(os.path.join(inputDirectory, sample + strand + ".bedgraph")) + " \\\n")
                script.write("&> " + scriptName + ".log")
                script.write("\n\n")
        script.write("meanbedgraphs_computation.py " + "\\\n")
        script.write("--bedgraphs " + "\\\n")
        for sample in samples[:-1]:
if not os.path.exists(scriptsDirectory):
    os.mkdir(scriptsDirectory)
os.chdir(scriptsDirectory)

# Create output directory, if it does not exist yet.
if not os.path.exists(outputDirectory):
    os.makedirs(outputDirectory)

#########################
# htseqcount.sh scripts #
#########################
for sample in samples:
    scriptName = "htseqcount_" + sample + ".sh"
    script = open(scriptName, "w")
    if header:
        util.writeHeader(script, config, "htseqcount")
    script.write("# Deactivate Python 3 virtual environment, and activate Python 2 virtual environment to be able to use TopHat.\n")
    virtual_env_directory = os.path.dirname(os.environ["VIRTUAL_ENV"])
    script.write("source " + os.path.join(virtual_env_directory, "python2.7/bin/activate") + "\n\n")
    script.write("samtools sort -n" + " \\\n")
    script.write("-o " + os.path.relpath(os.path.join(inputDirectory, sample, sample + "_sorted_by_read_name" + extension)) + " \\\n")
    script.write(os.path.relpath(os.path.join(inputDirectory, sample, sample + extension)) + " \\\n")
samples = util.getMergedsamples()
# Sort in reverse alphabetical order so that wt always comes first.
samples = sorted(samples, reverse=True)

# Create scripts directory, if it does not exist yet, and cd to it.
if not os.path.exists(scriptsDirectory):
    os.mkdir(scriptsDirectory)
os.chdir(scriptsDirectory)

# Create output directory, if it does not exist yet.
if not os.path.exists(outputDirectory):
    os.makedirs(outputDirectory)

for sample in samples:
    # Create script
    scriptName = "samtools_view_" + sample + ".sh"
    script = open(scriptName, 'w')
    if header:
        util.writeHeader(script, config, "samtoolsIndex")
    script.write("samtools view -h " + "\\\n")
    inputFile = glob.glob(inputDirectory + "/" + sample + "/*" + extension)[0]
    script.write(inputFile + " \\\n")
    script.write(region + " \\\n")
    #script.write("1> " + outputDirectory + "/" + sample + "/" + sample + "_filtered_" + extension + " \\\n")
    script.write("1> " + outputDirectory + "/" + sample + "/" + sample + "_ERCC_only.bam" + " \\\n")
    script.write("2> " + scriptName + ".log")
    script.close()
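The reverse sort relies on "wt" sorting after typical mutant or treatment labels. A quick hypothetical illustration (the sample names below are made up, not from the pipeline):

# Hypothetical example: reverse alphabetical order puts "wt" ahead of e.g. "ko" or "treated".
example_samples = ["ko_1", "treated_2", "wt_1"]
print(sorted(example_samples, reverse=True))  # ['wt_1', 'treated_2', 'ko_1']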
    os.makedirs(outputDirectory)

# List comparisons. Normally, these should be the names of the directories in the input directory.
comparisons = os.listdir(inputDirectory)

# Annotate all the peaks called by macs callpeaks
for comparison in comparisons:
    inputDirectory_comparison = os.path.join(inputDirectory, comparison)
    # Create output directories
    outputDirectory_comparison = os.path.join(outputDirectory, comparison)
    if not os.path.exists(outputDirectory_comparison):
        os.makedirs(outputDirectory_comparison)
    # Create script file
    scriptName = 'findMotifs_' + comparison + '.sh'
    script = open(scriptName, 'w')
    util.writeHeader(script, config, "findMotifs")
    script.write("findMotifsGenome.pl " + "\\\n")
    if peaks == "narrow":
        script.write(os.path.join(inputDirectory_comparison, comparison + "_peaks.narrowPeak") + " \\\n")
    if peaks == "broad":
        script.write(os.path.join(inputDirectory_comparison, comparison + "_peaks.broadPeak") + " \\\n")
    script.write(genome + " \\\n")
    script.write(os.path.join(outputDirectory_comparison) + " \\\n")
    script.write("-size 150 " + "\\\n")
    script.write("&> " + scriptName + ".log")
    script.close()
if not os.path.exists(scriptsDirectory):
    os.mkdir(scriptsDirectory)
os.chdir(scriptsDirectory)

# Create output directory, if it does not exist yet.
if not os.path.exists(outputDirectory):
    os.makedirs(outputDirectory)

# Store the list of files with the extensions fastq or fastq.gz
files = glob.glob(inputDirectory + "/*.fastq") + glob.glob(inputDirectory + "/*.fastq.gz")

# Create script files.
for file in files:
    file = os.path.basename(file)
    scriptName = 'trimfastq_' + file + '.sh'
    script = open(scriptName, 'w')
    if header:
        util.writeHeader(script, config, "trimfastq")
    script.write("# Deactivate Python 3 virtual environment, and activate Python 2 virtual environment to be able to use TopHat.\n")
    virtual_env_directory = os.path.dirname(os.environ["VIRTUAL_ENV"])
    script.write("source " + os.path.join(virtual_env_directory, "python2.7/bin/activate") + "\n\n")
    script.write("trimFastq.py" + " \\\n")
    script.write(os.path.relpath(os.path.join(inputDirectory, file)) + " \\\n")
    script.write(os.path.relpath(os.path.join(outputDirectory, file)) + " \\\n")
    script.write(minlength + " \\\n")
    script.write("&> " + scriptName + ".log")
    script.close()

if (args.submitJobsToQueue.lower() == "yes") or (args.submitJobsToQueue.lower() == "y"):
    subprocess.call("submitJobs.py", shell=True)
samples = pandas.read_csv("../../scripts/samples.txt", sep="\t")

for index, row in samples.iterrows():
    # Create directories
    if not os.path.exists(row["sample"]):
        os.mkdir(row["sample"])
    os.chdir(row["sample"])

    # Symbolic links to FASTQ files
    subprocess.call("ln -s " + os.path.join(inputDirectory, row["file_r1"][:-3]), shell=True)

    # Mapper script
    mapper_script = open("mapper.sh", "w")
    if header:
        util.writeHeader(mapper_script, config, "mirdeep")
    mapper_script.write("mapper.pl \\\n")
    mapper_script.write("*.fastq \\\n")
    mapper_script.write("-h -n -o 4 -e -m -v \\\n")
    mapper_script.write("-p " + bowtieIndex + " \\\n")
    mapper_script.write("-s " + row["sample"] + "_collapsed.fa \\\n")
    mapper_script.write("-t " + row["sample"] + "_collapsed_vs_genome_" + genome + ".arf" + " \\\n")
    mapper_script.write("1> mapper.sh_output \\\n")
    mapper_script.write("2> mapper.sh_error")
    mapper_script.write("\n\n")
    mapper_script.close()

if (args.submitJobsToQueue.lower() == "yes") or (args.submitJobsToQueue.lower() == "y"):
    subprocess.call("submitJobs.py", shell=True)

# Mirdeep script
# Read input file.
samplesFile = pandas.read_csv(samplesFile, sep="\t")

# Create scripts directory, if it does not exist yet, and cd to it.
if not os.path.exists(scriptsDirectory):
    os.mkdir(scriptsDirectory)
os.chdir(scriptsDirectory)

# Create output directory, if it does not exist yet.
if not os.path.exists(outputDirectory):
    os.makedirs(outputDirectory)

# Cycle through all the samples and write the getsra scripts.
for index, row in samplesFile.iterrows():
    run = row["Run_s"]
    # Create script file.
    scriptName = "getsra_" + run + ".sh"
    script = open(scriptName, 'w')
    if header:
        util.writeHeader(script, config, "getsra")
    script.write("wget" + " \\\n")
    script.write("ftp://ftp.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/" + os.path.join(run[0:6], run, run + ".sra") + " \\\n")
    script.write("&> " + scriptName + ".log")
    script.close()

if (args.submitJobsToQueue.lower() == "yes") or (args.submitJobsToQueue.lower() == "y"):
    subprocess.call("submitJobs.py", shell=True)
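To make the URL construction concrete: the first six characters of the run accession select the intermediate directory on the SRA FTP mirror. A hypothetical accession (SRR1234567 is made up for illustration) resolves as follows:

import os

run = "SRR1234567"  # hypothetical run accession, not from the samples file
url = ("ftp://ftp.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/"
       + os.path.join(run[0:6], run, run + ".sra"))
# -> ftp://ftp.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR123/SRR1234567/SRR1234567.sra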
samplesFile = util.readSamplesFile()
samples = samplesFile["sample"]

# Create script and output directories, if they do not exist yet.
util.makeDirectory(outputDirectory)
util.makeDirectory(scriptsDirectory)

# CD to scripts directory
os.chdir(scriptsDirectory)

# Write scripts
for sample in samples:
    scriptName = "collectinsertsizemetrics_" + sample + ".sh"
    script = open(scriptName, "w")
    if header:
        util.writeHeader(script, config, "collectinsertsizemetrics")
    # Reorder
    script.write("java -Xmx4g -jar " + os.path.join(picard_folder, "CollectInsertSizeMetrics.jar") + " \\\n")
    script.write("VALIDATION_STRINGENCY=LENIENT " + "\\\n")
    script.write("HISTOGRAM_FILE=" + os.path.join(outputDirectory, sample + "_picard_insert_size_plot.pdf") + " \\\n")
    script.write("INPUT=" + os.path.join(inputDirectory, sample + ".filtered.bam") + " \\\n")
    script.write("OUTPUT=" + os.path.join(outputDirectory, sample + "_picard_insert_size_metrics.txt") + " \\\n")
# Create output directory, if it does not exist yet.
if not os.path.exists(outputDirectory):
    os.makedirs(outputDirectory)

############################
# featurecounts.sh scripts #
############################
for index, row in samplesFile.iterrows():
    sample = row["sample"]
    if "lane" in samplesFile.columns:
        sample += "_" + row["lane"]
    scriptName = "featurecounts_" + sample + ".sh"
    script = open(scriptName, "w")
    if header:
        util.writeHeader(script, config, "featurecounts")
    script.write("featureCounts \\\n")
    script.write("-T " + T + " \\\n")
    if p:
        script.write("-p" + " \\\n")
    if B:
        script.write("-B" + " \\\n")
    if C:
        script.write("-C" + " \\\n")
    script.write("-s " + s + " \\\n")
    if M:
        script.write("-M" + " \\\n")
    if O:
        script.write("-O" + " \\\n")
    script.write("-a " + gtfFile + " \\\n")
samples = util.getMergedsamples()

# Create script and output directories, if they do not exist yet.
util.makeDirectory(outputDirectory)
util.makeDirectory(scriptsDirectory)

# CD to scripts directory
os.chdir(scriptsDirectory)

# Write scripts
for sample in samples:
    scriptName = "target_interval_" + sample + ".sh"
    script = open(scriptName, "w")
    if header:
        util.writeHeader(script, config, "target_interval")
    # Reorder
    script.write("samtools view -H" + " \\\n")
    script.write(os.path.join(inputDirectory, sample + ".filtered.bam") + " \\\n")
    script.write("1> " + os.path.join(outputDirectory, sample + "_bam_header.txt") + " \\\n")
    script.write("2> " + scriptName + ".log")
    script.write("\n\n")
    script.write("cat " + target_BED + " | " + "\\\n")
    script.write("gawk '{print $1 \"\\t\" $2+1 \"\\t\" $3 \"\\t+\\tinterval_\" NR}' " +
genomeFile = config.get(genome, "genomeFile")
samples = util.getMergedsamples()

# Create script and output directories, if they do not exist yet.
util.makeDirectory(outputDirectory)
util.makeDirectory(scriptsDirectory)

# CD to scripts directory
os.chdir(scriptsDirectory)

# Write scripts
for sample in samples:
    scriptName = "bsmap_methratio_" + sample + ".sh"
    script = open(scriptName, "w")
    if header:
        util.writeHeader(script, config, "bsmap_methratio")
    # Reorder
    script.write("java -Xmx4g -Xms4g -jar " + os.path.join(picard_folder, "CalculateHsMetrics.jar") + " \\\n")
    script.write("BAIT_INTERVALS=" + os.path.join("../../results/bait_intervals", sample + "_design_bait_intervals.txt") + " \\\n")
    script.write("TARGET_INTERVALS=" + os.path.join("../../results/target_intervals", sample + "_design_target_intervals.txt") + " \\\n")
    script.write("INPUT=" + os.path.join(inputDirectory, sample + ".filtered.bam") + " \\\n")
    script.write("OUTPUT=" + os.path.join(outputDirectory, sample + "_picard_hs_metrics.txt") + " \\\n")
    script.write("METRIC_ACCUMULATION_LEVEL=ALL_READS " + "\\\n")
    script.write("REFERENCE_SEQUENCE=" + genomeFile + " \\\n")
    script.write("VALIDATION_STRINGENCY=LENIENT " + "\\\n")
    script.write("&> " + scriptName + ".log")
    script.close()
unique_conditions = list(OrderedDict.fromkeys(conditions))  # Remove duplicates.

# Create scripts directory, if it does not exist yet, and cd to it.
if not os.path.exists(scriptsDirectory):
    os.mkdir(scriptsDirectory)
os.chdir(scriptsDirectory)

# Create output directory, if it does not exist yet.
if not os.path.exists(outputDirectory):
    os.makedirs(outputDirectory)

# Write the cuffdiff script.
scriptName = "cuffdiff.sh"
script = open(scriptName, 'w')
if header:
    util.writeHeader(script, config, "cuffdiff")
script.write("cuffdiff" + " \\\n")
script.write("--labels ")
script.write(",".join(unique_conditions) + " \\\n")
script.write("-p " + processors + " \\\n")
script.write("--no-effective-length-correction " + "\\\n")
if stranded:
    script.write("--library-type fr-firststrand" + " \\\n")
script.write("-u -b " + genomeFile + " \\\n")
script.write("-o " + os.path.relpath(os.path.join(outputDirectory)) + " \\\n")
script.write(gtfFile + " \\\n")
script.write(os.path.relpath(os.path.join(inputDirectory, samples[0], "accepted_hits.bam")))
previous_condition = conditions[0]
for sample, condition in zip(samples[1:], conditions[1:]):
    os.makedirs(os.path.join(outputDirectory, "too_long_after_trimming"))

# Store the list of files with the extensions fastq or fastq.gz
files = glob.glob(inputDirectory + "/*.fastq") + glob.glob(inputDirectory + "/*.fastq.gz")
files.sort()

# Write the script(s)
# Cycle through all the R1 files.
for file in files:
    fileR1 = os.path.basename(file)
    # Create script file.
    scriptName = 'cutadapt_' + fileR1 + '.sh'
    script = open(scriptName, 'w')
    if header:
        util.writeHeader(script, config, "cutadapt")
    script.write("cutadapt" + " \\\n")
    if not adapter == "None":
        script.write("--adapter " + adapter + " \\\n")
    if not minlength == "None":
        script.write("--minimum-length " + minlength + " \\\n")
    if not maxlength == "None":
        script.write("--maximum-length " + maxlength + " \\\n")
    if not qualitycutoff == "None":
        script.write("--quality-cutoff " + qualitycutoff + " \\\n")
    if trimn:
        script.write("--trim-n" + " \\\n")
    if not cut == "None":
        script.write("--cut " + cut + " \\\n")
    if (args.gzip.lower() == "no") or (args.gzip.lower() == "n"):
        script.write(
# Read samples file.
samplesDataFrame = util.readSamplesFile()

# Create scripts directory, if it does not exist yet, and cd to it.
if not os.path.exists(scriptsDirectory):
    os.mkdir(scriptsDirectory)
os.chdir(scriptsDirectory)

# Cycle through all the samples and write the cellrangercount scripts.
for index, row in samplesDataFrame.iterrows():
    sample = row["sample"]
    # Create script file.
    scriptName = "cellrangercount_" + sample + ".sh"
    script = open(scriptName, 'w')
    if header:
        util.writeHeader(script, config, "cellrangercount")
    script.write("cellranger count " + "\\\n")
    script.write("--localcores=" + localcores + " \\\n")
    script.write("--localmem=" + localmem + " \\\n")
    script.write("--id=" + sample + " \\\n")
    script.write("--fastqs " + os.path.relpath(os.path.join(inputDirectory)) + " \\\n")
    script.write("--sample=" + sample + " \\\n")
    script.write("--transcriptome=" + cellrangerTranscriptome + " \\\n")
    script.write("&> " + scriptName + ".log")
    script.close()

if (args.submitJobsToQueue.lower() == "yes") or (args.submitJobsToQueue.lower() == "y"):
    subprocess.call("submitJobs.py", shell=True)
fields = file.split(".") samples.append(".".join(fields[-3:])) # Remove duplicates, and sort lanes. samples = sorted(list(set(samples))) all_samples_grouped_by_lane = [] for sample in samples: one_sample_grouped_by_lane = sorted( glob.glob(inputDirectory + "/*" + sample)) all_samples_grouped_by_lane.append(one_sample_grouped_by_lane) # Write the script scriptName = 'catFASTQFiles.sh' script = open(scriptName, 'w') if header: util.writeHeader(script, config, "catFASTQFiles") for sample, one_sample_grouped_by_lane in zip(samples, all_samples_grouped_by_lane): script.write("cat " + "\\\n") script.write(" \\\n".join(one_sample_grouped_by_lane) + " \\\n") script.write("1> " + os.path.relpath(os.path.join(outputDirectory, sample)) + " \\\n") script.write("2>> " + scriptName + ".log") script.write("\n\n") script.close() if (args.submitJobsToQueue.lower() == "yes") | (args.submitJobsToQueue.lower() == "y"):
# Create output directory, if it does not exist yet.
if not os.path.exists(outputDirectory):
    os.makedirs(outputDirectory)

############################
# dexseqcounts.sh scripts #
############################
for index, row in samplesFile.iterrows():
    sample = row["sample"]
    if "Lane" in samplesFile.columns:
        sample = sample + "_lane_" + str(row["Lane"])
    scriptName = "dexseqcounts_" + sample + ".sh"
    script = open(scriptName, "w")
    if header:
        util.writeHeader(script, config, "dexseqcounts")
    script.write("source " + os.path.join(toolsFolder, "python_environments/python2.7/bin/activate"))
    script.write("\n\n")
    script.write("dexseq_count.py" + " \\\n")
    script.write("--paired=yes" + " \\\n")
    if stranded:
        script.write("--stranded=reverse" + " \\\n")
    else:
        script.write("--stranded=no" + " \\\n")
    script.write("--format=bam" + " \\\n")
    script.write("--order=pos" + " \\\n")
    script.write(dexseq_gtfFile + " \\\n")
    script.write(os.path.relpath(os.path.join(inputDirectory, sample, "accepted_hits.bam")) +