def TrimGalore(TRIM_GALORE_EXE, CUTADAPT_BIN, OutDir, FastQFile1, SysOutFile, FastQFile2='', ZipOutput=False,
               TrimGaloreOptions=''):
    """Build and run a Trim Galore command line, returning the command string.

    TRIM_GALORE_EXE   -- Trim Galore executable.
    CUTADAPT_BIN      -- prefix placed directly in front of 'cutadapt'
                         (no separator is added here).
    OutDir            -- value passed to --output_dir.
    FastQFile1/2      -- input FASTQ paths; only paths that exist on disk are
                         put on the command line.
    SysOutFile        -- file that receives stdout/stderr of the run (append).
    ZipOutput         -- selects ' --gzip ' when true, ' --dont_gzip ' otherwise.
    TrimGaloreOptions -- extra options; ' --paired' and the zip flag are
                         appended when not already present.
    """
    # uF.makedir presumably creates the directory when missing -- confirm in uF.
    uF.makedir(OutDir)
    uF.makedir(dirname(SysOutFile))

    # An existing second FASTQ file switches the run to paired-end mode.
    if exists(FastQFile2) and '--paired' not in TrimGaloreOptions:
        TrimGaloreOptions += ' --paired'

    # The flag is matched including its surrounding spaces, as written here.
    zipParam = ' --gzip ' if ZipOutput else ' --dont_gzip '
    if zipParam not in TrimGaloreOptions:
        TrimGaloreOptions += zipParam

    inputFiles = ' '.join([path for path in [FastQFile1, FastQFile2] if exists(path)])
    Command = "%s --path_to_cutadapt %scutadapt --output_dir %s %s %s >> %s 2>&1" % (
        TRIM_GALORE_EXE, CUTADAPT_BIN, OutDir, TrimGaloreOptions, inputFiles, SysOutFile)
    print(Command)
    system(Command)

    # Record the exact command at the end of the log, if the log was created.
    if not exists(SysOutFile):
        return Command
    with open(SysOutFile, 'a') as logHandle:
        logHandle.write('\n\n%s\n\n' % Command)
    return Command
def rsemCalculateExpression(RSEM_BIN, FastQFiles1, RSEMTranscriptIndex, OutPrefix, SysOutFile, FastQFiles2=None,
                            NumThreads=4, RSEMCalcExprParams=''):
    """Build and run an rsem-calculate-expression command, returning the command string.

    RSEM_BIN            -- prefix placed directly in front of the program name.
    FastQFiles1         -- iterable of first-read FASTQ paths.
    FastQFiles2         -- optional iterable of second-read FASTQ paths
                           (None or empty means no second-read files).
    RSEMTranscriptIndex -- reference name passed to RSEM.
    OutPrefix           -- sample output prefix; its directory also hosts the
                           'rsem_tmp/<basename>/' temporary folder.
    SysOutFile          -- file that receives stdout/stderr (overwritten by '>').
    NumThreads          -- value for --num-threads.
    RSEMCalcExprParams  -- extra options inserted before the FASTQ arguments.

    Only FASTQ paths that exist on disk are put on the command line; each
    group is comma-joined.
    """
    # None sentinel instead of a shared mutable default list.
    if FastQFiles2 is None:
        FastQFiles2 = []

    OutDir = dirname(OutPrefix)
    tTMPDir = join(OutDir, 'rsem_tmp/')
    TMPDir = join(tTMPDir, '%s/' % (basename(OutPrefix)))
    # Only the parent of the temporary folder is created here.
    uF.makedir(tTMPDir)
    uF.makedir(dirname(SysOutFile))

    reads1 = ','.join([x for x in FastQFiles1 if exists(x)])
    reads2 = ','.join([x for x in FastQFiles2 if exists(x)])
    Command = """%srsem-calculate-expression --num-threads %s --temporary-folder %s %s %s %s %s %s > %s 2>&1""" % (
        RSEM_BIN, NumThreads, TMPDir, RSEMCalcExprParams, reads1, reads2, RSEMTranscriptIndex, OutPrefix,
        SysOutFile)
    print(Command)
    system(Command)

    # Record the exact command at the end of the log, if the log was created.
    if exists(SysOutFile):
        with open(SysOutFile, 'a') as f:
            f.write('\n\n%s\n\n' % (Command))
    return Command
def STARAlignReads(STAR_BIN, FastQFiles1, GenomeIndexDir, OutPrefix, SysOutFile, FastQFiles2=None, NumThreads=4,
                   STARAlignReadsParams=''):
    """Build and run a STAR alignReads command, returning the command string.

    STAR_BIN             -- prefix placed directly in front of 'STAR'.
    FastQFiles1          -- iterable of first-read FASTQ paths.
    FastQFiles2          -- optional iterable of second-read FASTQ paths
                            (None or empty means no second-read files).
    GenomeIndexDir       -- value for --genomeDir.
    OutPrefix            -- value for --outFileNamePrefix; its directory also
                            hosts the 'star_tmp/<name>/' temporary folder.
    SysOutFile           -- file that receives stdout/stderr of the run (append).
    NumThreads           -- value for --runThreadN.
    STARAlignReadsParams -- extra options appended to the command.

    Only FASTQ paths that exist on disk are put on the command line; each
    group is comma-joined.
    """
    # None sentinel instead of a shared mutable default list.
    if FastQFiles2 is None:
        FastQFiles2 = []

    OutDir = dirname(OutPrefix)
    tTMPDir = join(OutDir, 'star_tmp/')
    # Leading/trailing dots of the prefix basename are dropped for the folder name.
    TMPDir = join(tTMPDir, '%s/' % (basename(OutPrefix).strip('.')))
    # Only the parent of --outTmpDir is created here.
    uF.makedir(tTMPDir)
    uF.makedir(dirname(SysOutFile))

    reads1 = ','.join([x for x in FastQFiles1 if exists(x)])
    reads2 = ','.join([x for x in FastQFiles2 if exists(x)])
    Command = """%sSTAR --runMode alignReads --readFilesIn %s %s --genomeDir %s --outFileNamePrefix %s --outTmpDir %s --runThreadN %s %s >> %s 2>&1""" % (
        STAR_BIN, reads1, reads2, GenomeIndexDir, OutPrefix, TMPDir, NumThreads, STARAlignReadsParams, SysOutFile)
    print(Command)
    system(Command)

    # Record the exact command at the end of the log, if the log was created.
    if exists(SysOutFile):
        with open(SysOutFile, 'a') as f:
            f.write('\n\n%s\n\n' % (Command))
    return Command
def AddOrReplaceReadGroups(JAVA_EXE, JAVA_PARAMS, PICARD_EXE, BAMFileIn, BAMFileOut, RGSampleName, SysOutFile,
                           RGID=1, Library=1, Platform='illumina', PlatformUnit=1, SeqCentre='null',
                           Description='null', RunDate='null', SORT_ORDER='null', PicardParams=''):
    """Build and run a Picard AddOrReplaceReadGroups command, returning the command string.

    The read-group arguments map onto the Picard keys as written in the
    template: RGID, RGLB (Library), RGPL (Platform), RGPU (PlatformUnit),
    RGSM (RGSampleName), RGCN (SeqCentre), RGDS (Description) and
    RGDT (RunDate). PicardParams is appended verbatim. stdout/stderr are
    appended to SysOutFile, followed by the command itself.
    """
    for target in (BAMFileOut, SysOutFile):
        uF.makedir(dirname(target))

    template = "%s %s -jar %s AddOrReplaceReadGroups INPUT=%s OUTPUT=%s SORT_ORDER=%s RGID=%s RGLB=%s RGPL=%s RGPU=%s RGSM=%s RGCN=%s RGDS=%s RGDT=%s %s >> %s 2>&1"
    fields = (JAVA_EXE, JAVA_PARAMS, PICARD_EXE, BAMFileIn, BAMFileOut, SORT_ORDER, RGID, Library, Platform,
              PlatformUnit, RGSampleName, SeqCentre, Description, RunDate, PicardParams, SysOutFile)
    Command = template % fields
    print(Command)
    system(Command)

    # Record the exact command at the end of the log, if the log was created.
    if not exists(SysOutFile):
        return Command
    with open(SysOutFile, 'a') as logHandle:
        logHandle.write('\n\n%s\n\n' % Command)
    return Command
def CutAdapt(CUTADAPT_BIN, OutPrefix, FastQFile1, SysOutFile, FastQFile2='', ZipOutput=False, CutAdaptOptions=''):
    """Build and run a cutadapt command, returning the command string.

    CUTADAPT_BIN    -- prefix placed directly in front of 'cutadapt'.
    OutPrefix       -- output prefix (surrounding whitespace is stripped).
    FastQFile1/2    -- input FASTQ paths; only paths that exist on disk are
                       put on the command line.
    SysOutFile      -- file that receives stdout/stderr of the run (append).
    ZipOutput       -- when true, output names get a '.gz' suffix.
    CutAdaptOptions -- extra options; ' --paired-output=<prefix>_2.fastq[.gz]'
                       is appended when FastQFile2 exists and the option is
                       not already present.

    Output naming: '<prefix>_1.fastq[.gz]' when FastQFile2 exists,
    '<prefix>.fastq[.gz]' otherwise.
    """
    uF.makedir(dirname(OutPrefix))
    uF.makedir(dirname(SysOutFile))

    prefix = OutPrefix.strip()
    suffix = '.fastq.gz' if ZipOutput else '.fastq'

    if exists(FastQFile2):
        FastQOutFile1 = prefix + '_1' + suffix
        if '--paired-output' not in CutAdaptOptions:
            CutAdaptOptions += ' --paired-output=%s' % (prefix + '_2' + suffix)
    else:
        FastQOutFile1 = prefix + suffix

    inputFiles = ' '.join([x for x in [FastQFile1, FastQFile2] if exists(x)])
    Command = '%scutadapt --output=%s %s %s >> %s 2>&1' % (
        CUTADAPT_BIN, FastQOutFile1, CutAdaptOptions, inputFiles, SysOutFile)
    print(Command)
    system(Command)

    # Record the exact command at the end of the log, if the log was created.
    if exists(SysOutFile):
        with open(SysOutFile, 'a') as f:
            f.write('\n\n%s\n\n' % (Command))
    return Command
def flagstat(SAMTOOLS_EXE, BAMFile, FlagStatFile):
    """Run 'samtools flagstat' on BAMFile, redirecting stdout to FlagStatFile.

    Returns the command string that was executed.
    """
    uF.makedir(dirname(FlagStatFile))
    template = "%s flagstat %s > %s"
    Command = template % (SAMTOOLS_EXE, BAMFile, FlagStatFile)
    print(Command)
    system(Command)
    return Command
def rsemPlotModel(RSEM_BIN, RSEMFileOutPrefix, PDFPlotFile):
    """Run rsem-plot-model for RSEMFileOutPrefix, writing to PDFPlotFile.

    RSEM_BIN is placed directly in front of the program name. Returns the
    command string that was executed.
    """
    uF.makedir(dirname(PDFPlotFile))
    template = """%srsem-plot-model %s %s"""
    Command = template % (RSEM_BIN, RSEMFileOutPrefix, PDFPlotFile)
    print(Command)
    system(Command)
    return Command
def rsemExtractTranscriptsFromGTF(RSEM_BIN, GTFFile, GenomeFasta, OutPrefix):
    """Run rsem-extract-reference-transcripts and return the command string.

    Positional arguments are emitted in the order: OutPrefix, 0, GTFFile,
    None, 0, GenomeFasta. RSEM_BIN is placed directly in front of the
    program name.
    """
    uF.makedir(dirname(OutPrefix))
    template = """%srsem-extract-reference-transcripts %s 0 %s None 0 %s"""
    fields = (RSEM_BIN, OutPrefix, GTFFile, GenomeFasta)
    Command = template % fields
    print(Command)
    system(Command)
    return Command
def SortBAMByCoordinate(SAMTOOLS_EXE, BAMFileIn, BAMFileOut, NumThreads=4):
    """Run 'samtools sort' on BAMFileIn and return the command string.

    NumThreads is capped at 6. The output argument given to samtools is
    BAMFileOut with its last four characters removed.
    """
    # Never request more than six sorting threads.
    NumThreads = 6 if NumThreads > 6 else NumThreads
    uF.makedir(dirname(BAMFileOut))

    # NOTE(review): dropping four characters presumably strips '.bam' so the
    # value acts as an output prefix -- confirm against the samtools version in use.
    outputArg = BAMFileOut[:-4]
    Command = """%s sort -@ %s %s %s""" % (SAMTOOLS_EXE, NumThreads, BAMFileIn, outputArg)
    print(Command)
    system(Command)
    return Command
def StarGenomeIndex(STAR_BIN, GenomeFasta, OutDir, NumThreads=4, StarGenomeIndexParams=''):
    """Run STAR in genomeGenerate mode and return the command string.

    STAR_BIN is placed directly in front of 'STAR'; OutDir is passed as
    --genomeDir and StarGenomeIndexParams is appended verbatim.
    """
    uF.makedir(OutDir)
    template = """%sSTAR --runMode genomeGenerate --runThreadN %s --genomeFastaFiles %s --genomeDir %s %s"""
    fields = (STAR_BIN, NumThreads, GenomeFasta, OutDir, StarGenomeIndexParams)
    Command = template % fields
    print(Command)
    system(Command)
    return Command
def CreateSequenceDictionary(JAVA_EXE, JAVA_PARAMS, PICARD_EXE, GenomeFasta, DictFile, PicardParams=''):
    """Run Picard CreateSequenceDictionary and return the command string.

    GenomeFasta is passed as REFERENCE, DictFile as OUTPUT; PicardParams is
    appended verbatim.
    """
    uF.makedir(dirname(DictFile))
    template = "%s %s -jar %s CreateSequenceDictionary REFERENCE=%s OUTPUT=%s %s"
    fields = (JAVA_EXE, JAVA_PARAMS, PICARD_EXE, GenomeFasta, DictFile, PicardParams)
    Command = template % fields
    print(Command)
    system(Command)
    return Command
def FastQC(FASTQC_EXE, JAVA_EXE, InputFile, OutDir, FileFormat='fastq', NumThreads=6):
    """Run FastQC on InputFile and return the command string.

    The command uses -q and --extract, with OutDir as --outdir, FileFormat
    as -f, NumThreads as -t and JAVA_EXE as --java.
    """
    uF.makedir(OutDir)
    template = "%s -q --extract --outdir %s -f %s -t %s --java %s %s"
    fields = (FASTQC_EXE, OutDir, FileFormat, NumThreads, JAVA_EXE, InputFile)
    Command = template % fields
    print(Command)
    system(Command)
    return Command
def rsemPrepareReference(RSEM_BIN, GenomeFasta, GTFFile, RSEMTranscriptIndex, NumThreads=4, RSEMPrepRefParams=''):
    """Run rsem-prepare-reference and return the command string.

    RSEM_BIN is placed directly in front of the program name;
    RSEMPrepRefParams is inserted right after it, followed by --gtf,
    --num-threads, the genome FASTA and the reference name.
    """
    uF.makedir(dirname(RSEMTranscriptIndex))
    # The template keeps its trailing space so the returned string is unchanged.
    template = """%srsem-prepare-reference %s --gtf %s --num-threads %s %s %s """
    fields = (RSEM_BIN, RSEMPrepRefParams, GTFFile, NumThreads, GenomeFasta, RSEMTranscriptIndex)
    Command = template % fields
    print(Command)
    system(Command)
    return Command
def FastQScreen(FASTQ_SCREEN_EXE, InputFile, OutDir, ConfigFile, SysOutFile, Subset=200000, NumThreads=6):
    """Run FastQ Screen (bowtie2 aligner) on InputFile and return the command string.

    stdout/stderr are written to SysOutFile with '>', i.e. the file is
    overwritten; the command itself is not appended to the log here.
    """
    for folder in (OutDir, dirname(SysOutFile)):
        uF.makedir(folder)
    template = "%s --outdir %s --subset %s --conf %s --threads %s --aligner bowtie2 %s > %s 2>&1"
    fields = (FASTQ_SCREEN_EXE, OutDir, Subset, ConfigFile, NumThreads, InputFile, SysOutFile)
    Command = template % fields
    print(Command)
    system(Command)
    return Command
def CollectMultipleMetrics(JAVA_EXE, JAVA_PARAMS, PICARD_EXE, BAMFile, GenomeFasta, MetricsFile, SysOutFile,
                           PicardParams=''):
    """Run Picard CollectMultipleMetrics and return the command string.

    BAMFile is passed as INPUT, MetricsFile as OUTPUT and GenomeFasta as
    REFERENCE_SEQUENCE; PicardParams is appended verbatim. stdout/stderr are
    appended to SysOutFile, followed by the command itself.
    """
    for target in (MetricsFile, SysOutFile):
        uF.makedir(dirname(target))

    template = "%s %s -jar %s CollectMultipleMetrics INPUT=%s OUTPUT=%s REFERENCE_SEQUENCE=%s %s >> %s 2>&1"
    fields = (JAVA_EXE, JAVA_PARAMS, PICARD_EXE, BAMFile, MetricsFile, GenomeFasta, PicardParams, SysOutFile)
    Command = template % fields
    print(Command)
    system(Command)

    # Record the exact command at the end of the log, if the log was created.
    if not exists(SysOutFile):
        return Command
    with open(SysOutFile, 'a') as logHandle:
        logHandle.write('\n\n%s\n\n' % Command)
    return Command
def MarkDuplicates(JAVA_EXE, JAVA_PARAMS, PICARD_EXE, BAMFile, DeDupBAMFile, MetricsFile, SysOutFile, rmDups=False,
                   PicardParams=''):
    """Run Picard MarkDuplicates and return the command string.

    BAMFile is passed as INPUT, DeDupBAMFile as OUTPUT and MetricsFile as
    METRICS_FILE; PicardParams is appended verbatim. rmDups selects
    REMOVE_DUPLICATES=true or =false. stdout/stderr are appended to
    SysOutFile, followed by the command itself.
    """
    for target in (DeDupBAMFile, MetricsFile, SysOutFile):
        uF.makedir(dirname(target))

    baseCommand = "%s %s -jar %s MarkDuplicates INPUT=%s OUTPUT=%s METRICS_FILE=%s %s" % (
        JAVA_EXE, JAVA_PARAMS, PICARD_EXE, BAMFile, DeDupBAMFile, MetricsFile, PicardParams)
    if rmDups:
        tail = " REMOVE_DUPLICATES=true >> %s 2>&1"
    else:
        tail = " REMOVE_DUPLICATES=false >> %s 2>&1"
    Command = baseCommand + tail % (SysOutFile)
    print(Command)
    system(Command)

    # Record the exact command at the end of the log, if the log was created.
    if not exists(SysOutFile):
        return Command
    with open(SysOutFile, 'a') as logHandle:
        logHandle.write('\n\n%s\n\n' % Command)
    return Command
def RNASeqC(JAVA17_EXE, JAVA_PARAMS, RNASEQC_EXE, SamplesFile, OutDir, GenomeFasta, GTFFile, SysOutFile,
            RNASeqCParams=''):
    """Run RNA-SeQC through the given Java executable and return the command string.

    SamplesFile is passed as -s, OutDir as -o, GenomeFasta as -r and GTFFile
    as -t; the -gatkFlags value is fixed in the template and RNASeqCParams is
    appended verbatim. stdout/stderr overwrite SysOutFile ('>'), after which
    the command itself is appended to that file.
    """
    for folder in (OutDir, dirname(SysOutFile)):
        uF.makedir(folder)

    # The template keeps its trailing space so the returned string is unchanged.
    template = """%s %s -jar %s -s %s -o %s -r %s -t %s -gatkFlags "-S SILENT -U ALLOW_SEQ_DICT_INCOMPATIBILITY" %s > %s 2>&1 """
    fields = (JAVA17_EXE, JAVA_PARAMS, RNASEQC_EXE, SamplesFile, OutDir, GenomeFasta, GTFFile, RNASeqCParams,
              SysOutFile)
    Command = template % fields
    print(Command)
    system(Command)

    # Record the exact command at the end of the log, if the log was created.
    if not exists(SysOutFile):
        return Command
    with open(SysOutFile, 'a') as logHandle:
        logHandle.write('\n\n%s\n\n' % Command)
    return Command
def CollectInsertSizeMetrics(JAVA_EXE, JAVA_PARAMS, PICARD_EXE, BAMFileIn, MetricsFileOut, HistogramFile, SysOutFile,
                             PicardParams=''):
    """Run Picard CollectInsertSizeMetrics and return the command string.

    BAMFileIn is passed as INPUT, MetricsFileOut as OUTPUT and HistogramFile
    as HISTOGRAM_FILE; PicardParams is appended verbatim. stdout/stderr are
    appended to SysOutFile, followed by the command itself.
    """
    for target in (MetricsFileOut, HistogramFile, SysOutFile):
        uF.makedir(dirname(target))

    template = "%s %s -jar %s CollectInsertSizeMetrics INPUT=%s OUTPUT=%s HISTOGRAM_FILE=%s %s >> %s 2>&1"
    fields = (JAVA_EXE, JAVA_PARAMS, PICARD_EXE, BAMFileIn, MetricsFileOut, HistogramFile, PicardParams,
              SysOutFile)
    Command = template % fields
    print(Command)
    system(Command)

    # Record the exact command at the end of the log, if the log was created.
    if not exists(SysOutFile):
        return Command
    with open(SysOutFile, 'a') as logHandle:
        logHandle.write('\n\n%s\n\n' % Command)
    return Command
############################################ ############################################ if args.SINGLE_END: if RNASEQC_PARAMS.find('-singleEnd') == -1: RNASEQC_PARAMS += ' -singleEnd ' SAMPLES_FILE = join(args.OUTDIR, 'samples.txt') REPORT_HTML_FILE = join(args.OUTDIR, 'report.html') RNASeqC_SysOutFile = join(args.OUTDIR, 'rnaseqc.sysout') CommandList = [] if not exists(SAMPLES_FILE) and not exists(REPORT_HTML_FILE): ## CREATE SAMPLES FILE uF.makedir(args.OUTDIR) fout = open(SAMPLES_FILE, 'w') fout.write('\t'.join(['Sample ID', 'Bam File', 'Notes']) + '\n') for BAMFile in [x.strip() for x in args.BAM_FILES.split(',')]: if exists(BAMFile): fout.write('\t'.join([basename(BAMFile)[:-4], BAMFile, 'NONE']) + '\n') fout.close() if exists(args.RIBOSOMAL_LIST_FILE) and RNASEQC_PARAMS.find('-rRNA') == -1: RNASEQC_PARAMS += ' -rRNA %s' % (args.RIBOSOMAL_LIST_FILE) print '%s: %s' % (strftime("%d-%m-%Y %H:%M:%S", gmtime()), 'Running RNASeqC.') Command = tW.RNASeqC(JAVA17_EXE=JAVA_17_EXE, JAVA_PARAMS=JAVA_PARAMS, RNASEQC_EXE=RNASEQC_EXE,
## Disabled step, kept for reference: validate the MarkDuplicates BAM against
## the FASTQ read count and touch a '.pass' / '.fail' marker file.
## if NUM_READS_IN_FASTQ != -1:
##     print '%s: %s' % (strftime("%d-%m-%Y %H:%M:%S", gmtime()),'Counting number of reads in FASTQ file.')
##     NUM_READS_IN_FASTQ = uF.numLinesInFile(args.FASTQ_FILE1)
##
## print '%s: %s' % (strftime("%d-%m-%Y %H:%M:%S", gmtime()),'Verifying reads in MarkDuplicates BAM with flagstat output.')
## isValidBAM = uF.flagStatSTARGenomeBAMValidate(FlagStatFile=GENOME_MARKDUP_SORTED_BAM_FLAGSTAT_FILE,NumReadsInFastQ=NUM_READS_IN_FASTQ,isPairedEnd=isPairedEnd)
## Command = 'touch %s' % (join('%s.fail' % (GENOME_MARKDUP_SORTED_BAM_FLAGSTAT_FILE)))
## if isValidBAM:
##     Command = 'touch %s' % (join('%s.pass' % (GENOME_MARKDUP_SORTED_BAM_FLAGSTAT_FILE)))
## system(Command)
## CommandList.append('%s\n%s' % (strftime("%d-%m-%Y %H:%M:%S", gmtime()),Command))

############################################
############################################
## WRITE COMMAND FILE                     ##
############################################
############################################

# Write every command that was run to '<OUTDIR>/complete/<prefix>.runStar.complete'.
# Nothing is written when no command was recorded.
if CommandList:
    CompleteFile = join(OUTDIR, 'complete/', '%s.runStar.complete' % (basename(args.OUTPREFIX)))
    uF.makedir(dirname(CompleteFile))
    # 'with' closes the handle even if the write raises.
    with open(CompleteFile, 'w') as fout:
        fout.write('\n' + '\n\n'.join(CommandList) + '\n')

##############################################
##############################################
##############################################
##############################################
############################################
## FILTER GTF FILE                        ##
############################################

# Copy args.GTF_IN to args.GTF_OUT, keeping lines that start with '#' and
# records whose first tab-separated column is a sequence ID listed in the
# first column of args.FAI_IN. All other lines are dropped.
#
# NOTE(review): this section reached review with its indentation collapsed.
# The GTF step is nested under the FAI check because it depends on ChromIDs,
# which only exists once the FAI file has been read -- confirm against the
# original layout.
if exists(args.FAI_IN):
    # ChromIDs stays a list (as before) in case later code relies on it.
    with open(args.FAI_IN, 'r') as fin:
        ChromIDs = [line.strip().split('\t')[0].strip() for line in fin]

    if exists(args.GTF_IN):
        # Set copy gives constant-time membership tests per GTF record.
        ChromIDSet = set(ChromIDs)
        uF.makedir(dirname(args.GTF_OUT))
        with open(args.GTF_IN, 'r') as fin:
            with open(args.GTF_OUT, 'w') as fout:
                for line in fin:
                    if line[0] == '#':
                        # Header/comment lines are passed through untouched.
                        fout.write(line)
                    elif line.strip().split('\t')[0] in ChromIDSet:
                        fout.write(line)
############################################ ## RSEM STAR ## ############################################ ############################################ RSEMStarIndexDir = join(args.OUTDIR, 'rsem_star/readLen%s/' % (args.STAR_SJDBOVERHANG + 1)) RSEMStarExonGTF = join(RSEMStarIndexDir, '%s.exon.gtf' % (args.PREFIX)) RSEMStarTranscriptIndex = join(RSEMStarIndexDir, '%s' % (args.PREFIX)) RSEMStarTranscriptsFasta = join('%s.transcripts.fa' % (RSEMStarTranscriptIndex)) RSEMStarDictFile = join('%s.transcripts.dict' % (RSEMStarTranscriptIndex)) if not exists(join(RSEMStarIndexDir, 'sjdbInfo.txt')): uF.makedir(RSEMStarIndexDir) chdir(RSEMStarIndexDir) print '%s: %s' % (strftime("%d-%m-%Y %H:%M:%S", gmtime()), 'Filtering GTF file for exon attributes.') Command = """awk '$3 == "exon"' %s > %s""" % (args.GTF_FILE, RSEMStarExonGTF) system(Command) CommandList.append('%s\n%s' % (strftime("%d-%m-%Y %H:%M:%S", gmtime()), Command)) print '%s: %s' % (strftime("%d-%m-%Y %H:%M:%S", gmtime()), 'Creating RSEM Star Index.') ## Command = tW.rsemPrepareReference(RSEM_BIN=RSEM_BIN,GenomeFasta=args.GENOME_FASTA_FILE,GTFFile=RSEMStarExonGTF,RSEMTranscriptIndex=RSEMStarTranscriptIndex,NumThreads=args.NUM_THREADS,RSEMPrepRefParams='--star --star-path %s --star-sjdboverhang %s --polyA --polyA-length 125' % (STAR_BIN,args.STAR_SJDBOVERHANG)) Command = tW.rsemPrepareReference( RSEM_BIN=RSEM_BIN,
## DOWNLOAD & PREPARE FASTA ## ############################################ ############################################ CommandList = [] SPECIES_NAME = args.SPECIES_NAME.lower() SPECIES_NAME = SPECIES_NAME[0].upper() + SPECIES_NAME[1:] FASTA_DIR = join(args.OUTDIR, args.NCBI_BUILD, 'release-%s/fa/' % (args.ENSEMBL_RELEASE)) FASTA_FILE = join(FASTA_DIR, '%s.fa' % (args.PREFIX)) if not exists(FASTA_FILE): uF.makedir(FASTA_DIR) chdir(FASTA_DIR) ENSEMBL_FASTA_FTP = 'ftp://ftp.ensembl.org/pub/release-%s/fasta/%s/dna' % ( args.ENSEMBL_RELEASE, SPECIES_NAME.lower()) ENSEMBL_FASTA = '%s.%s.dna.toplevel.fa' % (SPECIES_NAME, args.NCBI_BUILD) if args.NCBI_BUILD in ['GRCh38', 'GRCm38']: ENSEMBL_FASTA = '%s.%s.dna.primary_assembly.fa' % (SPECIES_NAME, args.NCBI_BUILD) elif args.NCBI_BUILD in ['GRCh37']: ENSEMBL_FASTA = '%s.%s.%s.dna.primary_assembly.fa' % ( SPECIES_NAME, args.NCBI_BUILD, args.ENSEMBL_RELEASE) elif args.NCBI_BUILD in ['NCBIM37']: ENSEMBL_FASTA = '%s.%s.%s.dna.toplevel.fa' % ( SPECIES_NAME, args.NCBI_BUILD, args.ENSEMBL_RELEASE) if args.FASTA_FTP_DOWNLOAD_LINK != 'NA':
'sysout/', '%s.fastqscreen.sysout' % (args.SAMPLE_PREFIX)) FastQScreenOutPrefix = splitext(basename(args.FASTQ_FILE))[0] if args.FASTQ_FILE[-3:] == '.gz': FastQScreenOutPrefix = splitext(splitext(basename(args.FASTQ_FILE))[0])[0] FastQScreenPngFile = join(args.OUTDIR, 'fastq_screen/', args.SAMPLE_PREFIX, '%s_screen.png' % (FastQScreenOutPrefix)) FastQScreenTxtFile = join(args.OUTDIR, 'fastq_screen/', args.SAMPLE_PREFIX, '%s_screen.txt' % (FastQScreenOutPrefix)) if exists(args.FASTQ_FILE): CommandList = [] if not args.SKIP_FASTQC: if not exists(FASTQC_HTML_FILE): uF.makedir(FASTQC_DIR) print '%s: %s' % (strftime("%d-%m-%Y %H:%M:%S", gmtime()), 'Running FastQC.') Command = tW.FastQC(FASTQC_EXE=FASTQC_EXE, JAVA_EXE=JAVA_18_EXE, InputFile=args.FASTQ_FILE, OutDir=FASTQC_DIR, FileFormat='fastq', NumThreads=args.NUM_THREADS) CommandList.append( '%s\n%s' % (strftime("%d-%m-%Y %H:%M:%S", gmtime()), Command)) if exists(FASTQC_ZIP_FILE): print '%s: %s' % (strftime("%d-%m-%Y %H:%M:%S", gmtime()), 'Deleting FastQC ZIP file.') Command = 'rm %s' % (FASTQC_ZIP_FILE)
## GENERATE COMMANDS FOR PIPELINE ## #################################################################################### #################################################################################### if not exists(CompleteFile): sampleCommandDict[sampleID] = dict([(x,[]) for x in commandGroupList]) ############################################ ############################################ ## CREATE SOFT-LINK TO RAW FASTQ FILE(S) ## ############################################ ############################################ if not exists(sampleFileDict[sampleID]['RAW_FASTQ_FILE1']): uF.makedir(RawFastQDir) Command = 'ln -s %s %s' % (sampleDesignDict[sampleID]['fastq_file1'],sampleFileDict[sampleID]['RAW_FASTQ_FILE1']) sampleCommandDict[sampleID]['PREP'] += [Command] if sampleDesignDict[sampleID]['isPaired'] and not exists(sampleFileDict[sampleID]['RAW_FASTQ_FILE2']): Command = 'ln -s %s %s' % (sampleDesignDict[sampleID]['fastq_file2'],sampleFileDict[sampleID]['RAW_FASTQ_FILE2']) sampleCommandDict[sampleID]['PREP'] += [Command] ############################################ ############################################ ## GENERATE SAMPLED FASTQ FILE(S) ## ############################################ ############################################ SampledFastQCommand = '%sbin/python %srandomSampleFastQ.py %s %s --sample_size %s' % (PYTHON_DIR,SCRIPT_DIR,sampleFileDict[sampleID]['RAW_FASTQ_FILE1'],SampledPrefix,args.SAMPLE_SIZE) if sampleDesignDict[sampleID]['isPaired']: SampledFastQCommand += ' --fastq_file2 %s' % (sampleFileDict[sampleID]['RAW_FASTQ_FILE2'])