def assemble_reads(sampleList):
    """Quality-trim the paired-end reads of every sample with Trimmomatic.

    For each sample the R1/R2 ``.fastq`` files are looked up in
    ``<seqDir>/Sample_<sample>/raw_illumina_reads`` and Trimmomatic is run in
    PE mode (FR orientation assumed) through ``run_subprocess``.  The four
    paired/unpaired outputs are written to ``analysisDir``.  A sample whose
    ``<sample>_left_paired.fq`` already exists is not trimmed again.

    Args:
        sampleList: iterable of sample names.

    Raises:
        Exception: if a sample still needs trimming but its R1 or R2 file
            cannot be found.
    """

    def get_fq_files(sample):
        """Return ``(R1 path, R2 path)`` for *sample*; an entry is None if absent."""
        sampleDir = os.path.join(seqDir, "Sample_%s" % sample,
                                 "raw_illumina_reads")
        # Escape the sample name and the dot so both are matched literally.
        pattern = re.compile(r"^%s.*\.fastq$" % re.escape(sample))
        leftFile, rightFile = None, None
        for fileName in os.listdir(sampleDir):
            if not pattern.search(fileName):
                continue
            if "R1" in fileName:
                leftFile = os.path.realpath(os.path.join(sampleDir, fileName))
            elif "R2" in fileName:
                rightFile = os.path.realpath(os.path.join(sampleDir, fileName))
        return leftFile, rightFile

    for sample in sampleList:
        print("\nprocessing sample: %s" % sample)
        left_fq, right_fq = get_fq_files(sample)

        ## run trimmomatic (assuming FR)
        out1 = os.path.join(analysisDir, "%s_left_paired.fq" % sample)
        out2 = os.path.join(analysisDir, "%s_left_unpaired.fq" % sample)
        out3 = os.path.join(analysisDir, "%s_right_paired.fq" % sample)
        out4 = os.path.join(analysisDir, "%s_right_unpaired.fq" % sample)

        # The first output doubles as the "already done" marker.
        if os.path.exists(out1):
            continue

        # Fail early instead of handing the literal string "None" to the shell.
        if left_fq is None or right_fq is None:
            raise Exception(
                "cannot find R1/R2 fastq files for sample %s" % sample)

        cmd = (
            "java -jar /usr/share/java/trimmomatic-0.32.jar PE -threads 29 -phred33 "
            "%s %s %s %s %s %s "
            "ILLUMINACLIP:/usr/src/trinityrnaseq_r20140717/trinity-plugins/Trimmomatic-0.32/adapters/TruSeq3-PE.fa:2:30:10 "
            "LEADING:5 TRAILING:5 SLIDINGWINDOW:4:15 MINLEN:36"
        ) % (left_fq, right_fq, out1, out2, out3, out4)
        run_subprocess(cmd)

    print("complete")
def get_count_matrix(sample, overwrite=False):
    """Run htseq-count on the sorted SAM alignment of *sample*.

    Reads ``<starDir>/<sample>_aligned_sorted.sam`` together with the
    ``Xentr7_2_Stable.gtf`` annotation in ``seqDir`` and redirects the
    htseq-count output to ``<analysisDir>/counts_<sample lower-cased>.txt``.

    Args:
        sample: sample name.
        overwrite: when true an existing counts file is deleted and
            regenerated; otherwise an existing counts file makes the
            function return without doing anything.

    Raises:
        Exception: if the SAM file does not exist.
    """
    print("getting counts for sample %s" % sample)
    annotation = os.path.realpath(os.path.join(seqDir, "Xentr7_2_Stable.gtf"))
    alignment = os.path.join(starDir, "%s_aligned_sorted.sam" % sample)
    countsOut = os.path.join(analysisDir, 'counts_%s.txt' % sample.lower())

    if not os.path.exists(alignment):
        raise Exception("cannot find sam file %s" % (alignment))

    if os.path.exists(countsOut):
        if not overwrite:
            print("...'%s' exists use 'overwrite' to remove" % countsOut)
            return
        print("...removing old count file '%s'" % countsOut)
        os.remove(countsOut)

    ## save the count matrix
    print('...running htseq-count')
    run_subprocess("/usr/bin/htseq-count %s %s > %s" %
                   (alignment, annotation, countsOut))
def get_count_matrix(sample, clean=False):
    """Count reads per feature for *sample* from its TopHat alignment.

    Steps, each skipped when its output file already exists: name-sort
    ``accepted_hits.bam`` with samtools, convert the sorted BAM to SAM, then
    always run htseq-count against the sample's cufflinks ``transcripts.gtf``.
    Counts go to ``<analysisDir>/counts_<sample lower-cased>.txt`` and
    htseq-count's stderr to ``warnings_<sample lower-cased>.txt``.

    Args:
        sample: sample name; inputs live under ``<seqDir>/Sample_<sample>``.
        clean: when equal to True, the sorted BAM, the SAM and the counts
            file are removed first so they are rebuilt.

    Raises:
        Exception: if the BAM file or the GTF file does not exist.
    """
    print("getting counts for sample %s" % sample)

    sampleRoot = os.path.join(seqDir, "Sample_%s" % sample)
    tophatDir = os.path.join(sampleRoot, "tophat_remapping")
    bamPath = os.path.join(tophatDir, "accepted_hits.bam")
    sortedBamPath = os.path.join(tophatDir, "accepted_hits_sorted.bam")
    samPath = os.path.join(tophatDir, "accepted_hits.sam")
    gtfPath = os.path.join(sampleRoot, "cufflinks_output", "transcripts.gtf")
    lowered = sample.lower()
    countsPath = os.path.join(analysisDir, 'counts_%s.txt' % lowered)
    warningsPath = os.path.join(analysisDir, 'warnings_%s.txt' % lowered)

    ## clean (equality test kept: only values comparing equal to True qualify)
    if clean == True:
        for stale in (sortedBamPath, samPath, countsPath):
            if os.path.exists(stale):
                os.remove(stale)

    ## error check
    for required in (bamPath, gtfPath):
        if not os.path.exists(required):
            raise Exception("Invalid file path\n...%s" % required)

    ## sort bam file (samtools is given the output name without ".bam")
    if not os.path.exists(sortedBamPath):
        print('creating sorted bam file...')
        run_subprocess("/usr/bin/samtools sort -n %s %s" %
                       (bamPath, sortedBamPath[:-4]))

    ## convert to sam file
    if not os.path.exists(samPath):
        print('converting bam to sam file...')
        run_subprocess("/usr/bin/samtools view -h -o %s %s" %
                       (samPath, sortedBamPath))

    ## save the count matrix
    print('running htseq-count')
    htseqArgs = (samPath, gtfPath, countsPath, warningsPath)
    run_subprocess(
        "/usr/bin/htseq-count -m intersection-nonempty -s yes %s %s > %s 2> %s"
        % htseqArgs)
## specify the locations
homeDir = os.path.join(os.path.expanduser("~"), "sequencing", "xenopus")
readsDir = os.path.join(homeDir, 'reads')

## more variables
email = "*****@*****.**"
trinityDir = "/usr/src/trinityrnaseq_r20140717/"

if __name__ == "__main__":
    ## for each assembly method and feature level, run both DE tools
    ## on the corresponding Trinity count matrix
    for method in ('dn', 'gg'):
        featuresDir = os.path.join(homeDir, "%s-trinity" % method, "features")

        for source in ('genes', 'isoforms'):
            countMatrixPath = os.path.join(
                featuresDir, "Trinity_%s.counts.matrix" % source)

            ## (command template, output file name): DESeq first, then edgeR
            _analyses = (
                ("Rscript runDESeq.R %s %s",
                 "deseq_%s_%s_de.csv" % (source, 'behavior')),
                ("Rscript runEdgeR.R %s %s",
                 "edger_%s_behavior_de.csv" % source),
            )

            for _template, _outName in _analyses:
                outputPath = os.path.join(featuresDir, _outName)
                cmd = _template % (countMatrixPath, outputPath)
                print("running...\n%s" % cmd)
                run_subprocess(cmd)
def assemble_reads(sampleList):
    """Estimate transcript abundance for each sample with Trinity's RSEM wrapper.

    The trimmed, paired ``.fq`` files of every sample are looked up in
    ``readsDir``.  The reference is prepared once with
    ``align_and_estimate_abundance.pl --prep_reference``; afterwards one
    abundance-estimation command is built per sample.  When the module-level
    ``cluster`` flag is set, the command is written to a submit script in
    ``clusterDir`` and handed to ``qsub``; otherwise it is executed directly
    through ``run_subprocess``.

    Args:
        sampleList: sequence of sample names.

    Raises:
        Exception: if the left or right paired file of any sample is missing.
    """

    def get_paired_files(sample):
        """Return ``(left, right)`` paired read paths; an entry is None if absent."""
        # Escape the sample name and the dot so both are matched literally.
        pattern = re.compile(r"^%s.*\.fq$" % re.escape(sample))
        leftFile, rightFile = None, None
        for fileName in os.listdir(readsDir):
            if "unpaired" in fileName or not pattern.search(fileName):
                continue
            if "left" in fileName:
                leftFile = os.path.realpath(os.path.join(readsDir, fileName))
            elif "right" in fileName:
                rightFile = os.path.realpath(os.path.join(readsDir, fileName))
        return leftFile, rightFile

    allLeft, allRight, missing = [], [], []
    for sample in sampleList:
        left, right = get_paired_files(sample)
        if left is None or right is None:
            missing.append(str(sample))
        allLeft.append(left)
        allRight.append(right)

    ## check
    if missing:
        raise Exception("Were the sequences prepped? Missing paired reads for: %s"
                        % ", ".join(missing))

    ## first prepare the reference
    cmd = (
        "export TRINITY_HOME=%s;\n"
        "$TRINITY_HOME/util/align_and_estimate_abundance.pl --transcripts %s "
        "--est_method RSEM --aln_method bowtie --trinity_mode --prep_reference --output_dir %s"
    ) % (trinityDir, transcriptsFilePath, featuresDir)

    print('preping reference...')
    run_subprocess(cmd)
    print('reference prepreation complete.')

    for s, sample in enumerate(sampleList):
        cmd = (
            "export TRINITY_HOME=%s;\n"
            "$TRINITY_HOME/util/align_and_estimate_abundance.pl --transcripts %s "
            "--seqType fq --left %s --right %s "
            "--est_method RSEM --aln_method bowtie --trinity_mode "
            "--output_dir %s --output_prefix %s"
        ) % (trinityDir, transcriptsFilePath, allLeft[s], allRight[s],
             featuresDir, sample)

        if cluster:
            submitFile = os.path.join(clusterDir, "%s-%s.sh" % (sample, s))
            submitLog = os.path.join(clusterDir, "%s-%s.log" % (sample, s))
            # Every scheduler directive needs the "#$" prefix; a "#S" line is
            # just a shell comment and the mail address would be ignored.
            script = (
                "#!/bin/bash\n"
                "#$ -S /bin/bash\n"
                "#$ -j yes\n"
                "#$ -M %s\n"
                "#$ -o %s\n"
                "export TRINITY_HOME=%s\n"
            ) % (email, submitLog, trinityDir) + cmd
            with open(submitFile, 'w') as f:
                f.write(script)

            print("submitting %s" % submitFile)
            os.system("qsub " + submitFile)
        else:
            print("running...\n%s" % cmd)
            # NOTE: actually execute the command the message above announces.
            run_subprocess(cmd)
def assemble_reads(sampleList):
    """Unzip, merge across two lanes, and Trimmomatic-trim each sample's reads.

    For every sample the R1/R2 ``.fastq.gz`` files of both lanes
    (``<seqDir>/pieris-lane-1/RawData`` and ``.../pieris-lane-2/RawData``) are
    decompressed into ``analysisDir``, the two lanes are concatenated into
    ``<sample>_left.fq`` / ``<sample>_right.fq``, Trimmomatic is run in PE mode
    (FR orientation assumed), and the decompressed per-lane files are removed.
    Each step is skipped when its output file already exists.

    Args:
        sampleList: iterable of sample names.

    Raises:
        Exception: if any of the four gz files of a sample cannot be found.
    """

    def get_gz_files(sample, laneDir):
        """Return ``(R1 path, R2 path)`` in *laneDir*; an entry is None if absent."""
        # Escape the sample name and the dots so they are matched literally.
        pattern = re.compile(r"^%s.*\.fastq\.gz$" % re.escape(sample))
        leftFile, rightFile = None, None
        for fileName in os.listdir(laneDir):
            if not pattern.search(fileName):
                continue
            if "R1" in fileName:
                leftFile = os.path.realpath(os.path.join(laneDir, fileName))
            elif "R2" in fileName:
                rightFile = os.path.realpath(os.path.join(laneDir, fileName))
        return leftFile, rightFile

    lane1Dir = os.path.join(seqDir, "pieris-lane-1", "RawData")
    lane2Dir = os.path.join(seqDir, "pieris-lane-2", "RawData")

    for sample in sampleList:
        print("\nprocessing sample: %s" % sample)
        left_gz1, right_gz1 = get_gz_files(sample, lane1Dir)
        left_gz2, right_gz2 = get_gz_files(sample, lane2Dir)

        # Order matters below: indices 0-1 are the left reads, 2-3 the right.
        gzFiles = [left_gz1, left_gz2, right_gz1, right_gz2]
        if None in gzFiles:
            raise Exception(
                "cannot find all R1/R2 fastq.gz files for sample %s" % sample)

        ## unzip files into analysis dir
        # Each target is derived from its own source (".gz" stripped), so the
        # list really holds four distinct files.
        faFileList = [
            os.path.join(analysisDir, os.path.basename(gzFile)[:-3])
            for gzFile in gzFiles
        ]
        for source, target in zip(gzFiles, faFileList):
            if not os.path.exists(target):
                unzip_file(source, target)

        ## concat files into analysis dir
        allLeftFile = os.path.join(analysisDir, "%s_left.fq" % sample)
        allRightFile = os.path.join(analysisDir, "%s_right.fq" % sample)
        catLeft = "cat %s %s > %s" % (faFileList[0], faFileList[1],
                                      allLeftFile)
        catRight = "cat %s %s > %s" % (faFileList[2], faFileList[3],
                                       allRightFile)
        print("concatenating lanes...")
        if not os.path.exists(allLeftFile):
            print(catLeft)
            run_subprocess(catLeft)
        if not os.path.exists(allRightFile):
            print(catRight)
            run_subprocess(catRight)

        ## run trimmomatic (assuming FR)
        out1 = os.path.join(analysisDir, "%s_left_paired.fq" % sample)
        out2 = os.path.join(analysisDir, "%s_left_unpaired.fq" % sample)
        out3 = os.path.join(analysisDir, "%s_right_paired.fq" % sample)
        out4 = os.path.join(analysisDir, "%s_right_unpaired.fq" % sample)
        cmd = (
            "java -jar /usr/share/java/trimmomatic-0.32.jar PE -threads 29 -phred33 "
            "%s %s %s %s %s %s "
            "ILLUMINACLIP:/usr/src/trinityrnaseq-2.0.4/trinity-plugins/Trimmomatic-0.32/adapters/TruSeq3-PE.fa:2:30:10 "
            "LEADING:5 TRAILING:5 SLIDINGWINDOW:4:15 MINLEN:36"
        ) % (allLeftFile, allRightFile, out1, out2, out3, out4)

        # The first output doubles as the "already trimmed" marker.
        if not os.path.exists(out1):
            run_subprocess(cmd)

        ## remove the decompressed per-lane files
        for fileName in faFileList:
            if os.path.exists(fileName):
                print("removing %s" % fileName)
                os.remove(fileName)

    print("complete")
def unzip_file(source, target):
    """Decompress the gzip file *source* into *target* using ``gunzip -c``.

    The shell command is printed and then executed through ``run_subprocess``.

    Args:
        source: path of the ``.gz`` file to read.
        target: path the decompressed content is redirected to.
    """
    print('unzipping...%s' % source)
    cmd = "gunzip -c %s > %s" % (source, target)
    # Function-call form prints the same text on Python 2 and also compiles
    # on Python 3.
    print(cmd)
    run_subprocess(cmd)
def run_deseq(countsPath, outFile):
    """Invoke the ``runDESeq.R`` script on a count matrix.

    Args:
        countsPath: path passed to the R script as its first argument.
        outFile: path passed to the R script as its second argument.
    """
    scriptArgs = (countsPath, outFile)
    rscriptCall = "Rscript runDESeq.R %s %s" % scriptArgs
    print("running...\n%s" % rscriptCall)
    run_subprocess(rscriptCall)
bamFile = os.path.join(readsDir, "%s_aligned.bam" % (sample)) sbamFile = os.path.join(readsDir, "%s_aligned_sorted.bam" % (sample)) ssamFile = os.path.join(readsDir, "%s_aligned_sorted.sam" % (sample)) if not os.path.exists(samFile): raise Exception("cannot find sam file %s" % (samFile)) if os.path.exists(ssamFile) and not overwrite: print("skipping sam to bam, align, bam to sam") else: cmd = "/usr/bin/samtools view -b -S %s > %s && " % (samFile, bamFile) cmd += "/usr/bin/samtools sort -n %s %s && " % (bamFile, sbamFile[:-4]) cmd += "/usr/bin/samtools view -h %s > %s" % (sbamFile, ssamFile) print cmd run_subprocess(cmd) ## concat sam files and sort by coordinates print("\n...make single sorted bam file..") outBam = os.path.join(homeDir, "star_all_reads.bam") outSbam = os.path.join(homeDir, "star_all_reads_sorted.bam") cmdMerge = "/usr/bin/samtools merge %s" % (outBam) cmdSort = "/usr/bin/samtools sort %s %s" % (outBam, outSbam[:-4]) for s, sample in enumerate(sampleList): sbamFile = os.path.join(readsDir, "%s_aligned_sorted.bam" % (sample)) cmdMerge += " %s" % (sbamFile) cmdMergeSort = cmdMerge + " && %s" % (cmdSort) if os.path.exists(outSbam) and not overwrite: print("skipping concat bam files and sort")