Ejemplo n.º 1
0
def assemble_reads(sampleList):
    """Trim the raw paired-end reads of every sample with Trimmomatic.

    For each sample the R1/R2 ``.fastq`` files are looked up under
    ``seqDir/Sample_<sample>/raw_illumina_reads`` and trimmed in paired-end
    mode; the four output files are written to ``analysisDir``.  A sample
    whose ``<sample>_left_paired.fq`` already exists is not trimmed again.

    Relies on the module-level names ``seqDir``, ``analysisDir`` and
    ``run_subprocess``.

    Args:
        sampleList: iterable of sample names.

    Raises:
        Exception: if a sample needs trimming but its R1 or R2 file
            cannot be found.
    """

    ## get the files
    def get_fq_files(sample):
        """Return (R1 path, R2 path) for *sample*; None where not found."""
        sampleDir = os.path.join(seqDir, "Sample_%s" % sample,
                                 "raw_illumina_reads")
        # escape the sample name and the dot so both are matched literally
        pattern = re.compile(r"^%s.*\.fastq$" % re.escape("%s" % sample))
        leftFile, rightFile = None, None
        for fileName in os.listdir(sampleDir):
            if not pattern.search(fileName):
                continue
            filePath = os.path.realpath(os.path.join(sampleDir, fileName))
            if "R1" in fileName:
                leftFile = filePath
            elif "R2" in fileName:
                rightFile = filePath
        return leftFile, rightFile

    for sample in sampleList:
        print("\nprocessing sample: %s" % sample)
        left_fq, right_fq = get_fq_files(sample)

        ## run trimmomatic (assuming FR)
        out1 = os.path.join(analysisDir, "%s_left_paired.fq" % sample)
        out2 = os.path.join(analysisDir, "%s_left_unpaired.fq" % sample)
        out3 = os.path.join(analysisDir, "%s_right_paired.fq" % sample)
        out4 = os.path.join(analysisDir, "%s_right_unpaired.fq" % sample)

        # the first output doubles as the "already trimmed" marker
        if os.path.exists(out1):
            continue

        # fail loudly instead of passing the text "None" to trimmomatic
        if left_fq is None or right_fq is None:
            raise Exception(
                "cannot find R1/R2 fastq files for sample %s" % sample)

        cmd = " ".join([
            "java -jar /usr/share/java/trimmomatic-0.32.jar PE -threads 29 -phred33",
            left_fq, right_fq,
            out1, out2,
            out3, out4,
            "ILLUMINACLIP:/usr/src/trinityrnaseq_r20140717/trinity-plugins/Trimmomatic-0.32/adapters/TruSeq3-PE.fa:2:30:10",
            "LEADING:5 TRAILING:5 SLIDINGWINDOW:4:15 MINLEN:36",
        ])
        run_subprocess(cmd)

    print("complete")
Ejemplo n.º 2
0
def get_count_matrix(sample, overwrite=False):
    """Run htseq-count on the sorted SAM file of *sample*.

    The counts are redirected to ``analysisDir/counts_<sample lowercased>.txt``.
    Relies on the module-level names ``seqDir``, ``starDir``, ``analysisDir``
    and ``run_subprocess``.

    Args:
        sample: sample name used to build the input and output file names.
        overwrite: when true an existing counts file is deleted and
            regenerated; otherwise an existing counts file makes the
            function return without running anything.

    Raises:
        Exception: if ``starDir/<sample>_aligned_sorted.sam`` does not exist.
    """
    print("getting counts for sample %s" % sample)

    annotation = os.path.realpath(os.path.join(seqDir, "Xentr7_2_Stable.gtf"))
    alignment = os.path.join(starDir, "%s_aligned_sorted.sam" % sample)
    countsOut = os.path.join(analysisDir, 'counts_%s.txt' % sample.lower())

    if not os.path.exists(alignment):
        raise Exception("cannot find sam file %s" % (alignment))

    ## deal with a counts file left by an earlier run
    if os.path.exists(countsOut):
        if not overwrite:
            print("...'%s' exists use 'overwrite' to remove" % countsOut)
            return
        print("...removing old count file '%s'" % countsOut)
        os.remove(countsOut)

    ## save the count matrix
    print('...running htseq-count')
    htseqArgs = (alignment, annotation, countsOut)
    run_subprocess("/usr/bin/htseq-count %s %s > %s" % htseqArgs)
def get_count_matrix(sample, clean=False):
    """Sort the tophat BAM of *sample*, convert it to SAM and run htseq-count.

    Inputs are read from ``seqDir/Sample_<sample>``; the counts and the
    htseq-count stderr output are written to ``analysisDir`` as
    ``counts_<sample lowercased>.txt`` and ``warnings_<sample lowercased>.txt``.
    The sorted BAM and the SAM file are only created when they do not exist
    yet.  Relies on the module-level names ``seqDir``, ``analysisDir`` and
    ``run_subprocess``.

    Args:
        sample: sample name used to build all paths.
        clean: when equal to ``True`` the sorted BAM, the SAM and the counts
            file are deleted first so they are regenerated.

    Raises:
        Exception: if ``accepted_hits.bam`` or ``transcripts.gtf`` is missing.
    """
    print("getting counts for sample %s" % sample)

    sampleDir = os.path.join(seqDir, "Sample_%s" % sample)
    tophatDir = os.path.join(sampleDir, "tophat_remapping")
    unsortedBam = os.path.join(tophatDir, "accepted_hits.bam")
    sortedBam = os.path.join(tophatDir, "accepted_hits_sorted.bam")
    sortedSam = os.path.join(tophatDir, "accepted_hits.sam")
    transcriptsGtf = os.path.join(sampleDir, "cufflinks_output",
                                  "transcripts.gtf")
    sampleTag = sample.lower()
    countsOut = os.path.join(analysisDir, 'counts_%s.txt' % sampleTag)
    warningsOut = os.path.join(analysisDir, 'warnings_%s.txt' % sampleTag)

    ## clean
    if clean == True:
        for stale in (sortedBam, sortedSam, countsOut):
            if os.path.exists(stale):
                os.remove(stale)

    ## error check
    for required in (unsortedBam, transcriptsGtf):
        if not os.path.exists(required):
            raise Exception("Invalid file path\n...%s" % required)

    ## sort bam file
    if not os.path.exists(sortedBam):
        print('creating sorted bam file...')
        # samtools is handed the output name without its ".bam" suffix
        sortedPrefix = sortedBam[:-4]
        run_subprocess("/usr/bin/samtools sort -n %s %s" % (unsortedBam,
                                                            sortedPrefix))

    ## convert to sam file
    if not os.path.exists(sortedSam):
        print('converting bam to sam file...')
        run_subprocess("/usr/bin/samtools view -h -o %s %s" % (sortedSam,
                                                               sortedBam))

    ## save the count matrix
    print('running htseq-count')
    htseqArgs = (sortedSam, transcriptsGtf, countsOut, warningsOut)
    run_subprocess(
        "/usr/bin/htseq-count -m intersection-nonempty -s yes %s %s > %s 2> %s"
        % htseqArgs)
Ejemplo n.º 4
0
## project locations (everything lives under ~/sequencing/xenopus)
homeDir = os.path.join(os.path.expanduser("~"), "sequencing", "xenopus")
readsDir = os.path.join(homeDir, 'reads')

## more variables
email = "*****@*****.**"
trinityDir = "/usr/src/trinityrnaseq_r20140717/"

if __name__ == "__main__":

    ## run both differential expression R scripts on every count matrix:
    ## one matrix per assembly method ('dn', 'gg') and feature level
    for method in ['dn', 'gg']:
        featuresDir = os.path.join(homeDir, "%s-trinity" % method, "features")
        for source in ['genes', 'isoforms']:
            countMatrixPath = os.path.join(featuresDir,
                                           "Trinity_%s.counts.matrix" % source)

            ## (command template, output file name), DESeq first then edgeR
            analyses = [
                ("Rscript runDESeq.R %s %s",
                 "deseq_%s_%s_de.csv" % (source, 'behavior')),
                ("Rscript runEdgeR.R %s %s",
                 "edger_%s_behavior_de.csv" % source),
            ]
            for cmdTemplate, outputName in analyses:
                outputPath = os.path.join(featuresDir, outputName)
                cmd = cmdTemplate % (countMatrixPath, outputPath)
                print("running...\n%s" % cmd)
                run_subprocess(cmd)
Ejemplo n.º 5
0
def assemble_reads(sampleList):
    """Estimate transcript abundance per sample with Trinity's RSEM wrapper.

    The trimmed ``left``/``right`` paired ``.fq`` files of every sample are
    looked up in ``readsDir`` (files whose name contains "unpaired" are
    ignored).  The reference is prepared once with
    ``align_and_estimate_abundance.pl --prep_reference`` and then one
    abundance command is built per sample.  When the module-level ``cluster``
    flag equals ``True`` each command is written to a submit script in
    ``clusterDir`` and handed to ``qsub``; otherwise it is run directly.

    Relies on the module-level names ``readsDir``, ``trinityDir``,
    ``transcriptsFilePath``, ``featuresDir``, ``cluster``, ``clusterDir``,
    ``email`` and ``run_subprocess``.

    Args:
        sampleList: sequence of sample names.

    Raises:
        Exception: if the left or right read file of any sample is missing.
    """

    ## get the files
    def get_paired_files(sample):
        """Return (left path, right path) for *sample*; None where not found."""
        # escape the sample name and the dot so both are matched literally
        pattern = re.compile(r"^%s.*\.fq$" % re.escape("%s" % sample))
        leftFile, rightFile = None, None
        for fileName in os.listdir(readsDir):
            if "unpaired" in fileName or not pattern.search(fileName):
                continue
            filePath = os.path.realpath(os.path.join(readsDir, fileName))
            if "left" in fileName:
                leftFile = filePath
            elif "right" in fileName:
                rightFile = filePath
        return leftFile, rightFile

    allLeft = []
    allRight = []
    for sample in sampleList:
        left, right = get_paired_files(sample)
        allLeft.append(left)
        allRight.append(right)

    ## check
    if None in allLeft or None in allRight:
        raise Exception("Were the sequences prepped?")

    ## first prepare the reference
    trinityEnv = "export TRINITY_HOME=%s;\n" % (trinityDir)
    abundanceTool = "$TRINITY_HOME/util/align_and_estimate_abundance.pl --transcripts %s " % (
        transcriptsFilePath)
    cmd = trinityEnv + abundanceTool + \
        "--est_method RSEM --aln_method bowtie --trinity_mode --prep_reference --output_dir %s" % (featuresDir)

    print('preping reference...')
    run_subprocess(cmd)
    print('reference prepreation complete.')

    for s, (sample, left, right) in enumerate(
            zip(sampleList, allLeft, allRight)):
        cmd = trinityEnv + abundanceTool + \
            "--seqType fq --left %s --right %s " % (left, right) + \
            "--est_method RSEM --aln_method bowtie --trinity_mode " + \
            "--output_dir %s --output_prefix %s" % (featuresDir, sample)

        if cluster == True:
            submitFile = os.path.join(clusterDir, "%s-%s.sh" % (sample, s))
            submitLog = os.path.join(clusterDir, "%s-%s.log" % (sample, s))

            # scheduler directives must start with "#$"; any other "#..."
            # line is just a shell comment
            scriptLines = [
                "#!/bin/bash\n",
                "#$ -S /bin/bash\n",
                "#$ -j yes\n",
                "#$ -M %s\n" % email,
                "#$ -o %s\n" % submitLog,
                "export TRINITY_HOME=%s\n" % (trinityDir),
                cmd,
            ]
            # the context manager closes the script before qsub reads it
            with open(submitFile, 'w') as f:
                f.write("".join(scriptLines))

            print("submitting %s" % submitFile)
            os.system("qsub " + submitFile)

        else:
            print("running...\n%s" % cmd)
            # actually execute the command announced above
            run_subprocess(cmd)
Ejemplo n.º 6
0
def assemble_reads(sampleList):
    """Merge the two sequencing lanes of every sample and trim the reads.

    For each sample the R1/R2 ``.fastq.gz`` files of both lanes
    (``seqDir/pieris-lane-{1,2}/RawData``) are unzipped into ``analysisDir``,
    concatenated per read direction into ``<sample>_left.fq`` and
    ``<sample>_right.fq``, and trimmed with Trimmomatic in paired-end mode.
    The unzipped per-lane files are removed afterwards.  Steps whose output
    file already exists are skipped.

    Relies on the module-level names ``seqDir``, ``analysisDir``,
    ``unzip_file`` and ``run_subprocess``.

    Args:
        sampleList: iterable of sample names.

    Raises:
        Exception: if any of the four lane files of a sample is missing.
    """

    ## get the files
    def get_gz_files(sample, laneDir):
        """Return (R1 path, R2 path) of *sample* in *laneDir*; None if absent."""
        # escape the sample name and the dots so they are matched literally
        pattern = re.compile(
            r"^%s.*\.fastq\.gz$" % re.escape("%s" % sample))
        leftFile, rightFile = None, None
        for fileName in os.listdir(laneDir):
            if not pattern.search(fileName):
                continue
            filePath = os.path.realpath(os.path.join(laneDir, fileName))
            if "R1" in fileName:
                leftFile = filePath
            elif "R2" in fileName:
                rightFile = filePath
        return leftFile, rightFile

    lane1Dir = os.path.join(seqDir, "pieris-lane-1", "RawData")
    lane2Dir = os.path.join(seqDir, "pieris-lane-2", "RawData")

    for sample in sampleList:
        print("\nprocessing sample: %s" % sample)
        left_gz1, right_gz1 = get_gz_files(sample, lane1Dir)
        left_gz2, right_gz2 = get_gz_files(sample, lane2Dir)

        # order matters: the two left files first, then the two right files
        gzFileList = [left_gz1, left_gz2, right_gz1, right_gz2]
        if None in gzFileList:
            raise Exception(
                "cannot find all lane files for sample %s" % sample)

        ## unzip files into analysis dir
        # each unzipped copy is named after its own source minus ".gz"
        faFileList = [
            os.path.join(analysisDir, os.path.basename(gzFile)[:-3])
            for gzFile in gzFileList
        ]
        for source, target in zip(gzFileList, faFileList):
            if not os.path.exists(target):
                unzip_file(source, target)

        ## concat files into analysis dir
        allLeftFile = os.path.join(analysisDir, "%s_left.fq" % sample)
        allRightFile = os.path.join(analysisDir, "%s_right.fq" % sample)
        catLeft = "cat %s %s > %s" % (faFileList[0], faFileList[1],
                                      allLeftFile)
        catRight = "cat %s %s > %s" % (faFileList[2], faFileList[3],
                                       allRightFile)

        print("concatenating lanes...")
        if not os.path.exists(allLeftFile):
            print(catLeft)
            run_subprocess(catLeft)
        if not os.path.exists(allRightFile):
            print(catRight)
            run_subprocess(catRight)

        ## run trimmomatic (assuming FR)
        out1 = os.path.join(analysisDir, "%s_left_paired.fq" % sample)
        out2 = os.path.join(analysisDir, "%s_left_unpaired.fq" % sample)
        out3 = os.path.join(analysisDir, "%s_right_paired.fq" % sample)
        out4 = os.path.join(analysisDir, "%s_right_unpaired.fq" % sample)

        cmd = " ".join([
            "java -jar /usr/share/java/trimmomatic-0.32.jar PE -threads 29 -phred33",
            allLeftFile, allRightFile,
            out1, out2,
            out3, out4,
            "ILLUMINACLIP:/usr/src/trinityrnaseq-2.0.4/trinity-plugins/Trimmomatic-0.32/adapters/TruSeq3-PE.fa:2:30:10",
            "LEADING:5 TRAILING:5 SLIDINGWINDOW:4:15 MINLEN:36",
        ])

        # the first output doubles as the "already trimmed" marker
        if not os.path.exists(out1):
            run_subprocess(cmd)

        ## remove the unzipped per-lane copies
        for fileName in faFileList:
            if os.path.exists(fileName):
                print("removing %s" % fileName)
                os.remove(fileName)

    print("complete")
Ejemplo n.º 7
0
def unzip_file(source, target):
    """Decompress *source* into *target* by running ``gunzip -c`` in a shell.

    The command is echoed before it is handed to the module-level
    ``run_subprocess`` helper.

    Args:
        source: path of the gzip-compressed input file.
        target: path the decompressed output is redirected to.
    """
    print('unzipping...%s' % source)
    cmd = "gunzip -c %s > %s" % (source, target)
    # function-call form so the line parses on Python 2 and Python 3 alike
    print(cmd)
    run_subprocess(cmd)
Ejemplo n.º 8
0
def run_deseq(countsPath, outFile):
    """Run the ``runDESeq.R`` script through ``Rscript``.

    The command line is printed and then passed to the module-level
    ``run_subprocess`` helper.

    Args:
        countsPath: first argument handed to the R script.
        outFile: second argument handed to the R script.
    """
    rscriptArgs = (countsPath, outFile)
    deseqCmd = "Rscript runDESeq.R %s %s" % rscriptArgs
    print("running...\n%s" % deseqCmd)
    run_subprocess(deseqCmd)
Ejemplo n.º 9
0
        bamFile = os.path.join(readsDir, "%s_aligned.bam" % (sample))
        sbamFile = os.path.join(readsDir, "%s_aligned_sorted.bam" % (sample))
        ssamFile = os.path.join(readsDir, "%s_aligned_sorted.sam" % (sample))
        if not os.path.exists(samFile):
            raise Exception("cannot find sam file %s" % (samFile))

        if os.path.exists(ssamFile) and not overwrite:
            print("skipping sam to bam, align, bam to sam")
        else:
            cmd = "/usr/bin/samtools view -b -S %s > %s && " % (samFile,
                                                                bamFile)
            cmd += "/usr/bin/samtools sort -n %s %s && " % (bamFile,
                                                            sbamFile[:-4])
            cmd += "/usr/bin/samtools view -h %s > %s" % (sbamFile, ssamFile)
            print cmd
            run_subprocess(cmd)

    ## concat sam files and sort by coordinates
    print("\n...make single sorted bam file..")
    outBam = os.path.join(homeDir, "star_all_reads.bam")
    outSbam = os.path.join(homeDir, "star_all_reads_sorted.bam")
    cmdMerge = "/usr/bin/samtools merge %s" % (outBam)
    cmdSort = "/usr/bin/samtools sort %s %s" % (outBam, outSbam[:-4])
    for s, sample in enumerate(sampleList):
        sbamFile = os.path.join(readsDir, "%s_aligned_sorted.bam" % (sample))
        cmdMerge += " %s" % (sbamFile)

    cmdMergeSort = cmdMerge + " && %s" % (cmdSort)

    if os.path.exists(outSbam) and not overwrite:
        print("skipping concat bam files and sort")