def analyzeCounts(self, refKmers, readKmers, name):
        refSize, readSize = sum(refKmers.values()), sum(readKmers.values())
        outf = open(os.path.join(self.outputDir, name + "kmer_counts.txt"),
                    "w")
        outf.write(
            "kmer\trefCount\trefFraction\treadCount\treadFraction\tlogFoldChange\n"
        )
        if refSize > 0 and readSize > 0:
            for kmer in itertools.product("ATGC", repeat=5):
                refFraction = 1.0 * refKmers[kmer] / refSize
                readFraction = 1.0 * readKmers[kmer] / readSize
                if refFraction == 0:
                    foldChange = "-Inf"
                elif readFraction == 0:
                    foldChange = "Inf"
                else:
                    foldChange = -log(readFraction / refFraction)
                outf.write("\t".join(
                    map(str, [
                        "".join(kmer), refKmers[kmer], refFraction,
                        readKmers[kmer], readFraction, foldChange
                    ])) + "\n")
            outf.close()

            system("Rscript nanopore/analyses/kmer_analysis.R {} {} {} {} {}".
                   format(
                       os.path.join(self.outputDir, name + "kmer_counts.txt"),
                       os.path.join(self.outputDir,
                                    name + "pval_kmer_counts.txt"),
                       os.path.join(self.outputDir,
                                    name + "top_bot_sigkmer_counts.txt"),
                       os.path.join(self.outputDir, name + "volcano_plot.pdf"),
                       "Indel_Kmer"))
Example 2
 def testScriptTree_Example(self):
     """Uses the jobTreeTest code to test the scriptTree Target wrapper.
     """
     for test in xrange(self.testNo):
         command = "scriptTreeTest_Wrapper.py --jobTree %s --logLevel=INFO --retryCount=10" % self.jobTreeDir
         system(command)
         runJobTreeStatusAndFailIfNotComplete(self.jobTreeDir)
Example 3
 def run(self):
     for readType in self.readTypes:
         sortedBaseMappers = [
             x for x in sorted(self.baseMappers) if x != "Combined"
         ]
         outf = open(
             os.path.join(self.outputDir,
                          readType + "_perReadMappability.tsv"), "w")
         outf.write("Read\tReadFastqFile\t")
         outf.write("\t".join(sortedBaseMappers))
         outf.write("\n")
         for read in self.reads:
             if read.readType == readType:
                 tmp = od([[x, 0] for x in sortedBaseMappers])
                 if read.is_mapped is True:
                     for mapper, reference in read.get_map_ref_pair():
                         baseMapper = re.findall("[A-Z][a-z]*", mapper)[0]
                         #hacky way to avoid including 'combined' analysis
                         if baseMapper != "Combined" and tmp[
                                 baseMapper] == 0:
                             tmp[baseMapper] = 1
                 outf.write("\t".join(
                     [read.name,
                      os.path.basename(read.readFastqFile)] +
                     map(str, tmp.values())))
                 outf.write("\n")
         outf.close()
         system("Rscript nanopore/metaAnalyses/vennDiagram.R {} {}".format(
             os.path.join(self.outputDir,
                          readType + "_perReadMappability.tsv"),
             os.path.join(self.outputDir,
                          readType + "_perReadMappabilityVennDiagram.pdf")))
Example 4
    def run(self):
        AbstractAnalysis.run(self) #Call base method to do some logging
        refSequences = getFastaDictionary(self.referenceFastaFile) #Hash of names to sequences
        readSequences = getFastqDictionary(self.readFastqFile) #Hash of names to sequences
        sam = pysam.Samfile(self.samFile, "r" )
        indelCounters = map(lambda aR : IndelCounter(sam.getrname(aR.rname), refSequences[sam.getrname(aR.rname)], aR.qname, readSequences[aR.qname], aR), samIterator(sam)) #Iterate on the sam lines
        sam.close()
        #Write out the substitution info
        if len(indelCounters) > 0:
            indelXML = getAggregateIndelStats(indelCounters)
            open(os.path.join(self.outputDir, "indels.xml"), "w").write(prettyXml(indelXML))
            tmp = open(os.path.join(self.outputDir, "indels.tsv"), "w")
            #build list of data as vectors
            data_list = []
            var = ["readInsertionLengths", "readDeletionLengths", "ReadSequenceLengths", "NumberReadInsertions", "NumberReadDeletions", "MedianReadInsertionLengths", "MedianReadDeletionLengths"]
            for x in var:
                data_list.append([x] + indelXML.attrib[x].split())
            #transpose this list so R doesn't take hours to load it using magic
            data_list = map(None, *data_list)
            for line in data_list:
                tmp.write("\t".join(map(str,line))); tmp.write("\n")
            tmp.close()
            system("Rscript nanopore/analyses/indelPlots.R {} {}".format(os.path.join(self.outputDir, "indels.tsv"), os.path.join(self.outputDir, "indel_plots.pdf")))

        self.finish() #Indicates the batch is done
Example 5
 def testScriptTree_Example2(self):
     """Tests that the global and local temp dirs of a job behave as expected.
     """
     for test in xrange(self.testNo):
         command = "scriptTreeTest_Wrapper2.py --jobTree %s --logLevel=INFO --retryCount=0" % self.jobTreeDir
         system(command)
         runJobTreeStatusAndFailIfNotComplete(self.jobTreeDir)
def runJellyfish(localTempDir, countFile, fastqFile, uuid, kmerSize=49):
    """
    Runs jellyfish. -C flag is set to count both strands together.
    """
    jfFile = os.path.join(localTempDir, uuid + ".jf")
    system("jellyfish count -C -m {} -s 300M -o {} {}".format(kmerSize, jfFile, fastqFile))
    system("jellyfish dump {} > {}".format(jfFile, countFile))
Example 7
 def run(self, globalAlignment=False):
     AbstractAnalysis.run(self) #Call base method to do some logging
     refSequences = getFastaDictionary(self.referenceFastaFile) #Hash of names to sequences
     readSequences = getFastqDictionary(self.readFastqFile) #Hash of names to sequences
     sam = pysam.Samfile(self.samFile, "r" )
     readsToReadCoverages = {}
     for aR in samIterator(sam): #Iterate on the sam lines
         refSeq = refSequences[sam.getrname(aR.rname)]
         readSeq = readSequences[aR.qname]
         readAlignmentCoverageCounter = ReadAlignmentCoverageCounter(aR.qname, readSeq, sam.getrname(aR.rname), refSeq, aR, globalAlignment)
         if aR.qname not in readsToReadCoverages:
             readsToReadCoverages[aR.qname] = []
         readsToReadCoverages[aR.qname].append(readAlignmentCoverageCounter)
     sam.close()
     #Write out the coverage info for differing subsets of the read alignments
     if len(readsToReadCoverages.values()) > 0:
         for readCoverages, outputName in [ (reduce(lambda x, y : x + y, readsToReadCoverages.values()), "coverage_all"), (map(lambda x : max(x, key=lambda y : y.readCoverage()), readsToReadCoverages.values()), "coverage_bestPerRead") ]:
             parentNode = getAggregateCoverageStats(readCoverages, outputName, refSequences, readSequences, readsToReadCoverages, outputName)
             open(os.path.join(self.outputDir, outputName + ".xml"), 'w').write(prettyXml(parentNode))
             #this is an ugly file format with each line being a different data type - column length is variable
             outf = open(os.path.join(self.outputDir, outputName + ".txt"), "w")
             outf.write("MappedReadLengths " + parentNode.get("mappedReadLengths") + "\n")
             outf.write("UnmappedReadLengths " + parentNode.get("unmappedReadLengths") + "\n")
             outf.write("ReadCoverage " + parentNode.get("distributionreadCoverage") + "\n")
             outf.write("MismatchesPerReadBase " + parentNode.get("distributionmismatchesPerReadBase") + "\n")
             outf.write("ReadIdentity " + parentNode.get("distributionidentity") + "\n")
             outf.write("InsertionsPerBase " + parentNode.get("distributioninsertionsPerReadBase") + "\n")
             outf.write("DeletionsPerBase " + parentNode.get("distributiondeletionsPerReadBase") + "\n")
             outf.close()
             system("Rscript nanopore/analyses/coverage_plot.R {} {}".format(os.path.join(self.outputDir, outputName + ".txt"), os.path.join(self.outputDir, outputName + ".pdf")))
     self.finish()
Example 8
def Substitutions(readFastqFile,
                  referenceFastaFile,
                  samFile,
                  outputDir,
                  kmer=6):
    """Calculates stats on substitutions
    """
    refSequences = getFastaDictionary(
        referenceFastaFile)  #Hash of names to sequences
    readSequences = getFastqDictionary(
        readFastqFile)  #Hash of names to sequences
    sM = SubstitutionMatrix()  #The thing to store the counts in
    sam = pysam.Samfile(samFile, "r")
    for aR in samIterator(sam):  #Iterate on the sam lines
        for aP in AlignedPair.iterator(aR, refSequences[sam.getrname(
                aR.rname)], readSequences[
                     aR.qname]):  #Walk through the matches and mismatches
            sM.addAlignedPair(aP.getRefBase(), aP.getReadBase())
    sam.close()

    #Write out the substitution info
    open(os.path.join(outputDir, "substitutions.xml"),
         'w').write(prettyXml(sM.getXML()))
    bases = "ACGT"
    outf = open(os.path.join(outputDir, "subst.tsv"), "w")
    outf.write("A\tC\tG\tT\n")
    for x in bases:
        freqs = sM.getFreqs(x, bases)
        outf.write("{}\t{}\n".format(x, "\t".join(map(str, freqs)), "\n"))
    outf.close()
    analysis = str(samFile.split("/")[-1].split(".sam")[0])
    system("Rscript scripts/substitution_plot.R {} {} {}".format(
        os.path.join(outputDir, "subst.tsv"),
        os.path.join(outputDir, "substitution_plot.pdf"), analysis))
Example 9
    def run(self, kmerSize=5):
        self.kmerSize = kmerSize
        for readType in self.readTypes:
            mappedKmers, unmappedKmers = Counter(), Counter()
            for read in self.reads:
                if read.readType == readType and read.is_mapped:
                    mappedKmers += self.countKmers(read.seq)
                elif read.readType == readType:
                    unmappedKmers += self.countKmers(read.seq)

            mappedSize, unmappedSize = sum(mappedKmers.values()), sum(unmappedKmers.values())
            outf = open(os.path.join(self.getLocalTempDir(), readType + "_kmer_counts.txt"), "w")
            outf.write("kmer\tmappableCount\tmappableFraction\tunmappableCount\tunmappableFraction\tlogFoldChange\n")
            for kmer in itertools.product("ATGC",repeat=5):
                kmer = "".join(kmer)
                if mappedSize > 0:
                    mappedFraction = 1.0 * mappedKmers[kmer] / mappedSize
                else:
                    mappedFraction = 0
                if unmappedSize > 0:
                    unmappedFraction = 1.0 * unmappedKmers[kmer] / unmappedSize
                else:
                    unmappedFraction = 0
                if unmappedFraction == 0:
                    foldChange = "-Inf"
                elif mappedFraction == 0:
                    foldChange = "Inf"
                else:
                    foldChange = -log(mappedFraction / unmappedFraction)
                outf.write("\t".join(map(str,[kmer, mappedKmers[kmer], mappedFraction, unmappedKmers[kmer], unmappedFraction, foldChange]))+"\n")
            outf.close()

            system("Rscript nanopore/metaAnalyses/mappable_kmer_analysis.R {} {} {} {}".format(os.path.join(self.getLocalTempDir(), readType + "_kmer_counts.txt"), os.path.join(self.outputDir, readType + "_unmapped_kmer_counts.txt"), os.path.join(self.outputDir, readType + "_unmapped_top_bot_sigkmer_counts.txt"), os.path.join(self.outputDir, readType + "_volcano_plot.pdf")))
Example 10
    def run(self):
        AbstractAnalysis.run(self) #Call base method to do some logging
        refSequences = getFastaDictionary(self.referenceFastaFile) #Hash of names to sequences
        readSequences = getFastqDictionary(self.readFastqFile) #Hash of names to sequences
        sam = pysam.Samfile(self.samFile, "r" )

        #The data we collect
        avgPosteriorMatchProbabilityInCigar = []
        alignedPairsInCigar = []
        posteriorMatchProbabilities = []

        for aR in samIterator(sam): #Iterate on the sam lines
            #Exonerate format Cigar string
            cigarString = getExonerateCigarFormatString(aR, sam)
            
            #Temporary files
            tempCigarFile = os.path.join(self.getLocalTempDir(), "rescoredCigar.cig")
            tempRefFile = os.path.join(self.getLocalTempDir(), "ref.fa")
            tempReadFile = os.path.join(self.getLocalTempDir(), "read.fa")
            tempPosteriorProbsFile = os.path.join(self.getLocalTempDir(), "probs.tsv")
            
            #Write the temporary files.
            fastaWrite(tempRefFile, sam.getrname(aR.rname), refSequences[sam.getrname(aR.rname)]) 
            fastaWrite(tempReadFile, aR.qname, aR.query)
            
            #Trained hmm file to use.
            hmmFile = os.path.join(pathToBaseNanoporeDir(), "nanopore", "mappers", "blasr_hmm_0.txt")
            
            #Call to cactus_realign
            system("echo %s | cactus_realign %s %s --rescoreByPosteriorProbIgnoringGaps --rescoreOriginalAlignment --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputPosteriorProbs=%s --loadHmm=%s > %s" % \
                   (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, hmmFile, tempCigarFile))
            
            #Load the cigar and get the posterior prob
            assert len([ pA for pA in cigarRead(open(tempCigarFile)) ]) > 0
            assert len([ pA for pA in cigarRead(open(tempCigarFile)) ]) == 1
            pA = [ i for i in cigarRead(open(tempCigarFile)) ][0]
            avgPosteriorMatchProbabilityInCigar.append(pA.score)
            
            #Calculate the number of aligned pairs in the cigar
            alignedPairsInCigar.append(sum([ op.length for op in pA.operationList if op.type == PairwiseAlignment.PAIRWISE_MATCH ]))
            assert alignedPairsInCigar[-1] == len([ readPos for readPos, refPos in aR.aligned_pairs if readPos != None and refPos != None ])
            
            #Get the posterior probs
            #posteriorMatchProbabilities += [ float(line.split()[2]) for line in open(tempPosteriorProbsFile) ]
            
        sam.close()
        #Write out the substitution info
        node = ET.Element("alignmentUncertainty", { 
                "averagePosteriorMatchProbabilityPerRead":str(self.formatRatio(sum(avgPosteriorMatchProbabilityInCigar), len(avgPosteriorMatchProbabilityInCigar))),
                "averagePosteriorMatchProbability":str(self.formatRatio(float(sum([ avgMatchProb*alignedPairs for avgMatchProb, alignedPairs in zip(avgPosteriorMatchProbabilityInCigar, alignedPairsInCigar) ])),sum(alignedPairsInCigar))),
                "averagePosteriorMatchProbabilitesPerRead":",".join([ str(i) for i in avgPosteriorMatchProbabilityInCigar ]), 
                "alignedPairsInCigar":",".join([ str(i) for i in alignedPairsInCigar ]) })
        open(os.path.join(self.outputDir, "alignmentUncertainty.xml"), "w").write(prettyXml(node))
        if len(avgPosteriorMatchProbabilityInCigar) > 0:
            outf = open(os.path.join(self.getLocalTempDir(), "tmp_uncertainty"), "w")
            outf.write("\t".join([ str(i) for i in avgPosteriorMatchProbabilityInCigar ])); outf.write("\n")
            outf.close()
            system("Rscript nanopore/analyses/match_hist.R {} {}".format(os.path.join(self.getLocalTempDir(), "tmp_uncertainty"), os.path.join(self.outputDir, "posterior_prob_hist.pdf")))
        #Indicate everything is all done
        self.finish()
Example 11
def realignCigarTargetFn(target, exonerateCigarStringFile, referenceSequenceName, 
                         referenceSequence, querySequenceFile, 
                         outputCigarFile, options):
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")
    
    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence) 
    
    #For each cigar string
    for exonerateCigarString, (querySequenceName, querySequence) in \
    zip(open(exonerateCigarStringFile, "r"), fastaRead(querySequenceFile)):
        fastaWrite(tempReadFile, querySequenceName, querySequence)
        #Call to cPecanRealign
        loadHmm = nameValue("loadHmm", options.hmmFile)
        try:
            command = "echo %s | cPecanRealign %s %s --diagonalExpansion=10 \
            --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s >> %s" % \
                   (exonerateCigarString[:-1], tempRefFile, tempReadFile, loadHmm, 
                    options.gapGamma, options.matchGamma, outputCigarFile);
            system(command)
            # target.logToMaster('[good] ' + command + '\n');
        except Exception, e:            
            target.logToMaster('Caught an exception! qname = "%s"\n' % querySequenceName);
            target.logToMaster('len(exonerateCigarString[:-1]) = %d\n' % (len(exonerateCigarString[:-1])));
            target.logToMaster('[bad] Command that caused the exception:\n');
            target.logToMaster("echo %s | cPecanRealign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s >> %s" % (exonerateCigarString[:-1], tempRefFile, tempReadFile, loadHmm, options.gapGamma, options.matchGamma, outputCigarFile));
            target.logToMaster('\n');
            target.logToMaster('\n');
            target.logToMaster(str(e) + '\n');
            target.logToMaster('\n');
            continue;
Example 12
 def run(self):
     os.chdir(self.directory)
     if self.paramFile is not None:
         cmd = "%s %s -b \"%s\" -t %s -s same -n %s -i %s -e %s -d %s -p %s %s >& jt.err" % (sys.executable, prepareExec, self.disc, self.paramFile, self.nullBatches, self.inferSpec, self.paradigmExec, self.dogmaLib, self.pathwayLib, self.evidSpec)
     else:
         cmd = "%s %s -b \"%s\" -s same -n %s -i %s -e %s -d %s -p %s %s >& jt.err" % (sys.executable, prepareExec, self.disc, self.nullBatches, self.inferSpec, self.paradigmExec, self.dogmaLib, self.pathwayLib, self.evidSpec)
     system(cmd)
     self.setFollowOnTarget(jtParadigm(self.em, self.directory))
Example 13
 def run(self, globalAlignment=False):
     AbstractAnalysis.run(self)  #Call base method to do some logging
     refSequences = getFastaDictionary(
         self.referenceFastaFile)  #Hash of names to sequences
     readSequences = getFastqDictionary(
         self.readFastqFile)  #Hash of names to sequences
     sam = pysam.Samfile(self.samFile, "r")
     readsToReadCoverages = {}
     for aR in samIterator(sam):  #Iterate on the sam lines
         refSeq = refSequences[sam.getrname(aR.rname)]
         readSeq = readSequences[aR.qname]
         readAlignmentCoverageCounter = ReadAlignmentCoverageCounter(
             aR.qname, readSeq, sam.getrname(aR.rname), refSeq, aR,
             globalAlignment)
         if aR.qname not in readsToReadCoverages:
             readsToReadCoverages[aR.qname] = []
         readsToReadCoverages[aR.qname].append(readAlignmentCoverageCounter)
     sam.close()
     #Write out the coverage info for differing subsets of the read alignments
     if len(readsToReadCoverages.values()) > 0:
         for readCoverages, outputName in [
             (reduce(lambda x, y: x + y,
                     readsToReadCoverages.values()), "coverage_all"),
             (map(lambda x: max(x, key=lambda y: y.readCoverage()),
                  readsToReadCoverages.values()), "coverage_bestPerRead")
         ]:
             parentNode = getAggregateCoverageStats(
                 readCoverages, outputName, refSequences, readSequences,
                 readsToReadCoverages, outputName)
             open(os.path.join(self.outputDir, outputName + ".xml"),
                  'w').write(prettyXml(parentNode))
              #this is an ugly file format with each line being a different data type - column length is variable
             outf = open(os.path.join(self.outputDir, outputName + ".txt"),
                         "w")
             outf.write("MappedReadLengths " +
                        parentNode.get("mappedReadLengths") + "\n")
             outf.write("UnmappedReadLengths " +
                        parentNode.get("unmappedReadLengths") + "\n")
             outf.write("ReadCoverage " +
                        parentNode.get("distributionreadCoverage") + "\n")
             outf.write(
                 "MismatchesPerReadBase " +
                 parentNode.get("distributionmismatchesPerReadBase") + "\n")
             outf.write("ReadIdentity " +
                        parentNode.get("distributionidentity") + "\n")
             outf.write(
                 "InsertionsPerBase " +
                 parentNode.get("distributioninsertionsPerReadBase") + "\n")
             outf.write("DeletionsPerBase " +
                        parentNode.get("distributiondeletionsPerReadBase") +
                        "\n")
             outf.close()
             system(
                 "Rscript nanopore/analyses/coverage_plot.R {} {}".format(
                     os.path.join(self.outputDir, outputName + ".txt"),
                     os.path.join(self.outputDir, outputName + ".pdf")))
     self.finish()
def downloadQuery(fastqPath, tempDir, key, queryString, uuid):
    """
    Downloads data from CGHub BAM Slicer
    """
    system(
        """curl --silent "{}" -u "{}" | samtools bamshuf -Ou /dev/stdin {} | samtools bam2fq /dev/stdin > {}""".format(
            queryString, "haussler:" + key, os.path.join(tempDir, "tmp"), fastqPath))
    if os.path.getsize(fastqPath) < 513:
        raise RuntimeError("curl did not download a BAM for {}. exiting.".format(uuid))
    def run(self):
        counts = defaultdict(list)
        for d in self.dict_iter():
            for x, y in d.iteritems():
                counts[x].append(y)

        G = pickle.load(open(self.graph))
        kmers = G.kmers

        added_counts = {}
        for k in kmers:
            added_counts[k] = sum(counts[k])

        with open(os.path.join(self.out_dir, "bad_kmers.fasta"), "w") as outf:
            for k in kmers:
                if added_counts[k] == 0:
                    G.G.edge[k + "_L"][k + "_R"]['bad'] = True
                    del added_counts[k]
                    outf.write(">{0}\n{0}\n".format(k))

        filtered_kmers = sorted(added_counts.iterkeys())

        with open(os.path.join(self.out_dir, "combined_counts.txt"), "w") as outf:
            for k in filtered_kmers:
                outf.write("{}\t{}\n".format(k, G.weights[k] * added_counts[k]))

        variances = {}
        for k in filtered_kmers:
            variances[k] = np.var(np.asarray(counts[k]))

        with open(os.path.join(self.out_dir, "variances.txt"), "w") as outf:
            for k in filtered_kmers:
                outf.write("{}\t{}\n".format(k, variances[k]))
        
        weights = {}
        for k in filtered_kmers:
            input_sequences = G.G.edge[k + "_L"][k + "_R"]['positions'].keys()
            weights[k] = 1.0 * len(self.count_files) * sum(avg_frac_dict[x] for x in input_sequences) / (added_counts[k] + 1)

        with open(os.path.join(self.out_dir, "weight_bad_kmers.fasta"), "w") as outf:
            for k in filtered_kmers:
                if weights[k] > 4.0 or weights[k] < 1.0:
                    G.G.edge[k + "_L"][k + "_R"]['bad'] = True
                    outf.write(">{}\n{}\n".format(weights[k], k))

        with open(os.path.join(self.out_dir, "weights.txt"), "w") as outf:
            for k in weights:
                outf.write("{}\t{}\n".format(k, weights[k]))

        weights = {x:y for x,y in weights.iteritems() if y <= 4.0 and y >= 1.0}
        
        G.weightKmers(weights)
        
        with open(self.new_graph, "w") as outf:
            pickle.dump(G, outf)

        system("Rscript src/weights.R {} {} {} {} {}".format(os.path.join(self.out_dir, "combined_counts.txt"), os.path.join(self.out_dir, "weights.txt"), os.path.join(self.out_dir, "variances.txt"), len(self.count_files), "weighting_metrics.pdf"))
Example 16
def posteriorProbabilityCalculationTargetFn(target, exonerateCigarStringFile,
                                            referenceSequenceName,
                                            referenceSequence,
                                            querySequenceFile,
                                            outputPosteriorProbsFile, options):
    """Calculates the posterior probabilities of matches in a set of pairwise
    alignments between a reference sequence and a set of reads. 
    """
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")

    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)

    #Hash to store posterior probabilities in
    expectationsOfBasesAtEachPosition = {}

    #For each cigar string
    for exonerateCigarString, (querySequenceName, querySequence) in \
    zip(open(exonerateCigarStringFile, "r"), fastaRead(querySequenceFile)):
        fastaWrite(tempReadFile, querySequenceName, querySequence)
        #Call to cPecanRealign
        tempPosteriorProbsFile = os.path.join(target.getLocalTempDir(),
                                              "posteriorProbs.txt")
        if options.noMargin:  #When we don't marginalize we just run cPecanRealign to get the list of aligned pairs
            #This runtime should be very fast
            system("echo %s | cPecanRealign %s %s --diagonalExpansion=0 \
            --splitMatrixBiggerThanThis=1 --rescoreOriginalAlignment --outputPosteriorProbs=%s" % \
                       (exonerateCigarString[:-1], tempRefFile, tempReadFile,
                        tempPosteriorProbsFile))
        else:
            system("echo %s | cPecanRealign %s %s --diagonalExpansion=10 \
            --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s --loadHmm=%s" % \
                       (exonerateCigarString[:-1], tempRefFile, tempReadFile,
                        tempPosteriorProbsFile, options.alignmentModel))

        #Now collate the reference position expectations
        for refPosition, queryPosition, posteriorProb in \
        map(lambda x : map(float, x.split()), open(tempPosteriorProbsFile, 'r')):
            assert posteriorProb <= 1.01
            assert posteriorProb >= 0.0
            key = (referenceSequenceName, int(refPosition))
            if key not in expectationsOfBasesAtEachPosition:
                expectationsOfBasesAtEachPosition[key] = dict(
                    zip(BASES, [0.0] * len(BASES)))
            queryBase = querySequence[int(queryPosition)].upper()
            if queryBase in BASES:  #Could be an N or other wildcard character, which we ignore
                expectationsOfBasesAtEachPosition[key][
                    queryBase] += 1.0 if options.noMargin else posteriorProb

    #Pickle the posterior probs
    fileHandle = open(outputPosteriorProbsFile, 'w')
    cPickle.dump(expectationsOfBasesAtEachPosition, fileHandle,
                 cPickle.HIGHEST_PROTOCOL)
    fileHandle.close()
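A minimal sketch (not part of the original pipeline) of consuming the pickle written above: load the expectations dictionary and pick the highest-weight base at a reference position.

import cPickle

def loadBaseExpectations(outputPosteriorProbsFile):
    fileHandle = open(outputPosteriorProbsFile, 'r')
    expectations = cPickle.load(fileHandle)  #Keyed by (referenceSequenceName, refPosition)
    fileHandle.close()
    return expectations

def maxExpectationBase(expectations, referenceSequenceName, refPosition):
    baseWeights = expectations[(referenceSequenceName, refPosition)]  #Dict of base -> summed weight
    return max(baseWeights, key=baseWeights.get)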
Example 17
def download_query(fastq_tmp_path, tmp_dir, key_file, query_string, uuid):
    """
    Downloads data from CGHub BAM Slicer
    """
    key = open(key_file).readline().rstrip()
    system("""curl --silent "{}" -u "{}" | samtools bamshuf -Ou - {} | samtools bam2fq - > {}""".format(
           query_string, "haussler:" + key, os.path.join(tmp_dir, "tmp"), fastq_tmp_path))
    os.remove(os.path.join(tmp_dir, "tmp"))
    if os.path.getsize(fastq_tmp_path) < 513:
        raise RuntimeError("curl did not download a BAM for {}. exiting.".format(uuid))
Example 18
 def run(self):
     os.chdir(self.directory)
     evidList = zip(re.split("\s", self.evidSpec)[0::2], re.split("\s", self.evidSpec)[1::2])
     
     ## assert files are in data/
     for i in evidList:
         assert(re.split(":", i[1])[1].startswith("data"))
     
     ## check if new run
     if not os.path.exists("fold1"):            
         ## find sample overlap
         dataSamples = None
         for i in evidList:
             if i[1].startswith("rawFile"):
                 if dataSamples is None:
                     dataSamples = set(retRows(re.split(":", i[1])[1]))
                 else:
                     dataSamples = dataSamples & set(retRows(re.split(":", i[1])[1]))
             else:
                 if dataSamples is None:
                     dataSamples = set(retColumns(re.split(":", i[1])[1]))
                 else:
                     dataSamples = dataSamples & set(retColumns(re.split(":", i[1])[1]))
         dataSamples = list(dataSamples)
         
         ## pick samples
         foldSamples = {}
         for f in range(1, self.mFolds+1):
             foldSamples[f] = []
         selectSamples = deepcopy(dataSamples)
         while len(selectSamples) > 0:
             for f in range(1, self.mFolds+1):
                 if len(selectSamples) > 0:
                     foldSamples[f].append(selectSamples.pop(random.randint(0,len(selectSamples)-1)))
     
         ## create directories and data
         for f in range(1, self.mFolds+1):
             system("mkdir fold%s" % (f))
             system("mkdir fold%s/train" % (f))
             system("mkdir fold%s/train/data" % (f))
             system("mkdir fold%s/test" % (f))
             system("mkdir fold%s/test/data" % (f))
             trainSamples = list(set(dataSamples) - set(foldSamples[f]))
             testSamples = foldSamples[f]
             for i in evidList:
                 if i[1].startswith("rawFile"):
                     rwCRSData("fold%s/train/%s" % (f, re.split(":", i[1])[1]), re.split(":", i[1])[1], useRows = trainSamples)
                     rwCRSData("fold%s/test/%s" % (f, re.split(":", i[1])[1]), re.split(":", i[1])[1], useRows = testSamples)
                 else:
                     rwCRSData("fold%s/train/%s" % (f, re.split(":", i[1])[1]), re.split(":", i[1])[1], useCols = trainSamples)
                     rwCRSData("fold%s/test/%s" % (f, re.split(":", i[1])[1]), re.split(":", i[1])[1], useCols = testSamples)
     
     ## kick off runs
     for f in range(1, self.mFolds+1):
         self.addChildTarget(branchTrain(self.evidSpec, self.disc, self.paramFile, self.paradigmExec, self.inferSpec, self.dogmaLib, self.pathwayLib, self.shuffleNode, self.nShuffle, "%s/fold%s" % (self.directory, f)))
Example 19
 def testSort(self):
     for test in xrange(self.testNo):
         tempDir = getTempDirectory(os.getcwd())
         tempFile1 = getTempFile(rootDir=tempDir)
         makeFileToSort(tempFile1)
         lines1 = loadFile(tempFile1)
         lines1.sort()
         sort(tempFile1)
         lines2 = loadFile(tempFile1)
         checkEqual(lines1, lines2)
         system("rm -rf %s" % tempDir)
Example 20
 def run(self):
     os.chdir(self.directory)
     private_arg = ""
     if self.private_paradigm:
         private_arg = "-z"
     if self.paramFile is not None:
         cmd = "prepareParadigm.py -b \"%s\" -t %s -s same -n %s -i %s -e %s -d %s -p %s %s %s >& jt.err" % (self.disc, self.paramFile, self.nullBatches, self.inferSpec, self.paradigmExec, self.dogmaLib, self.pathwayLib, private_arg, self.evidSpec)
     else:
         cmd = "prepareParadigm.py -b \"%s\" -s same -n %s -i %s -e %s -d %s -p %s %s %s >& jt.err" % (self.disc, self.nullBatches, self.inferSpec, self.paradigmExec, self.dogmaLib, self.pathwayLib, private_arg, self.evidSpec)
     system(cmd)
     self.setFollowOnTarget(jtParadigm(self.em, self.directory))
Example 21
    def run(self, kmerSize=5):
        self.kmerSize = kmerSize
        for readType in self.readTypes:
            mappedKmers, unmappedKmers = Counter(), Counter()
            for read in self.reads:
                if read.readType == readType and read.is_mapped:
                    mappedKmers += self.countKmers(read.seq)
                elif read.readType == readType:
                    unmappedKmers += self.countKmers(read.seq)

            mappedSize, unmappedSize = sum(mappedKmers.values()), sum(
                unmappedKmers.values())
            outf = open(
                os.path.join(self.getLocalTempDir(),
                             readType + "_kmer_counts.txt"), "w")
            outf.write(
                "kmer\tmappableCount\tmappableFraction\tunmappableCount\tunmappableFraction\tlogFoldChange\n"
            )
            for kmer in itertools.product("ATGC", repeat=5):
                kmer = "".join(kmer)
                if mappedSize > 0:
                    mappedFraction = 1.0 * mappedKmers[kmer] / mappedSize
                else:
                    mappedFraction = 0
                if unmappedSize > 0:
                    unmappedFraction = 1.0 * unmappedKmers[kmer] / unmappedSize
                else:
                    unmappedFraction = 0
                if unmappedFraction == 0:
                    foldChange = "-Inf"
                elif mappedFraction == 0:
                    foldChange = "Inf"
                else:
                    foldChange = -log(mappedFraction / unmappedFraction)
                outf.write("\t".join(
                    map(str, [
                        kmer, mappedKmers[kmer], mappedFraction,
                        unmappedKmers[kmer], unmappedFraction, foldChange
                    ])) + "\n")
            outf.close()

            system(
                "Rscript nanopore/metaAnalyses/mappable_kmer_analysis.R {} {} {} {}"
                .format(
                    os.path.join(self.getLocalTempDir(),
                                 readType + "_kmer_counts.txt"),
                    os.path.join(self.outputDir,
                                 readType + "_unmapped_kmer_counts.txt"),
                    os.path.join(
                        self.outputDir,
                        readType + "_unmapped_top_bot_sigkmer_counts.txt"),
                    os.path.join(self.outputDir,
                                 readType + "_volcano_plot.pdf")))
Example 22
 def testGetMidPoint(self):
     for test in xrange(self.testNo):
         tempDir = getTempDirectory(os.getcwd())
         tempFile = getTempFile(rootDir=tempDir)
         makeFileToSort(tempFile)
         l = open(tempFile, 'r').read()
         fileSize = os.path.getsize(tempFile)
         midPoint = getMidPoint(tempFile, 0, fileSize)
         print "the mid point is %i of a file of %i bytes woth byte" % (midPoint, fileSize)
         assert midPoint < fileSize
         assert l[midPoint] == '\n'
         assert midPoint >= 0
         system("rm -rf %s" % tempDir)
Example 23
def runJobTree(command, jobTreeDir, logLevel="DEBUG", retryCount=0, batchSystem="single_machine", 
               rescueJobFrequency=None):
    """A convenience function for running job tree from within a python script.
    """
    if rescueJobFrequency != None:
        rescueJobFrequencyString = "--rescueJobsFrequency %s" % float(rescueJobFrequency)
    else:
        rescueJobFrequencyString = ""
    command = "jobTree --command \"%s\" --jobTree %s --logLevel %s \
--retryCount %i --batchSystem %s %s" % \
            (command, jobTreeDir,  logLevel, retryCount, batchSystem, rescueJobFrequencyString)
    logger.info("Running command : %s" % command)
    system(command)
    logger.info("Ran the jobtree apparently okay")
 def run(self):
     os.chdir(self.directory)
     
     if paradigmPublic:
         for b in range(len(self.dataSamples)):
             system("cat outputFiles/%s_upstream_b%s_%s.fa >> %s_upstream.fa" % (self.mutatedGene, b, len(self.dataSamples), self.mutatedGene))
             system("cat outputFiles/%s_downstream_b%s_%s.fa >> %s_downstream.fa" % (self.mutatedGene, b, len(self.dataSamples), self.mutatedGene))
             for null in range(1, nNulls+1):
                 system("cat outputFiles/N%s_%s_upstream_b%s_%s.fa >> N%s_%s_upstream.fa" % (null, self.mutatedGene, b, len(self.dataSamples), null, self.mutatedGene))
                 system("cat outputFiles/N%s_%s_downstream_b%s_%s.fa >> N%s_%s_downstream.fa" % (null, self.mutatedGene, b, len(self.dataSamples), null, self.mutatedGene))
         system("rm -rf outputFiles")
     
     shiftCV(self.mutatedGene, self.mutatedSamples, self.dataSamples, self.trainSamples, 
             self.uPathway, self.dPathway, nNulls = nNulls)
Example 25
def posteriorProbabilityCalculationTargetFn(target, exonerateCigarStringFile, 
                referenceSequenceName, referenceSequence, querySequenceFile, 
                outputPosteriorProbsFile, options):
    """Calculates the posterior probabilities of matches in a set of pairwise
    alignments between a reference sequence and a set of reads. 
    """
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")
    
    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence) 
    
    #Hash to store posterior probabilities in
    expectationsOfBasesAtEachPosition = {}
    
    #For each cigar string
    for exonerateCigarString, (querySequenceName, querySequence) in \
    zip(open(exonerateCigarStringFile, "r"), fastaRead(querySequenceFile)):
        fastaWrite(tempReadFile, querySequenceName, querySequence)
        #Call to cPecanRealign
        tempPosteriorProbsFile = os.path.join(target.getLocalTempDir(), "posteriorProbs.txt")
        if options.noMargin: #When we don't marginalize we just run cPecanRealign to get the list of aligned pairs
            #This runtime should be very fast
            system("echo %s | cPecanRealign %s %s --diagonalExpansion=0 \
            --splitMatrixBiggerThanThis=1 --rescoreOriginalAlignment --outputPosteriorProbs=%s" % \
                       (exonerateCigarString[:-1], tempRefFile, tempReadFile, 
                        tempPosteriorProbsFile))
        else:
            system("echo %s | cPecanRealign %s %s --diagonalExpansion=10 \
            --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s --loadHmm=%s" % \
                       (exonerateCigarString[:-1], tempRefFile, tempReadFile, 
                        tempPosteriorProbsFile, options.alignmentModel))
        
        #Now collate the reference position expectations
        for refPosition, queryPosition, posteriorProb in \
        map(lambda x : map(float, x.split()), open(tempPosteriorProbsFile, 'r')):
            assert posteriorProb <= 1.01
            assert posteriorProb >= 0.0
            key = (referenceSequenceName, int(refPosition))
            if key not in expectationsOfBasesAtEachPosition:
                expectationsOfBasesAtEachPosition[key] = dict(zip(BASES, [0.0]*len(BASES)))
            queryBase = querySequence[int(queryPosition)].upper()
            if queryBase in BASES: #Could be an N or other wildcard character, which we ignore
                expectationsOfBasesAtEachPosition[key][queryBase] += 1.0 if options.noMargin else posteriorProb 
            
    #Pickle the posterior probs
    fileHandle = open(outputPosteriorProbsFile, 'w')
    cPickle.dump(expectationsOfBasesAtEachPosition, fileHandle, cPickle.HIGHEST_PROTOCOL)
    fileHandle.close() 
 def run(self):
     os.chdir(self.directory)
     
     ## cytoscape-web
     for mutatedGene in self.includeFeatures:
         if os.path.exists("analysis/%s/sig.tab" % (mutatedGene)):
             tableFiles = []
             tableFiles.append("analysis/%s/sig.tab" % (mutatedGene))
             tableFiles.append("msepPlot:analysis/%s/%s.msep.pdf" % (mutatedGene, mutatedGene))
             tableFiles.append("backgroundPlot:analysis/%s/%s.background.pdf" % (mutatedGene, mutatedGene))
             tableFiles.append("analysis/%s/avgAUC.tab" % (mutatedGene))
             tableFiles.append("analysis/%s/pshift.tab" % (mutatedGene))
             system("pathmark-report.py -t %s analysis/%s %s" % (",".join(tableFiles), mutatedGene, self.reportDir))
             system("cp analysis/%s/pshift* %s" % (mutatedGene, self.reportDir))
Example 27
def realignCigarTargetFn(target, exonerateCigarString, referenceSequenceName, referenceSequence, querySequenceName, querySequence, outputCigarFile, hmmFile, gapGamma, matchGamma):
    #Temporary files
    tempRefFile = os.path.join(target.getGlobalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getGlobalTempDir(), "read.fa")
    
    #Write the temporary files.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence) 
    fastaWrite(tempReadFile, querySequenceName, querySequence)

    #Call to cactus_realign
    loadHmm = nameValue("loadHmm", hmmFile)
    system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s > %s" % (exonerateCigarString, tempRefFile, tempReadFile, loadHmm, gapGamma, matchGamma, outputCigarFile))
    assert len([ pA for pA in cigarRead(open(outputCigarFile)) ]) > 0
    assert len([ pA for pA in cigarRead(open(outputCigarFile)) ]) == 1
Example 28
 def run(self):
     while True:
         command, logFile, jobID = self.inputQueue.get()
         #fnull = open(os.devnull, 'w') #Pipe the output to dev/null (it is caught by the slave and will be reported if there is an error)
         tempLogFile = getTempFile()
         fileHandle = open(tempLogFile, 'w')
         process = subprocess.Popen(command, shell=True, stdout = fileHandle, stderr = fileHandle)
         sts = os.waitpid(process.pid, 0)
         fileHandle.close()
         #fnull.close()
         if os.path.exists(tempLogFile):
             system("mv %s %s" % (tempLogFile, logFile))
         self.outputQueue.put((command, sts[1], jobID))
         self.inputQueue.task_done()
Example 29
 def testCopySubRangeOfFile(self):
     for test in xrange(self.testNo):
         tempDir = getTempDirectory(os.getcwd())
         tempFile = getTempFile(rootDir=tempDir)
         outputFile = getTempFile(rootDir=tempDir)
         makeFileToSort(tempFile)
         fileSize = os.path.getsize(tempFile)
         assert fileSize > 0
         fileStart = random.choice(xrange(0, fileSize))
         fileEnd = random.choice(xrange(fileStart, fileSize))
         copySubRangeOfFile(tempFile, fileStart, fileEnd, outputFile)
         l = open(outputFile, 'r').read()
         l2 = open(tempFile, 'r').read()[fileStart:fileEnd]
         checkEqual(l, l2)
         system("rm -rf %s" % tempDir)
Example 30
 def run(self):
     os.chdir(self.cwd)
     cmd = "%s -p outputFilesEM/*learn* " % collectParamsExec
     if (os.path.exists("mask.expectations")):
         cmd += " mask.expectations "
     cmd += "| %s -o params%i.txt /dev/stdin " \
                    % (collectParamsExec, self.iteration + 1)
     if (os.path.exists("mask.params")):
         cmd += " mask.params "
     system(cmd)
     if self.emHasTerminated():
         self.setFollowOnTarget(FinalRun(self.iteration + 1, self.cwd))
     else:
         self.setFollowOnTarget(ExpectationIteration(self.iteration + 1, 
                                                     self.tolerance, self.cwd))
Example 31
def parasolIsInstalled():
    """Returns True if parasol is installed, else False.
    """
    try:
        return system("parasol status") == 0
    except RuntimeError:
        return False
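A hedged usage sketch of the probe (the wrapped command and directory are invented, and "parasol" as the jobTree batch system name is an assumption here):

if parasolIsInstalled():
    runJobTree("sleep 1", jobTreeDir="./parasolJobTree", batchSystem="parasol")
else:
    logger.info("Parasol is not installed, skipping the parasol-specific run")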
Example 32
 def run(self):
     os.chdir(self.cwd)
     cmd = "%s -p outputFilesEM/*learn* " % collectParamsExec
     if (os.path.exists("mask.expectations")):
         cmd += " mask.expectations "
     cmd += "| %s -o params%i.txt /dev/stdin " \
                    % (collectParamsExec, self.iteration + 1)
     if (os.path.exists("mask.params")):
         cmd += " mask.params "
     system(cmd)
     if self.emHasTerminated():
         self.setFollowOnTarget(FinalRun(self.iteration + 1, self.cwd))
     else:
         self.setFollowOnTarget(
             ExpectationIteration(self.iteration + 1, self.tolerance,
                                  self.cwd))
Example 33
    def run(self):
        os.chdir(self.cwd)
        system("rm -f params.txt")
        system("ln -s params%i.txt params.txt" % self.iteration)

        system("mkdir -p outputFilesEM%i" % self.iteration)
        system("rm -f outputFilesEM")
        system("ln -s outputFilesEM%i outputFilesEM" % self.iteration)

        sys.stderr.write("Current directory: " + os.getcwd() + "\n")
        jfile = open("jobsEM.list", "r")
        for job in jfile:
            self.addChildTarget(ParadigmCmd(job, self.cwd))
        jfile.close()
        self.setFollowOnTarget(
            MaximizationIteration(self.iteration, self.tolerance, self.cwd))
Example 34
def gridEngineIsInstalled():
    """Returns True if grid-engine is installed, else False.
    """
    try:
        return system("qstat -help") == 0
    except RuntimeError:
        return False
Example 35
    def run(self):
        os.chdir(self.cwd)
        system("rm -f params.txt")
        system("ln -s params%i.txt params.txt" % self.iteration)

        system("mkdir -p outputFilesEM%i" % self.iteration)
        system("rm -f outputFilesEM")
        system("ln -s outputFilesEM%i outputFilesEM" % self.iteration)

        sys.stderr.write("Current directory: " + os.getcwd() + "\n")
        jfile = open("jobsEM.list", "r")
        for job in jfile:
            self.addChildTarget(ParadigmCmd(job, self.cwd))
        jfile.close()
        self.setFollowOnTarget(MaximizationIteration(self.iteration, 
                                                     self.tolerance, self.cwd))
 def run(self):
     for readType in self.readTypes:
         sortedBaseMappers = [x for x in sorted(self.baseMappers) if x != "Combined"]
         outf = open(os.path.join(self.outputDir, readType + "_perReadMappability.tsv"), "w")
         outf.write("Read\tReadFastqFile\t"); outf.write("\t".join(sortedBaseMappers)); outf.write("\n")
         for read in self.reads:
             if read.readType == readType:
                 tmp = od([[x, 0] for x in sortedBaseMappers])
                 if read.is_mapped is True:
                     for mapper, reference in read.get_map_ref_pair():
                         baseMapper = re.findall("[A-Z][a-z]*", mapper)[0]
                         #hacky way to avoid including 'combined' analysis
                         if baseMapper != "Combined" and tmp[baseMapper] == 0:
                             tmp[baseMapper] = 1
                 outf.write("\t".join([read.name, os.path.basename(read.readFastqFile)] + map(str, tmp.values()))); outf.write("\n")
         outf.close()
         system("Rscript nanopore/metaAnalyses/vennDiagram.R {} {}".format(os.path.join(self.outputDir, readType + "_perReadMappability.tsv"), os.path.join(self.outputDir, readType + "_perReadMappabilityVennDiagram.pdf")))
    def run(self):
        os.chdir(self.directory)
        
        ## branch genes
        htmlFeatures = []
        if not os.path.exists("analysis"):
            system("mkdir analysis")
        for mutatedGene in self.mutationMap.keys():
            if not os.path.exists("analysis/%s" % (mutatedGene)):
                system("mkdir analysis/%s" % (mutatedGene))
                htmlFeatures.append(mutatedGene)
                self.addChildTarget(branchFolds(mutatedGene, self.mutationMap[mutatedGene], 
                                            self.dataSamples, self.dataFeatures, self.dataMap, 
                                            self.gPathway, self.paradigmDir, self.paramMap, 
                                            self.foldMap, self.directory))
        if os.path.exists(htmlDir):
            self.setFollowOnTarget(pshiftReport(htmlFeatures, "%s/%s" % (htmlDir, self.paramMap["cohortName"]), self.directory))
Example 38
 def analyzeCounts(self, refKmers, readKmers, name):
     refSize, readSize = sum(refKmers.values()), sum(readKmers.values())
     outf = open(os.path.join(self.outputDir, name + "kmer_counts.txt"), "w")
     outf.write("kmer\trefCount\trefFraction\treadCount\treadFraction\tlogFoldChange\n")
     if refSize > 0 and readSize > 0:
         for kmer in itertools.product("ATGC", repeat=5):
             refFraction, readFraction = 1.0 * refKmers[kmer] / refSize, 1.0 * readKmers[kmer] / readSize
             if refFraction == 0:
                 foldChange = "-Inf"
             elif readFraction == 0:
                 foldChange = "Inf"
             else:
                 foldChange = -log(readFraction / refFraction)
             outf.write("\t".join(map(str,["".join(kmer), refKmers[kmer], refFraction, readKmers[kmer], readFraction, foldChange]))+"\n")
         outf.close()
     
         system("Rscript nanopore/analyses/kmer_analysis.R {} {} {} {} {}".format(os.path.join(self.outputDir, name + "kmer_counts.txt"), os.path.join(self.outputDir, name + "pval_kmer_counts.txt"), os.path.join(self.outputDir, name + "top_bot_sigkmer_counts.txt"), os.path.join(self.outputDir, name + "volcano_plot.pdf"), "Indel_Kmer"))
Example 39
 def run(self):
     AbstractAnalysis.run(self)
     readSequences = getFastqDictionary(self.readFastqFile)
     nr = re.compile(r"channel_[0-9]+_read_[0-9]+")
     per_channel_read_counts = Counter([int(x.split("_")[1]) for x in readSequences.iterkeys() if re.match(nr, x)])
     sam = pysam.Samfile(self.samFile, "r")
     mapped_read_counts = Counter([int(aR.qname.split("_")[1]) for aR in samIterator(sam) if re.match(nr, aR.qname) and aR.is_unmapped is False])
     if len(mapped_read_counts) > 0 and len(per_channel_read_counts) > 0:
         outf = open(os.path.join(self.outputDir, "channel_mappability.tsv"), "w")
         outf.write("Channel\tReadCount\tMappableReadCount\n")
         max_channel = max(513, max(per_channel_read_counts.keys())) #in case there are more than 512 in the future
         for channel in xrange(1, max_channel):
             outf.write("\t".join(map(str, [channel, per_channel_read_counts[channel], mapped_read_counts[channel]])))
             outf.write("\n")
         outf.close()
         system("Rscript nanopore/analyses/channel_plots.R {} {} {} {} {}".format(os.path.join(self.outputDir, "channel_mappability.tsv"), os.path.join(self.outputDir, "channel_mappability.pdf"), os.path.join(self.outputDir, "channel_mappability_sorted.png"), os.path.join(self.outputDir, "mappability_levelplot.png"), os.path.join(self.outputDir, "mappability_leveplot_percent.png")))
     self.finish()
Example 40
def align_query(target, fastq, bam, uuid, index):
    """
    Aligns the extracted reads to the notch locus, filtering out unmapped reads and creating a custom reheadered BAM.
    """
    # align the extracted reads to the index
    tmp = os.path.join(target.getLocalTempDir(), "tmp")
    sorted_bam = os.path.join(target.getLocalTempDir(), "{}.sorted.bam".format(uuid))
    system("bwa mem -v 1 {} {} | samtools view -F 4 -bS - | samtools sort -O bam -T {} - > {}".format(index, fastq, tmp, sorted_bam))
    header = {"HD": {"VN": "1.3"}, "SQ": [{"LN": 248956422, "SN": "chr1"}]}
    outfile = pysam.Samfile(bam, "wb", header=header)
    bamfile = pysam.Samfile(sorted_bam, "rb")
    for record in bamfile:
        chrom, span = bamfile.getrname(record.tid).split(":")
        start, end = map(int, span.split("-"))
        record.pos = record.pos + start - 1
        outfile.write(record)
    outfile.close()
    system("samtools index {}".format(bam))
Example 41
def realignCigarTargetFn(target, exonerateCigarString, referenceSequenceName,
                         referenceSequence, querySequenceName, querySequence,
                         outputCigarFile, hmmFile, gapGamma, matchGamma):
    #Temporary files
    tempRefFile = os.path.join(target.getGlobalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getGlobalTempDir(), "read.fa")

    #Write the temporary files.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)
    fastaWrite(tempReadFile, querySequenceName, querySequence)

    #Call to cactus_realign
    loadHmm = nameValue("loadHmm", hmmFile)
    system(
        "echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s > %s"
        % (exonerateCigarString, tempRefFile, tempReadFile, loadHmm, gapGamma,
           matchGamma, outputCigarFile))
    assert len([pA for pA in cigarRead(open(outputCigarFile))]) > 0
    assert len([pA for pA in cigarRead(open(outputCigarFile))]) == 1
Example 42
 def write_file_analyze(self, entries, name, multiple_read_types=False):
     path = os.path.join(self.outputDir, name + ".csv")
     outf = open(path, "w")
     outf.write(",".join([
         "Name", "Mapper", "ReadType", "ReadFile", "ReferenceFile",
         "AvgReadCoverage", "AvgReferenceCoverage", "AvgIdentity",
         "AvgMismatchesPerReadBase", "AvgDeletionsPerReadBase",
         "AvgInsertionsPerReadBase", "NumberOfMappedReads",
         "NumberOfUnmappedReads", "NumberOfReads"
     ]))
     outf.write("\n")
     entries = sorted(entries,
                      key=lambda x: (x.mapper, x.readType, x.readFastqFile))
     names = self.resolve_duplicate_rownames(entries, multiple_read_types)
     for entry, n in izip(entries, names):
         outf.write(",".join([
             n, entry.mapper, entry.readType, entry.readFastqFile,
             entry.referenceFastaFile, entry.XML.attrib["avgreadCoverage"],
             entry.XML.attrib["avgreferenceCoverage"],
             entry.XML.attrib["avgidentity"],
             entry.XML.attrib["avgmismatchesPerReadBase"],
             entry.XML.attrib["avgdeletionsPerReadBase"],
             entry.XML.attrib["avginsertionsPerReadBase"],
             entry.XML.attrib["numberOfMappedReads"],
             entry.XML.attrib["numberOfUnmappedReads"],
             entry.XML.attrib["numberOfReads"]
         ]) + "\n")
     outf.close()
     path2 = os.path.join(self.outputDir, name + "_distribution.csv")
     outf = open(path2, "w")
     for entry, n in izip(entries, names):
         outf.write(
             ",".join([n] +
                      entry.XML.attrib["distributionidentity"].split()))
         outf.write("\n")
     outf.close()
     system("Rscript nanopore/metaAnalyses/coverageSummaryPlots.R {} {} {}".
            format(
                path, name,
                os.path.join(self.outputDir, name + "_summary_plots.pdf")))
     system("Rscript nanopore/metaAnalyses/coveragePlots.R {} {} {}".format(
         path2, name,
         os.path.join(self.outputDir, name + "_distribution.pdf")))
Example 43
def writeScripts():
    """creates the R scripts necessary for plotting"""
    backgroundR = """#!/usr/bin/env Rscript
    args = commandArgs(TRUE)
    phenotype = args[1]
    Real = read.table(paste("stats_", phenotype, ".tab", sep=""), header=TRUE)
    Nulls = read.table(paste("stats_NULL_", phenotype, ".tab", sep=""), header=TRUE)
    nbreaks = 60
    
    zscore = c(as.character((Real$totNodes-mean(Nulls$totNodes))/sd(Nulls$totNodes)), as.character((Real$totLinks-mean(Nulls$totLinks))/sd(Nulls$totLinks)), as.character((Real$largest_netNodes-mean(Nulls$largest_netNodes))/sd(Nulls$largest_netNodes)), as.character((Real$largest_netLinks-mean(Nulls$largest_netLinks))/sd(Nulls$largest_netLinks)))
    fileConn = file(paste(phenotype, ".stats", sep=""))
    writeLines(zscore, fileConn)
    close(fileConn)
    
    xrange = c(min(Nulls$totNodes, Real$totNodes)-50, max(Nulls$totNodes, Real$totNodes)+50)
    png(paste(phenotype, "_total_netNodes.png", sep=""), heigh=720, width=1280)
    hist(Nulls$totNodes, breaks=nbreaks, xlim=xrange, xlab="Number", main=paste("Number of Nodes for Subnet, z = ", zscore[1], sep=""))
    abline(v = Real$totNodes, col="red", lty = 2)
    dev.off()
    
    xrange = c(min(Nulls$totLinks, Real$totLinks)-50, max(Nulls$totLinks, Real$totLinks)+50)
    png(paste(phenotype, "_total_netLinks.png", sep=""), heigh=720, width=1280)
    hist(Nulls$totLinks, breaks=nbreaks, xlim=xrange, xlab="Number", main=paste("Number of Links for Subnet, z = ", zscore[2], sep=""))
    abline(v = Real$totLinks, col="red", lty = 2)
    dev.off()
    
    xrange = c(min(Nulls$largest_netNodes, Real$largest_netNodes)-50, max(Nulls$largest_netNodes, Real$largest_netNodes)+50)
    png(paste(phenotype, "_largest_netNodes.png", sep=""), heigh=720, width=1280)
    hist(Nulls$largest_netNodes, breaks=nbreaks, xlim=xrange, xlab="Number", main=paste("Number of Nodes for Largest Component, z = ", zscore[3], sep=""))
    abline(v = Real$largest_netNodes, col="red", lty = 2)
    dev.off()
    
    xrange = c(min(Nulls$largest_netLinks, Real$largest_netLinks)-50, max(Nulls$largest_netLinks, Real$largest_netLinks)+50)
    png(paste(phenotype, "_largest_netLinks.png", sep=""), heigh=720, width=1280)
    hist(Nulls$largest_netLinks, breaks=nbreaks, xlim=xrange, xlab="Number", main=paste("Number of Links for Largest Component, z = ", zscore[4], sep=""))
    abline(v = Real$largest_netLinks, col="red", lty = 2)
    dev.off()
    """

    f = open("background.R", "w")
    f.write(backgroundR)
    f.close()
    system("chmod 755 *.R")
def alignQuery(fastqPath, remappedBamPath, tempDir, uuid, index):
    """
    Aligns the extracted reads to the Notch locus index
    """
    # align the extracted reads to the index
    sortedBamPath = os.path.join(tempDir, "{}.sorted".format(uuid))
    system("bwa mem -v 1 {} {} | samtools view -F 4 -bS - | samtools sort - {}".format(index, fastqPath, sortedBamPath))
    # samtools appends .bam to sorted bam files
    sortedBamPath += ".bam"
    header = {"HD": {"VN": "1.3"}, "SQ": [{"LN": 248956422, "SN": "chr1"}]}
    outfile = pysam.Samfile(remappedBamPath, "wb", header=header)
    bamfile = pysam.Samfile(sortedBamPath, "rb")
    for record in bamfile:
        chrom, span = bamfile.getrname(record.tid).split(":")
        start, end = map(int, span.split("-"))
        record.pos = record.pos + start - 1
        outfile.write(record)
    outfile.close()
    system("samtools index {}".format(remappedBamPath))
Example n. 45
def run_jellyfish(target, jf_counts, k_plus1_mer_counts, fastq, uuid, kmer_size):
    """
    Runs jellyfish twice: the first pass counts canonical kmers (-C) of length kmer_size; this is the raw data for the
    ILP model. The second pass uses kmer_size + 1 and a Bloom filter, which discards most kmers seen only once; these
    counts are used to add individual nodes to the graph.
    """
    jf_file = os.path.join(target.getLocalTempDir(), uuid + ".jf")
    system("jellyfish count -C -m {} -s 200M -o {} {}".format(kmer_size, jf_file, fastq))
    system("jellyfish dump {} > {}".format(jf_file, jf_counts))
    os.remove(jf_file)
    system("jellyfish count -C -m {} --bf-size 1G -s 200M -o {} {}".format(kmer_size + 1, jf_file, fastq))
    system("jellyfish dump {} > {}".format(jf_file, k_plus1_mer_counts))
    os.remove(jf_file)
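jellyfish dump writes FASTA-like records whose header line is the count and whose sequence line is the kmer; a minimal sketch (assuming that default dump format) of reading the counts file back into a dict:

from itertools import izip

def parse_jellyfish_counts(dump_path):
    # Each record is two lines, e.g. ">12" followed by "ACGTG".
    counts = {}
    with open(dump_path) as f:
        for header, kmer in izip(f, f):
            counts[kmer.strip()] = int(header.strip()[1:])
    return counts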
Example n. 46
def realignCigarTargetFn(target, exonerateCigarStringFile, referenceSequenceName, 
                         referenceSequence, querySequenceFile, 
                         outputCigarFile, options):
    #Temporary files
    tempRefFile = os.path.join(target.getLocalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getLocalTempDir(), "read.fa")
    
    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence) 
    
    #For each cigar string
    for exonerateCigarString, (querySequenceName, querySequence) in \
    zip(open(exonerateCigarStringFile, "r"), fastaRead(querySequenceFile)):
        fastaWrite(tempReadFile, querySequenceName, querySequence)
        #Call to cPecanRealign
        loadHmm = nameValue("loadHmm", options.hmmFile)
        system("echo %s | cPecanRealign %s %s --diagonalExpansion=10 \
        --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s >> %s" % \
               (exonerateCigarString[:-1], tempRefFile, tempReadFile, loadHmm, 
                options.gapGamma, options.matchGamma, outputCigarFile))
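nameValue is provided by the surrounding library and is not shown in this excerpt; the call above relies on it rendering an optional command-line flag, roughly as in this sketch (an assumption about its behaviour):

def nameValue(name, value):
    # Sketch: emit "--name=value" when a value is given, or an empty string so
    # that the option is dropped from the command line entirely.
    if value is None:
        return ""
    return "--%s=%s" % (name, value)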
Example n. 47
 def delete(self):
     """Removes from disk atomically, can not then subsequently call read(), write() or addChildren()
     """
     os.remove(self.getJobFileName()) #This is the atomic operation, if this file is not present the job is deleted.
     dirToRemove = self.jobDir
     while 1:
         head, tail = os.path.split(dirToRemove)
         if re.match("t[0-9]+$", tail):
             command = "rm -rf %s" % dirToRemove
         else:
             command = "rm -rf %s/*" % dirToRemove #We're at the root
         try:
             system(command)
         except RuntimeError:
             pass #This is not a big deal, as we expect collisions
         dirToRemove = head
         try:
             if len(os.listdir(dirToRemove)) != 0:
                 break
         except os.error: #In case stuff went wrong, but as this is not critical we let it slide
             break
Example n. 48
    def run(self):
        AbstractAnalysis.run(self)  #Call base method to do some logging
        refSequences = getFastaDictionary(
            self.referenceFastaFile)  #Hash of names to sequences
        readSequences = getFastqDictionary(
            self.readFastqFile)  #Hash of names to sequences
        sam = pysam.Samfile(self.samFile, "r")
        indelCounters = map(lambda aR: IndelCounter(
            sam.getrname(aR.rname), refSequences[sam.getrname(
                aR.rname)], aR.qname, readSequences[aR.qname], aR),
                            samIterator(sam))  #Iterate on the sam lines
        sam.close()
        #Write out the substitution info
        if len(indelCounters) > 0:
            indelXML = getAggregateIndelStats(indelCounters)
            open(os.path.join(self.outputDir, "indels.xml"),
                 "w").write(prettyXml(indelXML))
            tmp = open(os.path.join(self.outputDir, "indels.tsv"), "w")
            #build list of data as vectors
            data_list = []
            var = [
                "readInsertionLengths", "readDeletionLengths",
                "ReadSequenceLengths", "NumberReadInsertions",
                "NumberReadDeletions", "MedianReadInsertionLengths",
                "MedianReadDeletionLengths"
            ]
            for x in var:
                data_list.append([x] + indelXML.attrib[x].split())
            #transpose so each variable becomes a column; R loads the file far faster in this orientation
            data_list = map(None, *data_list)
            for line in data_list:
                tmp.write("\t".join(map(str, line)))
                tmp.write("\n")
            tmp.close()
            system("Rscript nanopore/analyses/indelPlots.R {} {}".format(
                os.path.join(self.outputDir, "indels.tsv"),
                os.path.join(self.outputDir, "indel_plots.pdf")))

        self.finish()  #Indicates the batch is done
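The map(None, *data_list) transpose used above is a Python 2 idiom that pads ragged rows with None; a sketch of the same operation spelled explicitly (izip_longest is the Python 2 name, itertools.zip_longest under Python 3):

from itertools import izip_longest

def transpose_ragged(rows):
    # Pads shorter rows with None so every output row has the same length,
    # matching the behaviour of map(None, *rows).
    return [list(col) for col in izip_longest(*rows, fillvalue=None)]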
Example n. 49
 def run(self, kmer=5):
     AbstractAnalysis.run(self) #Call base method to do some logging
     refSequences = getFastaDictionary(self.referenceFastaFile) #Hash of names to sequences
     readSequences = getFastqDictionary(self.readFastqFile) #Hash of names to sequences
     sM = SubstitutionMatrix() #The thing to store the counts in
     sam = pysam.Samfile(self.samFile, "r" )
     for aR in samIterator(sam): #Iterate on the sam lines
         for aP in AlignedPair.iterator(aR, refSequences[sam.getrname(aR.rname)], readSequences[aR.qname]): #Walk through the matches mismatches:
             sM.addAlignedPair(aP.getRefBase(), aP.getReadBase())
     sam.close()
     #Write out the substitution info
     open(os.path.join(self.outputDir, "substitutions.xml"), 'w').write(prettyXml(sM.getXML()))
     bases = "ACGT"
     outf = open(os.path.join(self.outputDir, "subst.tsv"), "w")
     outf.write("A\tC\tG\tT\n")
     for x in bases:
         freqs = sM.getFreqs(x, bases)
         outf.write("{}\t{}\n".format(x, "\t".join(map(str,freqs)), "\n"))
     outf.close()
     analysis = self.outputDir.split("/")[-2].split("_")[-1] + "_Substitution_Levels"
     system("Rscript nanopore/analyses/substitution_plot.R {} {} {}".format(os.path.join(self.outputDir, "subst.tsv"), os.path.join(self.outputDir, "substitution_plot.pdf"), analysis))        
     self.finish()
 def run(self):
     AbstractAnalysis.run(self)
     readSequences = getFastqDictionary(self.readFastqFile)
     nr = re.compile(r"channel_[0-9]+_read_[0-9]+")
     per_channel_read_counts = Counter([
         int(x.split("_")[1]) for x in readSequences.iterkeys()
         if re.match(nr, x)
     ])
     sam = pysam.Samfile(self.samFile, "r")
     mapped_read_counts = Counter([
         int(aR.qname.split("_")[1]) for aR in samIterator(sam)
         if re.match(nr, aR.qname) and aR.is_unmapped is False
     ])
     if len(mapped_read_counts) > 0 and len(per_channel_read_counts) > 0:
         outf = open(
             os.path.join(self.outputDir, "channel_mappability.tsv"), "w")
         outf.write("Channel\tReadCount\tMappableReadCount\n")
         max_channel = max(513, max(per_channel_read_counts.keys()) + 1)  #in case there are more than 512 in the future
         for channel in xrange(1, max_channel):
             outf.write("\t".join(
                 map(str, [
                     channel, per_channel_read_counts[channel],
                     mapped_read_counts[channel]
                 ])))
             outf.write("\n")
         outf.close()
         system("Rscript nanopore/analyses/channel_plots.R {} {} {} {} {}".
                format(
                    os.path.join(self.outputDir, "channel_mappability.tsv"),
                    os.path.join(self.outputDir, "channel_mappability.pdf"),
                    os.path.join(self.outputDir,
                                 "channel_mappability_sorted.png"),
                    os.path.join(self.outputDir,
                                 "mappability_levelplot.png"),
                    os.path.join(self.outputDir,
                                 "mappability_leveplot_percent.png")))
     self.finish()
Example n. 51
    def run(self):
        os.chdir(self.directory)

        system(
            "rm -rf real* null* OCCAM__* background.R LAYOUT/*.params LAYOUT/real_results.* LAYOUT/null_results.* LAYOUT/*.tab LAYOUT/NULL_*"
        )
        if self.outputZip is not None:
            system("zip -r LAYOUT.zip LAYOUT")
            system("mv -f LAYOUT.zip %s" % (self.outputZip))
Example n. 52
    def run(self):
        layoutDir = "%s/LAYOUT" % (self.directory)
        os.chdir(layoutDir)

        ## aggregate null scores
        if self.nNulls > 0:
            phenotypeName = re.split("/", self.phenotypeFile)[-1]
            if not os.path.exists("null_results.%s.tab" %
                                  (self.occamPhenotype)):
                nullScores = {}
                for null in range(1, self.nNulls + 1):
                    if len(
                            retColumns(
                                "../OCCAM__%s__null_%s.tab/results.tab" %
                                (phenotypeName, null))) == 0:
                        ## this is an error right now
                        continue
                    nullScores["N%s" % (null)] = rCRSData(
                        "../OCCAM__%s__null_%s.tab/results.tab" %
                        (phenotypeName, null))[self.occamPhenotype]
                wCRSData("null_results.%s.tab" % (self.occamPhenotype),
                         nullScores)

        ## run pathmark
        system("%s %s -l %s.params -b \"%s\" -f %s -n real_results.all.tab" %
               (sys.executable, pathmarkExec, self.occamPhenotype,
                self.filterParams, self.occamPhenotype))
        if self.nNulls > 0:
            system(
                "%s %s -b \"%s\" -s %s.params -d NULL_%s null_results.%s.tab" %
                (sys.executable, pathmarkExec, self.filterParams,
                 self.occamPhenotype, self.occamPhenotype,
                 self.occamPhenotype))
            self.setFollowOnTarget(
                backgroundPATHMARK(self.occamPhenotype, self.nNulls,
                                   self.directory))
Example n. 53
    def run(self):
        os.chdir(self.cwd)
        system("rm -f params.txt")
        system("ln -s params%i.txt params.txt" % self.iteration)
        system("mkdir -p outputFiles")

        jfile = open("jobs.list", "r")
        for job in jfile:
            self.addChildTarget(ParadigmCmd(job, self.cwd))
        jfile.close()
        self.setFollowOnTarget(Merge(self.cwd))
Example n. 54
    def run(self):
        layoutDir = "%s/LAYOUT" % (self.directory)
        os.chdir(layoutDir)

        system("ls %s/*_nodrug.sif | %s %s > stats_%s.tab" %
               (self.occamPhenotype, sys.executable, statisticsExec,
                self.occamPhenotype))
        system(
            "ls NULL_%s/*_nodrug.sif | %s %s -c counts_NULL_%s.tab > stats_NULL_%s.tab"
            % (self.occamPhenotype, sys.executable, statisticsExec,
               self.occamPhenotype, self.occamPhenotype))
        system("../background.R %s" % (self.occamPhenotype))