def run(self, fileStore):
    """Fold the most recent outgroup alignments into the accumulated results.

    The method works on the first entry of ``self.outgroupSequenceIDs``:

    1. Trim that outgroup using its coverage by the most recent results,
       convert the alignments' outgroup coordinates, and record the
       trimmed outgroup in ``self.outgroupFragmentIDs``.
    2. Log the coverage of this outgroup on each (trimmed) ingroup.
    3. Convert the alignments' ingroup coordinates (skipped on the first
       round, when the ingroups are still untrimmed) and append them to
       the accumulated results file, updating ``self.outgroupResultsID``.
    4. Recompute and log cumulative coverage on every untrimmed ingroup,
       replacing ``self.ingroupCoverageIDs``.
    5. If more outgroups remain, trim the ingroups by the cumulative
       coverage and add a ``BlastFirstOutgroup`` child for the rest.

    Args:
        fileStore: job file store used to read/write global files, create
            local temp files and log to the master.

    Returns:
        When further outgroups remain, the ``rv()`` of the added child job
        (presumably a promise resolving to the same tuple shape -- confirm
        against ``BlastFirstOutgroup``). Otherwise the tuple
        ``(outgroupResultsID, outgroupFragmentIDs, ingroupCoverageIDs)``.

    Raises:
        IndexError: if ``self.outgroupSequenceIDs`` is empty.
    """
    # Only the first outgroup is processed here. The remaining ones are
    # handed to the child job by ID, so localizing them would only waste
    # disk and transfer time at every level of the recursion.
    outgroupSequenceFile = fileStore.readGlobalFile(
        self.outgroupSequenceIDs[0])
    mostRecentResultsFile = fileStore.readGlobalFile(
        self.mostRecentResultsID)

    # Trim outgroup, convert outgroup coordinates, and add to
    # outgroup fragments dir
    trimmedOutgroup = fileStore.getLocalTempFile()
    outgroupCoverage = fileStore.getLocalTempFile()
    calculateCoverage(outgroupSequenceFile, mostRecentResultsFile,
                      outgroupCoverage)
    # The windowSize and threshold are fixed at 1: anything more
    # and we will run into problems with alignments that aren't
    # covered in a matching trimmed sequence.
    trimSequences(outgroupSequenceFile, outgroupCoverage, trimmedOutgroup,
                  flanking=self.blastOptions.trimOutgroupFlanking,
                  windowSize=1, threshold=1)
    outgroupConvertedResultsFile = fileStore.getLocalTempFile()
    with open(outgroupConvertedResultsFile, 'w') as f:
        upconvertCoords(cigarPath=mostRecentResultsFile,
                        fastaPath=trimmedOutgroup,
                        contigNum=1,
                        outputFile=f)
    self.outgroupFragmentIDs.append(
        fileStore.writeGlobalFile(trimmedOutgroup))

    sequenceFiles = [fileStore.readGlobalFile(path)
                     for path in self.sequenceIDs]
    untrimmedSequenceFiles = [fileStore.readGlobalFile(path)
                              for path in self.untrimmedSequenceIDs]

    # The outgroup lengths do not depend on the ingroup, so measure them
    # once instead of once per ingroup.
    # NOTE(review): assumes sequenceLength depends only on file contents.
    trimmedOutgroupLength = sequenceLength(trimmedOutgroup)
    outgroupLength = sequenceLength(outgroupSequenceFile)

    # Report coverage of the latest outgroup on the trimmed ingroups.
    for trimmedIngroupSequence, ingroupSequence, ingroupName in zip(
            sequenceFiles, untrimmedSequenceFiles, self.ingroupNames):
        tmpIngroupCoverage = fileStore.getLocalTempFile()
        calculateCoverage(trimmedIngroupSequence, mostRecentResultsFile,
                          tmpIngroupCoverage)
        fileStore.logToMaster(
            ("Coverage on %s from outgroup #%d, %s: %s%% (current "
             "ingroup length %d, untrimmed length %d). "
             "Outgroup trimmed to %d bp from %d")
            % (ingroupName,
               self.outgroupNumber,
               self.outgroupNames[self.outgroupNumber - 1],
               percentCoverage(trimmedIngroupSequence, tmpIngroupCoverage),
               sequenceLength(trimmedIngroupSequence),
               sequenceLength(ingroupSequence),
               trimmedOutgroupLength,
               outgroupLength))

    # Convert the alignments' ingroup coordinates.
    ingroupConvertedResultsFile = fileStore.getLocalTempFile()
    if self.sequenceIDs == self.untrimmedSequenceIDs:
        # No need to convert ingroup coordinates on first run.
        shutil.copy(outgroupConvertedResultsFile,
                    ingroupConvertedResultsFile)
    else:
        cactus_call(parameters=["cactus_blast_convertCoordinates",
                                "--onlyContig1",
                                outgroupConvertedResultsFile,
                                ingroupConvertedResultsFile,
                                "1"])

    # Append the latest results to the accumulated outgroup coverage file
    if self.outgroupResultsID:
        # mutable=True because the local copy is appended to below.
        outgroupResultsFile = fileStore.readGlobalFile(
            self.outgroupResultsID, mutable=True)
    else:
        outgroupResultsFile = fileStore.getLocalTempFile()
    with open(ingroupConvertedResultsFile) as results:
        with open(outgroupResultsFile, 'a') as output:
            # Stream the copy; the results file may be too large to hold
            # in memory as a single string.
            shutil.copyfileobj(results, output)
    self.outgroupResultsID = fileStore.writeGlobalFile(outgroupResultsFile)

    # Report coverage of the all outgroup alignments so far on the ingroups.
    ingroupCoverageFiles = []
    self.ingroupCoverageIDs = []
    for ingroupSequence, ingroupName in zip(untrimmedSequenceFiles,
                                            self.ingroupNames):
        ingroupCoverageFile = fileStore.getLocalTempFile()
        calculateCoverage(
            sequenceFile=ingroupSequence,
            cigarFile=outgroupResultsFile,
            outputFile=ingroupCoverageFile,
            depthById=self.blastOptions.trimOutgroupDepth > 1)
        ingroupCoverageFiles.append(ingroupCoverageFile)
        self.ingroupCoverageIDs.append(
            fileStore.writeGlobalFile(ingroupCoverageFile))
        fileStore.logToMaster(
            "Cumulative coverage of %d outgroups on ingroup %s: %s"
            % (self.outgroupNumber, ingroupName,
               percentCoverage(ingroupSequence, ingroupCoverageFile)))

    if len(self.outgroupSequenceIDs) > 1:
        # Trim ingroup seqs and recurse on the next outgroup.
        trimmedSeqs = []
        # Use the accumulated results so far to trim away the
        # aligned parts of the ingroups.
        for i, sequenceFile in enumerate(untrimmedSequenceFiles):
            outgroupCoverageFile = ingroupCoverageFiles[i]
            if self.blastOptions.keepParalogs:
                # NOTE(review): nothing in this method writes to
                # selfCoverageFile before it is subtracted, so as written
                # an empty file is passed to subtractBed. Confirm whether
                # a self-coverage computation was intended here.
                selfCoverageFile = fileStore.getLocalTempFile()
                coverageFile = fileStore.getLocalTempFile()
                subtractBed(outgroupCoverageFile, selfCoverageFile,
                            coverageFile)
            else:
                coverageFile = outgroupCoverageFile

            trimmed = fileStore.getLocalTempFile()
            trimSequences(sequenceFile, coverageFile, trimmed,
                          complement=True,
                          flanking=self.blastOptions.trimFlanking,
                          minSize=self.blastOptions.trimMinSize,
                          threshold=self.blastOptions.trimThreshold,
                          windowSize=self.blastOptions.trimWindowSize,
                          depth=self.blastOptions.trimOutgroupDepth)
            trimmedSeqs.append(trimmed)
        trimmedSeqIDs = [fileStore.writeGlobalFile(path, cleanup=True)
                         for path in trimmedSeqs]
        return self.addChild(BlastFirstOutgroup(
            ingroupNames=self.ingroupNames,
            untrimmedSequenceIDs=self.untrimmedSequenceIDs,
            sequenceIDs=trimmedSeqIDs,
            outgroupNames=self.outgroupNames,
            outgroupSequenceIDs=self.outgroupSequenceIDs[1:],
            outgroupFragmentIDs=self.outgroupFragmentIDs,
            outgroupResultsID=self.outgroupResultsID,
            blastOptions=self.blastOptions,
            outgroupNumber=self.outgroupNumber + 1,
            ingroupCoverageIDs=self.ingroupCoverageIDs)).rv()
    else:
        # Finally, put the ingroups and outgroups results together
        return (self.outgroupResultsID, self.outgroupFragmentIDs,
                self.ingroupCoverageIDs)
def run(self, fileStore):
    """Fold the most recent outgroup alignments into the accumulated results.

    The method works on the first entry of ``self.outgroupSequenceIDs``:

    1. Trim that outgroup using its coverage by the most recent results,
       convert the alignments' outgroup coordinates, and record the
       trimmed outgroup in ``self.outgroupFragmentIDs``.
    2. Log the coverage of this outgroup on each (trimmed) ingroup.
    3. Convert the alignments' ingroup coordinates (skipped on the first
       round, when the ingroups are still untrimmed) and append them to
       the accumulated results file, updating ``self.outgroupResultsID``.
    4. Recompute and log cumulative coverage on every untrimmed ingroup,
       replacing ``self.ingroupCoverageIDs``.
    5. If more outgroups remain, trim the ingroups by the cumulative
       coverage and add a ``BlastFirstOutgroup`` child for the rest.

    Args:
        fileStore: job file store used to read/write global files, create
            local temp files and log to the master.

    Returns:
        When further outgroups remain, the ``rv()`` of the added child job
        (presumably a promise resolving to the same tuple shape -- confirm
        against ``BlastFirstOutgroup``). Otherwise the tuple
        ``(outgroupResultsID, outgroupFragmentIDs, ingroupCoverageIDs)``.

    Raises:
        IndexError: if ``self.outgroupSequenceIDs`` is empty.
    """
    # Only the first outgroup is processed here. The remaining ones are
    # handed to the child job by ID, so localizing them would only waste
    # disk and transfer time at every level of the recursion.
    outgroupSequenceFile = fileStore.readGlobalFile(
        self.outgroupSequenceIDs[0])
    mostRecentResultsFile = fileStore.readGlobalFile(
        self.mostRecentResultsID)

    # Trim outgroup, convert outgroup coordinates, and add to
    # outgroup fragments dir
    trimmedOutgroup = fileStore.getLocalTempFile()
    outgroupCoverage = fileStore.getLocalTempFile()
    calculateCoverage(outgroupSequenceFile, mostRecentResultsFile,
                      outgroupCoverage)
    # The windowSize and threshold are fixed at 1: anything more
    # and we will run into problems with alignments that aren't
    # covered in a matching trimmed sequence.
    trimSequences(outgroupSequenceFile, outgroupCoverage, trimmedOutgroup,
                  flanking=self.blastOptions.trimOutgroupFlanking,
                  windowSize=1, threshold=1)
    outgroupConvertedResultsFile = fileStore.getLocalTempFile()
    with open(outgroupConvertedResultsFile, 'w') as f:
        upconvertCoords(cigarPath=mostRecentResultsFile,
                        fastaPath=trimmedOutgroup,
                        contigNum=1,
                        outputFile=f)
    self.outgroupFragmentIDs.append(
        fileStore.writeGlobalFile(trimmedOutgroup))

    sequenceFiles = [fileStore.readGlobalFile(path)
                     for path in self.sequenceIDs]
    untrimmedSequenceFiles = [fileStore.readGlobalFile(path)
                              for path in self.untrimmedSequenceIDs]

    # The outgroup lengths do not depend on the ingroup, so measure them
    # once instead of once per ingroup.
    # NOTE(review): assumes sequenceLength depends only on file contents.
    trimmedOutgroupLength = sequenceLength(trimmedOutgroup)
    outgroupLength = sequenceLength(outgroupSequenceFile)

    # Report coverage of the latest outgroup on the trimmed ingroups.
    for trimmedIngroupSequence, ingroupSequence, ingroupName in zip(
            sequenceFiles, untrimmedSequenceFiles, self.ingroupNames):
        tmpIngroupCoverage = fileStore.getLocalTempFile()
        calculateCoverage(trimmedIngroupSequence, mostRecentResultsFile,
                          tmpIngroupCoverage)
        fileStore.logToMaster(
            ("Coverage on %s from outgroup #%d, %s: %s%% (current "
             "ingroup length %d, untrimmed length %d). "
             "Outgroup trimmed to %d bp from %d")
            % (ingroupName,
               self.outgroupNumber,
               self.outgroupNames[self.outgroupNumber - 1],
               percentCoverage(trimmedIngroupSequence, tmpIngroupCoverage),
               sequenceLength(trimmedIngroupSequence),
               sequenceLength(ingroupSequence),
               trimmedOutgroupLength,
               outgroupLength))

    # Convert the alignments' ingroup coordinates.
    ingroupConvertedResultsFile = fileStore.getLocalTempFile()
    if self.sequenceIDs == self.untrimmedSequenceIDs:
        # No need to convert ingroup coordinates on first run.
        shutil.copy(outgroupConvertedResultsFile,
                    ingroupConvertedResultsFile)
    else:
        cactus_call(parameters=["cactus_blast_convertCoordinates",
                                "--onlyContig1",
                                outgroupConvertedResultsFile,
                                ingroupConvertedResultsFile,
                                "1"])

    # Append the latest results to the accumulated outgroup coverage file
    if self.outgroupResultsID:
        # mutable=True because the local copy is appended to below.
        outgroupResultsFile = fileStore.readGlobalFile(
            self.outgroupResultsID, mutable=True)
    else:
        outgroupResultsFile = fileStore.getLocalTempFile()
    with open(ingroupConvertedResultsFile) as results:
        with open(outgroupResultsFile, 'a') as output:
            # Stream the copy; the results file may be too large to hold
            # in memory as a single string.
            shutil.copyfileobj(results, output)
    self.outgroupResultsID = fileStore.writeGlobalFile(outgroupResultsFile)

    # Report coverage of the all outgroup alignments so far on the ingroups.
    ingroupCoverageFiles = []
    self.ingroupCoverageIDs = []
    for ingroupSequence, ingroupName in zip(untrimmedSequenceFiles,
                                            self.ingroupNames):
        ingroupCoverageFile = fileStore.getLocalTempFile()
        calculateCoverage(
            sequenceFile=ingroupSequence,
            cigarFile=outgroupResultsFile,
            outputFile=ingroupCoverageFile,
            depthById=self.blastOptions.trimOutgroupDepth > 1)
        ingroupCoverageFiles.append(ingroupCoverageFile)
        self.ingroupCoverageIDs.append(
            fileStore.writeGlobalFile(ingroupCoverageFile))
        fileStore.logToMaster(
            "Cumulative coverage of %d outgroups on ingroup %s: %s"
            % (self.outgroupNumber, ingroupName,
               percentCoverage(ingroupSequence, ingroupCoverageFile)))

    if len(self.outgroupSequenceIDs) > 1:
        # Trim ingroup seqs and recurse on the next outgroup.
        trimmedSeqs = []
        # Use the accumulated results so far to trim away the
        # aligned parts of the ingroups.
        for i, sequenceFile in enumerate(untrimmedSequenceFiles):
            outgroupCoverageFile = ingroupCoverageFiles[i]
            if self.blastOptions.keepParalogs:
                # NOTE(review): nothing in this method writes to
                # selfCoverageFile before it is subtracted, so as written
                # an empty file is passed to subtractBed. Confirm whether
                # a self-coverage computation was intended here.
                selfCoverageFile = fileStore.getLocalTempFile()
                coverageFile = fileStore.getLocalTempFile()
                subtractBed(outgroupCoverageFile, selfCoverageFile,
                            coverageFile)
            else:
                coverageFile = outgroupCoverageFile

            trimmed = fileStore.getLocalTempFile()
            trimSequences(sequenceFile, coverageFile, trimmed,
                          complement=True,
                          flanking=self.blastOptions.trimFlanking,
                          minSize=self.blastOptions.trimMinSize,
                          threshold=self.blastOptions.trimThreshold,
                          windowSize=self.blastOptions.trimWindowSize,
                          depth=self.blastOptions.trimOutgroupDepth)
            trimmedSeqs.append(trimmed)
        trimmedSeqIDs = [fileStore.writeGlobalFile(path, cleanup=True)
                         for path in trimmedSeqs]
        return self.addChild(BlastFirstOutgroup(
            ingroupNames=self.ingroupNames,
            untrimmedSequenceIDs=self.untrimmedSequenceIDs,
            sequenceIDs=trimmedSeqIDs,
            outgroupNames=self.outgroupNames,
            outgroupSequenceIDs=self.outgroupSequenceIDs[1:],
            outgroupFragmentIDs=self.outgroupFragmentIDs,
            outgroupResultsID=self.outgroupResultsID,
            blastOptions=self.blastOptions,
            outgroupNumber=self.outgroupNumber + 1,
            ingroupCoverageIDs=self.ingroupCoverageIDs)).rv()
    else:
        # Finally, put the ingroups and outgroups results together
        return (self.outgroupResultsID, self.outgroupFragmentIDs,
                self.ingroupCoverageIDs)