def sendBillingInfoRecordsToGratia(self):
    """
    Public entry point that drives the dCache-transfer reporting.

    Queries begin at the later of the last checkpoint and the hour boundary
    found _maxAge days before now.  They proceed window by window until the
    window start reaches the cutoff: 75 minutes before now, or the
    TestContainer end time when running in test mode.

    Without summarizing: the rows _execute returns for a window are handed
    to _processResults whenever there are any, and the checkpoint is
    committed at that window's end time.

    With summarizing: rows are collapsed after every query and held back
    until the following window would end past the pending summary time.
    The collapsed records are then sent and the checkpoint is committed at
    that summary time.

    The loop also stops early, after closing the cursor and connection,
    as soon as the stop file exists.
    """
    self._log.debug("sendBillingInfoRecordsToGratia")

    # Oldest moment we are willing to query, truncated to the top of the hour.
    age_limit = datetime.datetime.now() - datetime.timedelta(self._maxAge, 0)
    age_limit = age_limit.replace(minute=0, second=0, microsecond=0)

    # Windows starting later than this are not queried, so that only
    # complete intervals are read.
    cutoff = datetime.datetime.now() - datetime.timedelta(minutes=75)
    if TestContainer.isTest():
        cutoff = TestContainer.getEndDateTime()

    # Resume from the checkpoint unless it is older than the age limit.
    window_start = max(self._BIcheckpoint.lastDateStamp(), age_limit)
    self._log.info("Starting queries at time %s." % window_start)

    aggregator = TimeBinRange.DictRecordAggregator(DCACHE_AGG_FIELDS,
                                                   DCACHE_SUM_FIELDS)

    summary_due = self._determineNextEndtime(window_start, summary=True)
    if self._summarize:
        self._log.debug("Next summary send time: %s." % summary_due)

    pending = []   # records collected but not yet sent
    fetched = 0    # raw rows read since the last send
    window_end = self._determineNextEndtime(window_start)

    while window_start < cutoff:
        assert window_start < window_end
        self._log.debug('sendBillingInfoRecordsToGratia: Processing '
                        'starting at %s.' % window_start)

        # _execute hands back where the next query has to start; the
        # assertion below insists that this is a step forward.
        following_start, batch = self._execute(window_start, window_end,
                                               self._maxSelect)
        pending += batch
        fetched += len(batch)
        if self._summarize:
            # Fold the new rows into the partial summary.
            pending = Collapse.collapse(pending, aggregator)
        assert following_start > window_start
        following_end = self._determineNextEndtime(following_start)

        if pending:
            if not self._summarize:
                # Unsummarized mode: ship the rows of every window.
                fetched = 0
                self._BIcheckpoint.createPending(window_end, '')
                self._processResults(pending)
                self._BIcheckpoint.commit()
                window_was_shrunk = self._range < STARTING_RANGE
                if window_was_shrunk and len(pending) * 4 < self._maxSelect:
                    self._range = STARTING_RANGE
                pending = []
            elif following_end > summary_due:
                # Summarized mode: ship once the next window would run
                # past the pending summary time.
                collapsed_away = fetched - len(pending)
                if collapsed_away:
                    ratio = float(fetched) / float(len(pending))
                    self._log.info("Aggregated %i of %i records for time "
                                   "interval ending in %s. %.1fx reduction." %
                                   (collapsed_away, fetched, summary_due,
                                    ratio))
                else:
                    self._log.debug("Unable to aggregate any of %i records"
                                    % fetched)
                fetched = 0
                self._BIcheckpoint.createPending(summary_due, '')
                self._processResults(pending)
                self._BIcheckpoint.commit()
                pending = []
                self._range = STARTING_RANGE

        summary_due = self._determineNextEndtime(following_start,
                                                 summary=True)
        window_start, window_end = following_start, following_end

        # A stop file asks us to quit between windows.
        if os.path.exists(self._stopFileName):
            # Neha - 03/17/2011: nothing to commit, since only selects
            # (no inserts or updates) have been issued.
            self._cur.close()
            self._connection.close()
            break
# NOTE(review): this `return sum` is the tail of a definition whose start is
# not visible here; it is kept exactly as found.
return sum

def dumpStatistics(log):
    """
    Log statistics for the records sent and for the simulator's table.

    Returns immediately when TEST is false.  Otherwise the result of
    createStatistics() is logged through dump() twice: once for the
    module-level recordsToSend, and once for
    BillingRecSimulator.sqlTableContent.
    """
    global recordsToSend
    if ( not TEST ):
        return
    log.info("Send to gratia:")
    dump(log,createStatistics(recordsToSend))
    log.info("Generated:")
    dump(log,createStatistics(BillingRecSimulator.sqlTableContent))

def dump(log,(overall,initiator,errorcode,totalRecords)):
    """
    Write one statistics 4-tuple to `log`, one info line per element.

    The second argument is unpacked in the signature (Python 2 tuple
    parameter), so it must be a sequence of exactly four items.
    """
    log.info("Overall %s" % overall)
    log.info("initiator %s"% initiator)
    log.info("errorcode %s" % errorcode)
    log.info("num records %s" % totalRecords)

# Script entry: print the statistics of the records returned by
# BillingRecSimulator.generateTableContent() before and after they are
# passed through Collapse.collapse().
if __name__ == "__main__":
    recordsToSend = BillingRecSimulator.generateTableContent()
    print "Pre aggregation"
    print createStatistics(recordsToSend)
    # presumably the first list names the grouping fields and the second the
    # fields that get summed -- TODO confirm against DictRecordAggregator
    recordsToSend = Collapse.collapse(recordsToSend,TimeBinRange.DictRecordAggregator(['initiator','client', 'protocol','errorcode','isnew' ],['njobs','transfersize','connectiontime']))
    print "Post Aggregation"
    print createStatistics(recordsToSend)
# NOTE(review): the statements down to `def dump` look like the body of a
# function whose `def` line lies outside this view; they are kept exactly as
# found.  They return early when TEST is false, and otherwise log the
# createStatistics() result for recordsToSend and for
# BillingRecSimulator.sqlTableContent through dump().
global recordsToSend
if (not TEST):
    return
log.info("Send to gratia:")
dump(log, createStatistics(recordsToSend))
log.info("Generated:")
dump(log, createStatistics(BillingRecSimulator.sqlTableContent))


def dump(log, (overall, initiator, errorcode, totalRecords)):
    """
    Write one statistics 4-tuple to `log`, one info line per element.

    The second argument is unpacked in the signature (Python 2 tuple
    parameter), so it must be a sequence of exactly four items.
    """
    log.info("Overall %s" % overall)
    log.info("initiator %s" % initiator)
    log.info("errorcode %s" % errorcode)
    log.info("num records %s" % totalRecords)


# Script entry: print the statistics of the records returned by
# BillingRecSimulator.generateTableContent() before and after they are
# passed through Collapse.collapse().
if __name__ == "__main__":
    recordsToSend = BillingRecSimulator.generateTableContent()
    print "Pre aggregation"
    print createStatistics(recordsToSend)
    # presumably the first list names the grouping fields and the second the
    # fields that get summed -- TODO confirm against DictRecordAggregator
    recordsToSend = Collapse.collapse(
        recordsToSend,
        TimeBinRange.DictRecordAggregator(
            ['initiator', 'client', 'protocol', 'errorcode', 'isnew'],
            ['njobs', 'transfersize', 'connectiontime']))
    print "Post Aggregation"
    print createStatistics(recordsToSend)
def runPipeline(sampleName, sampleDir, cleanup=False):
    """
    Run all scripts in the Dellingr pipeline on the specified sample

    Every stage (Trim, BWA, Collapse plus sort/retag, Call) is skipped when
    its "<Stage>_Complete" marker file already exists in <sampleDir>/config;
    the marker is created once the stage has finished.  A "Pipeline_Complete"
    marker is written at the end of the run.

    :param sampleName: A string containing the sample name, for status message updates
    :param sampleDir: A string containing the filepath to the base sample directory
    :param cleanup: A boolean indicating if temporary files should be deleted
    :return: None
    """

    def runBWA(configPath):
        """
        Aligns the reads in the specified FASTQ files using the Burrows-Wheeler aligner

        BWA's output is piped straight into "samtools sort", which writes the
        sorted BAM file named in the config.

        :param configPath: A string containing a filepath to a ini file listing bwa's parameters
        :return: None
        :raises KeyError: If a required argument is missing from the config file
        :raises subprocess.CalledProcessError: If BWA or samtools exits with a non-zero status
        """
        sys.stderr.write("\t".join(
            [printPrefix, time.strftime('%X'), "Running BWA...\n"]))

        # Read the arguments from the config file
        try:
            bwaConfig = ConfigObj(configPath)["bwa"]
        except KeyError:  # Thrown if the section is not labelled "bwa"
            sys.stderr.write(
                "ERROR: The config file '%s' does not appear to be a bwa config, as no section is labelled 'bwa'\n"
                % (configPath))
            # sys.exit rather than the site-provided exit(), which is only
            # meant for interactive sessions and may be absent
            sys.exit(1)

        # Parse the arguments from the config file in the required order
        try:
            bwaCommand = [
                bwaConfig["bwa"],
                "mem",
                bwaConfig["reference"],
                bwaConfig["input"][0],
                bwaConfig["input"][1],
            ]
            if bwaConfig["fastqComment"] == "True":
                # We need to append the barcode sequence to the output BAM file
                bwaCommand.insert(2, "-C")

            sortCommand = [
                bwaConfig["samtools"], "sort", "-O", "BAM", "-o",
                bwaConfig["output"]
            ]

            # To suppress BWA's status messages, we are going to buffer the stderr stream of every process into a variable
            # If BWA or a samtools task crashes (exit code != 0), we will print out everything that is buffered
            bwaStderr = []
            samtoolsStderr = []
            bwaCounter = 0
            try:
                bwaCom = subprocess.Popen(bwaCommand,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
                sortCom = subprocess.Popen(sortCommand,
                                           stdin=bwaCom.stdout,
                                           stderr=subprocess.PIPE)
                # Drop our copy of the pipe straight away, so that BWA
                # receives SIGPIPE if samtools exits early
                bwaCom.stdout.close()

                # Parse through the stderr lines of BWA and samtools, and buffer them as necessary
                # NOTE(review): the two streams are drained one after the
                # other; samtools' stderr is not read while BWA is running.
                for bwaLine in bwaCom.stderr:
                    bwaLine = bwaLine.decode("utf-8")
                    # If this line indicates the progress of BWA, print it out
                    if bwaLine.startswith("[M::mem_process_seqs]"):
                        bwaCounter += int(bwaLine.split(" ")[2])
                        sys.stderr.write("\t".join([
                            printPrefix,
                            time.strftime('%X'),
                            "Reads Processed:" + str(bwaCounter) + "\n"
                        ]))
                    bwaStderr.append(bwaLine)

                for samtoolsLine in sortCom.stderr:
                    samtoolsStderr.append(samtoolsLine.decode("utf-8"))

                bwaCom.wait()
                sortCom.wait()

                # i.e. Something crashed.  CalledProcessError needs the exit
                # status and the command; without them the constructor itself
                # fails with a TypeError.
                if bwaCom.returncode != 0:
                    raise subprocess.CalledProcessError(bwaCom.returncode,
                                                        bwaCommand)
                if sortCom.returncode != 0:
                    raise subprocess.CalledProcessError(sortCom.returncode,
                                                        sortCommand)

            except BaseException:
                # Either a program crashed, or something is hanging and the user has force quit
                # To be safe, print out debugging info
                sys.stderr.write(
                    "ERROR: BWA and Samtools encountered an unexpected error and were terminated"
                    + os.linesep)
                sys.stderr.write("BWA Standard Error Stream:" + os.linesep)
                sys.stderr.write("".join(bwaStderr))
                sys.stderr.write("Samtools Sort Standard Error Stream:" +
                                 os.linesep)
                sys.stderr.write("".join(samtoolsStderr))
                raise

            sys.stderr.write("\t".join(
                [printPrefix, time.strftime('%X'), "Mapping Complete\n"]))

        except KeyError:  # i.e. A required argument is missing from the config file
            sys.stderr.write(
                "ERROR: Unable to locate a required argument in the bwa config file '%s'\n"
                % (configPath))
            raise

    def sortAndRetag(inFile, outFile, bwaConfigPath):
        """
        Recalculate the MD and NM tags of the specified BAM file, sort it, and index the result

        :param inFile: A string containing a filepath to an input BAM file. Usually generated by clipOverlap
        :param outFile: A string containing an output filepath
        :param bwaConfigPath: A string containing a filepath to the BWA config file
        :return: None
        """
        # Parse the reference genome location from the BWA config file
        bwaConfig = ConfigObj(bwaConfigPath)["bwa"]
        refGenome = bwaConfig["reference"]

        calmdCom = ["samtools", "calmd", inFile, refGenome,
                    "-b"]  # Recalculate MD and NM tags
        sortCom = ["samtools", "sort", "-o", outFile]

        # To cleanup the terminal, we are going to buffer the stderr stream of every process into a variable
        # If a samtools task crashes (exit code != 0), we will print out everything that is buffered
        calmdStderr = []
        sortStderr = []

        calmdTask = subprocess.Popen(calmdCom,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)
        sortTask = subprocess.Popen(sortCom,
                                    stdin=calmdTask.stdout,
                                    stderr=subprocess.PIPE)
        # Drop our copy of the pipe straight away, so that calmd receives
        # SIGPIPE if sort exits early
        calmdTask.stdout.close()

        # Parse through the stderr lines of samtools, and buffer them as necessary
        for calmdLine in calmdTask.stderr:
            calmdStderr.append(calmdLine.decode("utf-8"))
        for sortLine in sortTask.stderr:
            sortStderr.append(sortLine.decode("utf-8"))

        calmdTask.wait()
        sortTask.wait()

        if calmdTask.returncode != 0 or sortTask.returncode != 0:  # i.e. Something crashed
            sys.stderr.write(
                "ERROR: Samtools encountered an unexpected error and was terminated\n"
            )
            # The buffered lines still carry their own newline, so they are
            # joined without a separator (as runBWA does)
            sys.stderr.write("Samtools calmd Standard Error Stream:\n")
            sys.stderr.write("".join(calmdStderr))
            sys.stderr.write("Samtools Sort Standard Error Stream:\n")
            sys.stderr.write("".join(sortStderr))
            sys.exit(1)

        # Finally, index the BAM file
        subprocess.check_call(["samtools", "index", outFile])

    printPrefix = "DELLINGR-MAIN\t\t" + sampleName
    sys.stderr.write("\t".join([
        printPrefix,
        time.strftime('%X'),
        "Processing Sample '%s'\n" % sampleName.rstrip()
    ]))

    # Run Trim
    trimDone = os.path.join(sampleDir, "config",
                            "Trim_Complete")  # Similar to Make's "TASK_COMPLETE" file
    if not os.path.exists(trimDone):  # i.e. Did Trim already complete for this sample? If so, do not re-run it
        trimConfig = os.path.join(sampleDir, "config",
                                  "trim_task.ini")  # Where is Trim's config file?
        # Is there a trim config file? If not, then we don't need to run trim, as the sample doesn't have barcodes
        if os.path.exists(trimConfig):
            trimPrintPrefix = "DELLINGR-TRIM\t\t" + sampleName
            Trim.main(sysStdin=["--config", trimConfig],
                      printPrefix=trimPrintPrefix)  # Actually run Trim
            # Signifies to the end user that this task completed
            open(trimDone, "w").close()

    # Run bwa
    bwaDone = os.path.join(sampleDir, "config", "BWA_Complete")
    if not os.path.exists(bwaDone):
        bwaConfig = os.path.join(sampleDir, "config", "bwa_task.ini")
        runBWA(bwaConfig)
        open(bwaDone, "w").close()

    # Run Collapse
    collapseDone = os.path.join(sampleDir, "config", "Collapse_Complete")
    if not os.path.exists(collapseDone):
        collapseConfig = os.path.join(sampleDir, "config",
                                      "collapse_task.ini")
        collapsePrintPrefix = "DELLINGR-COLLAPSE\t" + sampleName
        Collapse.main(sysStdin=["--config", collapseConfig],
                      printPrefix=collapsePrintPrefix)

        # Sort the collapse BAM output
        # Parse the config file for the output file name
        collapseConfArgs = ConfigObj(collapseConfig)
        sortInput = collapseConfArgs["collapse"]["output"]
        # Append "sort" as the output file name, and move it from tmp to results
        sortOutput = sortInput.replace(".bam", ".sort.bam")
        tmpDir = os.sep + "tmp" + os.sep
        resultsDir = os.sep + "results" + os.sep
        sortOutput = sortOutput.replace(tmpDir, resultsDir)
        sys.stderr.write("\t".join([
            printPrefix,
            time.strftime('%X'),
            "Sorting final BAM file, and recalculating tags...\n"
        ]))
        bwaConfig = os.path.join(sampleDir, "config", "bwa_task.ini")
        sortAndRetag(sortInput, sortOutput, bwaConfig)
        open(collapseDone, "w").close()

    # Run call (variant calling)
    callDone = os.path.join(sampleDir, "config", "Call_Complete")
    pipelineDone = os.path.join(sampleDir, "config", "Pipeline_Complete")
    if not os.path.exists(callDone):
        callConfig = os.path.join(sampleDir, "config", "call_task.ini")
        callPrintPrefix = "DELLINGR-CALL\t\t" + sampleName
        Call.main(sysStdin=["--config", callConfig],
                  printPrefix=callPrintPrefix)
        open(callDone, "w").close()

    # Mark this sample as fully processed
    open(pipelineDone, "w").close()

    # Cleanup intermediate files (if specified)
    if cleanup:
        tmpDir = os.path.join(sampleDir, "tmp")
        for tmpFile in os.listdir(tmpDir):
            os.remove(os.path.join(tmpDir, tmpFile))

    sys.stderr.write("\t".join([
        printPrefix,
        time.strftime('%X'),
        "%s: Pipeline Complete\n" % sampleName.rstrip()
    ]))
def sendBillingInfoRecordsToGratia(self):
    """
    Public entry point that drives the dCache-transfer reporting.

    Queries begin at the later of the last checkpoint and the hour boundary
    found _maxAge days before now.  They proceed window by window until the
    window start reaches the cutoff: 75 minutes before now, or the
    TestContainer end time when running in test mode.

    Without summarizing: the rows _execute returns for a window are handed
    to _processResults whenever there are any, and the checkpoint is
    committed at that window's end time.

    With summarizing: rows are collapsed after every query and held back
    until the following window would end past the pending summary time.
    The collapsed records are then sent and the checkpoint is committed at
    that summary time.

    The loop also stops early, after closing the cursor and connection,
    as soon as the stop file exists.
    """
    self._log.debug("sendBillingInfoRecordsToGratia")

    # Oldest moment we are willing to query, truncated to the top of the hour.
    age_limit = datetime.datetime.now() - datetime.timedelta(self._maxAge, 0)
    age_limit = age_limit.replace(minute=0, second=0, microsecond=0)

    # Windows starting later than this are not queried, so that only
    # complete intervals are read.
    cutoff = datetime.datetime.now() - datetime.timedelta(minutes=75)
    if TestContainer.isTest():
        cutoff = TestContainer.getEndDateTime()

    # Resume from the checkpoint unless it is older than the age limit.
    window_start = max(self._BIcheckpoint.lastDateStamp(), age_limit)
    self._log.info("Starting queries at time %s." % window_start)

    aggregator = TimeBinRange.DictRecordAggregator(DCACHE_AGG_FIELDS,
                                                   DCACHE_SUM_FIELDS)

    summary_due = self._determineNextEndtime(window_start, summary=True)
    if self._summarize:
        self._log.debug("Next summary send time: %s." % summary_due)

    pending = []   # records collected but not yet sent
    fetched = 0    # raw rows read since the last send
    window_end = self._determineNextEndtime(window_start)

    while window_start < cutoff:
        assert window_start < window_end
        self._log.debug('sendBillingInfoRecordsToGratia: Processing '
                        'starting at %s.' % window_start)

        # _execute hands back where the next query has to start; the
        # assertion below insists that this is a step forward.
        following_start, batch = self._execute(window_start, window_end,
                                               self._maxSelect)
        pending += batch
        fetched += len(batch)
        if self._summarize:
            # Fold the new rows into the partial summary.
            pending = Collapse.collapse(pending, aggregator)
        assert following_start > window_start
        following_end = self._determineNextEndtime(following_start)

        if pending:
            if not self._summarize:
                # Unsummarized mode: ship the rows of every window.
                fetched = 0
                self._BIcheckpoint.createPending(window_end, '')
                self._processResults(pending)
                self._BIcheckpoint.commit()
                window_was_shrunk = self._range < STARTING_RANGE
                if window_was_shrunk and len(pending) * 4 < self._maxSelect:
                    self._range = STARTING_RANGE
                pending = []
            elif following_end > summary_due:
                # Summarized mode: ship once the next window would run
                # past the pending summary time.
                collapsed_away = fetched - len(pending)
                if collapsed_away:
                    ratio = float(fetched) / float(len(pending))
                    self._log.info("Aggregated %i of %i records for time "
                                   "interval ending in %s. %.1fx reduction." %
                                   (collapsed_away, fetched, summary_due,
                                    ratio))
                else:
                    self._log.debug("Unable to aggregate any of %i records"
                                    % fetched)
                fetched = 0
                self._BIcheckpoint.createPending(summary_due, '')
                self._processResults(pending)
                self._BIcheckpoint.commit()
                pending = []
                self._range = STARTING_RANGE

        summary_due = self._determineNextEndtime(following_start,
                                                 summary=True)
        window_start, window_end = following_start, following_end

        # A stop file asks us to quit between windows.
        if os.path.exists(self._stopFileName):
            # Neha - 03/17/2011: nothing to commit, since only selects
            # (no inserts or updates) have been issued.
            self._cur.close()
            self._connection.close()
            break