def _execute(self, starttime, endtime, maxSelect):
    """
    Execute the select command against the Billing DB and return the
    results (possibly summarized).

    It is guaranteed this function will return an endtime greater than
    the starttime, but not guaranteed by how much.

    Note on the time returned as the first part of the tuple — we
    guarantee two things:
        a) returned time is strictly greater than starttime;
        b) we return *all* records in the interval [starttime, return time).
    We do not guarantee that return time == parameter endtime, so it is
    suitable to use as the start time of the next select query.  To do
    this, we reduce the range until it reaches 1 second or the query
    returns fewer than maxSelect results.  If the interval is one second
    and the query still returns maxSelect results, we extend the limit of
    the query until all records fit.

    @param starttime: Datetime object for the start of the query interval.
    @param endtime: Datetime object for the end of the query interval.
    @param maxSelect: The maximum number of rows to select.
    @return: Tuple of (a time greater than all returned records, results).
    """
    assert starttime < endtime
    # Full interval length in seconds.  NOTE: the original code used
    # (endtime-starttime).seconds here, which ignores .days and wraps at
    # 86400 — an interval of exactly one day reported 0 seconds and could
    # trigger the fatal error below spuriously.  Use days+seconds instead,
    # matching the computation already done further down.
    diff = endtime - starttime
    interval = diff.days * 86400 + diff.seconds
    if (maxSelect > MAX_SELECT) and (interval <= MIN_RANGE):
        raise Exception("Fatal error - more than %i transfers in %i"
                        " second(s)." % (MAX_SELECT, interval))
    datestr = str(starttime)
    datestr_end = str(endtime)
    # Query the database.  If it takes more than MAX_QUERY_TIME_SECS, then
    # have the probe self-destruct.
    query = BILLINGDB_SELECT_CMD % ((datestr, datestr_end, datestr,
                                     datestr_end, maxSelect))
    self._log.debug('_sendToGratia: will execute ' + query)
    select_time = -time.time()
    if not TestContainer.isTest():
        self._cur.execute(query)
        result = self._cur.fetchall()
    else:
        result = BillingRecSimulator.execute(query)
    select_time += time.time()
    if select_time > MAX_QUERY_TIME_SECS:
        raise Exception("Postgres query took %i seconds, more than "
                        "the maximum allowable of %i; this is a sign the DB is "
                        "not properly optimized!" % (int(select_time),
                                                     MAX_QUERY_TIME_SECS))
    self._log.debug("BillingDB query finished in %.02f seconds and "
                    "returned %i records." % (select_time, len(result)))
    if not result:
        self._log.debug("No results from %s to %s." % (starttime, endtime))
        return endtime, result
    # dCache sometimes returns a negative transfer size; when this happens,
    # it also tosses up a complete garbage duration.
    filtered_result = []
    for row in result:
        row = dict(row)
        if row['transfersize'] < 0:
            row['transfersize'] = 0
            row['connectiontime'] = 0
        filtered_result.append(row)
    result = filtered_result
    # If we hit our limit, there's no telling how many identical records
    # there are on the final millisecond; we must re-query with a smaller
    # interval or a higher limit on the select.
    if len(result) == maxSelect:
        # Ensure that self._range is such that we always end up on a minute
        # boundary (eventually).  Whenever we decrease the interval size it
        # is guaranteed to be a multiple of what's left of the interval to
        # the next minute.  I.e. the transitions are:
        #   60s -> 30s
        #   30s -> 15s (which can only happen at :30s)
        #   15s ->  5s (which can only happen at :15s :30s or :45s)
        #    5s ->  1s
        if interval > 60:
            new_interval = 60
        elif interval > 30:
            new_interval = 30
        elif interval > 15:
            new_interval = 15
        elif interval > 5:
            new_interval = 5
        else:
            new_interval = 1
        new_endtime = starttime + datetime.timedelta(0, new_interval)
        # Guard against the DST jump by making sure new_endtime > starttime.
        if (interval == new_interval) or (new_interval == 0) or \
                (new_endtime <= starttime):
            self._log.warning("Limit hit; increasing from %i to %i." %
                              (maxSelect, maxSelect * 2))
            endtime, result = self._execute(starttime, endtime, maxSelect * 2)
            assert endtime > starttime
            return endtime, result
        else:
            self._log.warning("Limit hit; decreasing time interval from %i"
                              " to %i." % (interval, new_interval))
            self._range = new_interval
            endtime, result = self._execute(starttime, new_endtime, maxSelect)
            assert endtime > starttime
            return endtime, result
    return endtime, result
    # NOTE(review): tail of dumpStatistics(log) — its `def` line falls
    # outside this chunk.  In test mode it logs aggregate statistics for
    # the records sent to Gratia and for the simulator-generated records;
    # outside test mode it returns immediately.
    global recordsToSend
    if (not TEST): return
    log.info("Send to gratia:")
    dump(log, createStatistics(recordsToSend))
    log.info("Generated:")
    dump(log, createStatistics(BillingRecSimulator.sqlTableContent))

def dump(log, (overall, initiator, errorcode, totalRecords)):
    # Log one line per component of a statistics tuple as produced by
    # createStatistics() (Python 2 tuple-unpacking parameter syntax).
    log.info("Overall %s" % overall)
    log.info("initiator %s" % initiator)
    log.info("errorcode %s" % errorcode)
    log.info("num records %s" % totalRecords)

if __name__ == "__main__":
    # Standalone test driver: generate simulated billing records, then show
    # statistics before and after collapsing identical records within a
    # time bin (the listed numeric fields are summed by the aggregator).
    recordsToSend = BillingRecSimulator.generateTableContent()
    print "Pre aggregation"
    print createStatistics(recordsToSend)
    recordsToSend = Collapse.collapse(
        recordsToSend, TimeBinRange.DictRecordAggregator(
            ['initiator', 'client', 'protocol', 'errorcode', 'isnew'],
            ['njobs', 'transfersize', 'connectiontime']))
    print "Post Aggregation"
    print createStatistics(recordsToSend)
    # NOTE(review): tail of an enclosing function whose `def` line is
    # outside this chunk (presumably createStatistics) — it returns the
    # accumulated `sum` value.
    return sum

def dumpStatistics(log):
    # In test mode, log aggregate statistics for the records sent to
    # Gratia and for the simulator-generated records; no-op otherwise.
    global recordsToSend
    if ( not TEST ): return
    log.info("Send to gratia:")
    dump(log,createStatistics(recordsToSend))
    log.info("Generated:")
    dump(log,createStatistics(BillingRecSimulator.sqlTableContent))

def dump(log,(overall,initiator,errorcode,totalRecords)):
    # Log one line per component of a statistics tuple as produced by
    # createStatistics() (Python 2 tuple-unpacking parameter syntax).
    log.info("Overall %s" % overall)
    log.info("initiator %s"% initiator)
    log.info("errorcode %s" % errorcode)
    log.info("num records %s" % totalRecords)

if __name__ == "__main__":
    # Standalone test driver: generate simulated billing records, then show
    # statistics before and after collapsing identical records within a
    # time bin (the listed numeric fields are summed by the aggregator).
    recordsToSend = BillingRecSimulator.generateTableContent()
    print "Pre aggregation"
    print createStatistics(recordsToSend)
    recordsToSend = Collapse.collapse(recordsToSend,TimeBinRange.DictRecordAggregator(['initiator','client', 'protocol','errorcode','isnew' ],['njobs','transfersize','connectiontime']))
    print "Post Aggregation"
    print createStatistics(recordsToSend)
def _execute(self, starttime, endtime, maxSelect):
    """
    Execute the select command against the Billing DB and return the
    results (possibly summarized).

    It is guaranteed this function will return an endtime greater than
    the starttime, but not guaranteed by how much.

    Note on the time returned as the first part of the tuple — we
    guarantee two things:
        a) returned time is strictly greater than starttime;
        b) we return *all* records in the interval [starttime, return time).
    We do not guarantee that return time == parameter endtime, so it is
    suitable to use as the start time of the next select query.  To do
    this, we reduce the range until it reaches 1 second or the query
    returns fewer than maxSelect results.  If the interval is one second
    and the query still returns maxSelect results, we extend the limit of
    the query until all records fit.

    @param starttime: Datetime object for the start of the query interval.
    @param endtime: Datetime object for the end of the query interval.
    @param maxSelect: The maximum number of rows to select.
    @return: Tuple of (a time greater than all returned records, results).
    """
    assert starttime < endtime
    # Full interval length in seconds.  NOTE: the original code used
    # (endtime-starttime).seconds here, which ignores .days and wraps at
    # 86400 — an interval of exactly one day reported 0 seconds and could
    # trigger the fatal error below spuriously.  Use days+seconds instead,
    # matching the computation already done further down.
    diff = endtime - starttime
    interval = diff.days * 86400 + diff.seconds
    if (maxSelect > MAX_SELECT) and (interval <= MIN_RANGE):
        raise Exception("Fatal error - more than %i transfers in %i"
                        " second(s)." % (MAX_SELECT, interval))
    datestr = str(starttime)
    datestr_end = str(endtime)
    # Query the database.  If it takes more than MAX_QUERY_TIME_SECS, then
    # have the probe self-destruct.
    query = BILLINGDB_SELECT_CMD % ((datestr, datestr_end, datestr,
                                     datestr_end, maxSelect))
    self._log.debug('_sendToGratia: will execute ' + query)
    select_time = -time.time()
    if not TestContainer.isTest():
        self._cur.execute(query)
        result = self._cur.fetchall()
    else:
        result = BillingRecSimulator.execute(query)
    select_time += time.time()
    if select_time > MAX_QUERY_TIME_SECS:
        raise Exception("Postgres query took %i seconds, more than "
                        "the maximum allowable of %i; this is a sign the DB is "
                        "not properly optimized!" % (int(select_time),
                                                     MAX_QUERY_TIME_SECS))
    self._log.debug("BillingDB query finished in %.02f seconds and "
                    "returned %i records." % (select_time, len(result)))
    if not result:
        self._log.debug("No results from %s to %s." % (starttime, endtime))
        return endtime, result
    # dCache sometimes returns a negative transfer size; when this happens,
    # it also tosses up a complete garbage duration.
    filtered_result = []
    for row in result:
        row = dict(row)
        if row['transfersize'] < 0:
            row['transfersize'] = 0
            row['connectiontime'] = 0
        filtered_result.append(row)
    result = filtered_result
    # If we hit our limit, there's no telling how many identical records
    # there are on the final millisecond; we must re-query with a smaller
    # interval or a higher limit on the select.
    if len(result) == maxSelect:
        # Ensure that self._range is such that we always end up on a minute
        # boundary (eventually).  Whenever we decrease the interval size it
        # is guaranteed to be a multiple of what's left of the interval to
        # the next minute.  I.e. the transitions are:
        #   60s -> 30s
        #   30s -> 15s (which can only happen at :30s)
        #   15s ->  5s (which can only happen at :15s :30s or :45s)
        #    5s ->  1s
        if interval > 60:
            new_interval = 60
        elif interval > 30:
            new_interval = 30
        elif interval > 15:
            new_interval = 15
        elif interval > 5:
            new_interval = 5
        else:
            new_interval = 1
        new_endtime = starttime + datetime.timedelta(0, new_interval)
        # Guard against the DST jump by making sure new_endtime > starttime.
        if (interval == new_interval) or (new_interval == 0) or \
                (new_endtime <= starttime):
            self._log.warning("Limit hit; increasing from %i to %i." %
                              (maxSelect, maxSelect * 2))
            endtime, result = self._execute(starttime, endtime, maxSelect * 2)
            assert endtime > starttime
            return endtime, result
        else:
            self._log.warning("Limit hit; decreasing time interval from %i"
                              " to %i." % (interval, new_interval))
            self._range = new_interval
            endtime, result = self._execute(starttime, new_endtime, maxSelect)
            assert endtime > starttime
            return endtime, result
    return endtime, result