def del_table(tbl):
    related_tables = (tbl.options or {}).get('related_tables')
    for ref in (related_tables or {}).values():
        try:
            del_table(Table.from_ref(ref))
        except ObjectDoesNotExist:
            # already deleted
            pass

    Column.objects.filter(table=tbl.id).delete()
    Job.objects.filter(table=tbl.id).delete()

    for trigger in TriggerCache.filter(tbl):
        trigger.delete()

    for handler in ErrorHandlerCache.filter(tbl):
        handler.delete()

    # delete newly unreferenced routes
    Destination.objects.filter(trigger=None).delete()

    tables = (tbl.options or {}).get('tables')
    for ref in (tables or {}).values():
        try:
            del_table(Table.from_ref(ref))
        except ObjectDoesNotExist:
            # already deleted
            pass

    tbl.delete()

def analyze(self, jobs):
    criteria = self.job.criteria

    sharks_query_table = Table.from_ref(
        self.table.options.related_tables['basetable'])

    depjobs = {}

    # For every (shark, job), we spin off a new job to grab the data, then
    # merge everything into one dataframe at the end.
    for s in Device.objects.filter(module='netshark', enabled=True):
        shark = DeviceManager.get_device(s.id)

        for capjob in shark.get_capture_jobs():
            # Start with criteria from the primary table -- this gives us
            # endtime, duration and netshark_filterexpr.
            bytes_criteria = copy.copy(criteria)
            bytes_criteria.netshark_device = s.id
            bytes_criteria.netshark_source_name = 'jobs/' + capjob.name
            bytes_criteria.resolution = datetime.timedelta(0, 1)
            bytes_criteria.aggregated = True

            job = Job.create(table=sharks_query_table,
                             criteria=bytes_criteria)

            depjobs[job.id] = job

    return QueryContinue(self.collect, depjobs)

def analyze(self, jobs=None):
    download_table = Table.from_ref(
        self.table.options.related_tables['download_table'])

    # Create source and destination download jobs
    depjobs = {}

    c = self.job.criteria
    sharks = [
        ('1-source', c.netshark_device_src, c.netshark_source_name_src),
        ('2-dest', c.netshark_device_dst, c.netshark_source_name_dst)
    ]

    for shark in sharks:
        sc = copy.copy(c)
        name, device, source = shark
        sc.netshark_device = device
        sc.netshark_source_name = source
        sc.segment = name

        job = Job.create(table=download_table, criteria=sc,
                         update_progress=True, parent=self.job)
        logger.debug("Created %s: %s download job with criteria %s"
                     % (job, name, sc))
        depjobs[job.id] = job

    return QueryContinue(self.collect, depjobs)

def encode(source):
    """Normalize source values to hashable type for lookups."""
    # require a hashable object, see here for simple way to hash dicts:
    # http://stackoverflow.com/a/16162138/2157429
    from steelscript.appfwk.apps.datasource.models import Table
    return frozenset(Table.to_ref(source).itervalues())

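# Illustrative sketch (not part of the original source): the frozenset of
# the ref's values is what makes a dict-style table reference usable as a
# lookup key, per the Stack Overflow link above. With a hypothetical ref
# dict, the idea looks roughly like this:
#
#     ref = {'namespace': 'appfwk',
#            'sourcefile': 'reports.example_report',
#            'name': 'example-table'}
#     key = frozenset(ref.itervalues())     # hashable, order-insensitive
#     cache = {key: 'previously computed result'}
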
def analyze(self, jobs):
    criteria = self.job.criteria

    tzname = criteria.business_hours_tzname
    tz = pytz.timezone(tzname)

    times = jobs['times'].data()

    if times is None or len(times) == 0:
        return QueryComplete(None)

    basetable = Table.from_ref(
        self.table.options.related_tables['basetable']
    )

    # Create all the jobs
    depjobs = {}
    for i, row in times.iterrows():
        (t0, t1) = (row['starttime'], row['endtime'])
        sub_criteria = copy.copy(criteria)
        sub_criteria.starttime = t0.astimezone(tz)
        sub_criteria.endtime = t1.astimezone(tz)

        job = Job.create(table=basetable, criteria=sub_criteria,
                         update_progress=False, parent=self.job)

        logger.debug("Created %s: %s - %s" % (job, t0, t1))
        depjobs[job.id] = job

    return QueryContinue(self.collect, depjobs)

def analyze(self, jobs):
    criteria = self.job.criteria

    ar_query_table = Table.from_ref(
        self.table.options.related_tables['basetable']
    )

    depjobs = {}

    # For every (ar, job), we spin off a new job to grab the data, then
    # merge everything into one dataframe at the end.
    for s in Device.objects.filter(module='appresponse', enabled=True):
        ar = DeviceManager.get_device(s.id)

        for job in ar.capture.get_jobs():
            # Start with criteria from the primary table -- this gives us
            # endtime, duration and filterexpr.
            bytes_criteria = copy.copy(criteria)
            bytes_criteria.appresponse_device = s.id
            bytes_criteria.appresponse_source = 'jobs/' + job.id
            bytes_criteria.granularity = datetime.timedelta(0, 1)

            newjob = Job.create(table=ar_query_table,
                                criteria=bytes_criteria)

            depjobs[newjob.id] = newjob

    return QueryContinue(self.collect, depjobs)

def analyze(self, jobs):
    df = jobs['overall'].data()

    # First clear all the dynamic columns that were associated with the
    # table the last time the report was run; do not delete the time column.
    for col in self.table.get_columns():
        if col.name == 'time':
            continue
        col.delete()

    # Get the top N values of the value column
    val_col = self.table.options.value_column_name
    pivot_col = self.table.options.pivot_column_name
    n = self.table.options.n

    pivots = list(
        df.sort_values(val_col, ascending=False).head(n)[pivot_col])

    for pivot in pivots:
        # Add pivot column to the table
        AppResponseColumn.create(self.table, pivot, pivot)

    # Create an AppResponseTimeSeries Job
    self.job.criteria.pivot_column_names = ','.join(pivots)
    ts_table_ref = self.table.options.related_tables['ts']
    table = Table.from_ref(ts_table_ref)

    job = Job.create(table=table, criteria=self.job.criteria,
                     update_progress=False, parent=self.job)

    return QueryContinue(self.collect, jobs={'ts': job})

def get(self, request):
    """
    Retrieve records from time series data storage.

    The URL is formatted as '/db/records?handle=**&start=**&end=**'.

    Within the URL, required parameters are 'handle' and 'start';
    'end' is optional. Values for 'start' and 'end' should be
    epoch seconds.

    JSON results returned look like:

    [{"avg_bytes": 1617806.0, "time": "2017-03-24T18:14:00+00:00"},
     ...
    ]
    """
    request_data = request.GET.dict()

    keys = ['handle', 'start']
    for k in keys:
        if k not in request_data:
            msg = "Missing parameter '{}' in url".format(k)
            raise InvalidRequest(msg)

    handle = request_data['handle']
    try:
        obj = ExistingIntervals.objects.get(table_handle=handle)
    except ObjectDoesNotExist:
        msg = "Handle '{}' does not exist.".format(handle)
        raise NotFoundError(msg)

    tr = {}
    tr['gte'] = sec_string_to_datetime(int(request_data['start']))
    if 'end' in request_data:
        tr['lte'] = sec_string_to_datetime(int(request_data['end']))

    # Get the time column name
    table = Table.from_ref(
        dict(sourcefile=obj.sourcefile,
             namespace=obj.namespace,
             name=obj.table))

    timecols = [c for c in table.get_columns(iskey=True)
                if c.datatype == Column.DATATYPE_TIME]
    time_col_name = timecols[0].name

    col_filters = [
        ColumnFilter(query_type='range', query={time_col_name: tr})
    ]

    # allow for override via url param
    index = request_data.get('index', make_index(obj.namespace))

    records = storage.search(index=index, doc_type=handle,
                             col_filters=col_filters)

    return Response(records)

def collect(self, jobs=None):
    logger.debug("%s: bizhours.collect: %s" % (self, jobs))

    basetable = Table.from_ref(
        self.table.options.related_tables['basetable']
    )

    # collect all key names
    keynames = []
    istime = False
    for key in basetable.get_columns(iskey=True):
        keynames.append(key.name)
        if key.istime():
            istime = True

    # Now collect the data
    total_secs = 0
    dfs = []
    idx = 0
    for jid, job in jobs.iteritems():
        if job.status == Job.ERROR:
            raise AnalysisException("%s for %s-%s failed: %s" %
                                    (job,
                                     job.criteria.starttime,
                                     job.criteria.endtime,
                                     job.message))

        subdf = job.data()
        logger.debug("%s: returned %d rows" %
                     (job, len(subdf) if subdf is not None else 0))
        if subdf is None:
            continue

        logger.debug("%s: actual_criteria %s" % (job, job.actual_criteria))
        t0 = job.actual_criteria.starttime
        t1 = job.actual_criteria.endtime

        if not istime:
            subdf['__secs__'] = timedelta_total_seconds(t1 - t0)

        total_secs += timedelta_total_seconds(t1 - t0)
        idx += 1
        dfs.append(subdf)

    if len(dfs) == 0:
        return QueryComplete(None)

    df = pandas.concat(dfs, ignore_index=True)

    if not istime:
        if 'aggregate' in self.table.options:
            ops = self.table.options['aggregate']
            for col in basetable.get_columns(iskey=False):
                if col.name not in ops:
                    ops[col.name] = 'sum'
        else:
            ops = 'sum'

        df = avg_groupby_aggregate(df, keynames, ops,
                                   '__secs__', total_secs)

    return QueryComplete(df)

def analyze(self, jobs):
    # Based on the input pivot column names (e.g. CIFS, RTP, Facebook),
    # take the dataframe keyed by application ID and start time and
    # derive a dataframe keyed by start_time, with each row a
    # dictionary keyed by the input pivot values.
    df = jobs['base'].data()

    # First clear all the dynamic columns that were associated with the
    # table the last time the report was run; do not delete the time column.
    for col in self.table.get_columns():
        if col.name == 'time':
            continue
        col.delete()

    base_table = Table.from_ref(self.table.options.tables.base)

    time_col_name = None
    for col in base_table.get_columns():
        if col.datatype == Column.DATATYPE_TIME and col.iskey:
            time_col_name = col.name
            break

    if not time_col_name:
        raise AppResponseException("No key 'time' column defined "
                                   "in base table")

    pivot_column = self.table.options.pivot_column_name

    sub_dfs = []
    for pivot in self.job.criteria.pivot_column_names.split(','):
        # Add pivot column to the table
        pivot = pivot.strip()
        AppResponseColumn.create(self.table, pivot, pivot)

        # Add pivot column to the data frame
        sub_df = df[df[pivot_column] == pivot]

        # extract time column and value column
        sub_df = sub_df[[time_col_name,
                         self.table.options.value_column_name]]

        # Rename columns to 'time' and the pivot column name
        sub_df.rename(columns={time_col_name: u'time',
                               self.table.options.value_column_name: pivot},
                      inplace=True)

        sub_dfs.append(sub_df)

    df_final = reduce(
        lambda df1, df2: pandas.merge(df1, df2, on=u'time', how='outer'),
        sub_dfs)

    return QueryComplete(df_final)

def post_process_table(self, field_options):
    if field_options['copy_fields']:
        keywords = set()
        for i in ['tables', 'related_tables']:
            refs = self.options[i] or {}
            for ref in refs.values():
                table = Table.from_ref(ref)
                for f in table.fields.all():
                    if f.keyword not in keywords:
                        self.fields.add(f)
                        keywords.add(f.keyword)

def process_options(cls, table_options):
    # Handle direct ids, table references, or table classes
    # from the tables option and transform to a simple table reference.
    for i in ['tables', 'related_tables']:
        for k, v in (table_options[i] or {}).iteritems():
            table_options[i][k] = Table.to_ref(v)

    tf = table_options['function']
    if tf and not isinstance(tf, Function):
        table_options['function'] = Function(tf)

    return table_options

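# Illustrative sketch (not from the original source): Table.to_ref() and
# Table.from_ref() appear to round-trip a lightweight reference rather
# than a model instance. Judging from encode() above and the get() view in
# this section, that reference is assumed to be a small dict that
# from_ref() can resolve back to a Table, roughly:
#
#     ref = Table.to_ref(some_table)
#     # assumed shape:
#     # {'sourcefile': '...', 'namespace': '...', 'name': '...'}
#     same_table = Table.from_ref(ref)
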
def __init__(self, *args, **kwargs):
    super(AnalysisQuery, self).__init__(*args, **kwargs)

    self.ds_table = Table.from_ref(self.table.options.related_tables['ds'])

    self.time_col = [col.name for col in self.ds_table.get_columns()
                     if col.datatype == Column.DATATYPE_TIME][0]

    starttime, endtime, self.resolution = self._round_times()
    self.query_interval = TimeInterval(starttime, endtime)

    self.handle, self.no_time_criteria = self._calc_handle()

def __init__(self, *args, **kwargs):
    super(AnalysisQuery, self).__init__(*args, **kwargs)

    self.ds_table = Table.from_ref(self.table.options.related_tables['ds'])

    self.time_col = [col.name for col in self.ds_table.get_columns()
                     if col.datatype == Column.DATATYPE_TIME][0]

    starttime, endtime, self.resolution = self._round_times()
    self.query_interval = TimeInterval(starttime, endtime)

    self.handle, self.no_time_criteria = self._calc_handle()

    logger.debug('TimeSeriesQuery initialized - job: %s, table: %s, '
                 'interval: %s, handle: %s' %
                 (self.job, self.table, self.query_interval, self.handle))

def analyze(self, jobs):
    logger.debug('%s analyze - received jobs: %s' % (self, jobs))

    basetable = Table.from_ref(
        self.table.options['related_tables']['template']
    )

    data = jobs['source'].data()
    if data is None:
        return QueryError('No data available to analyze')

    # Find the column whose min/max deviates most from the mean,
    # then take the row from that column where the min/max occurs.
    if self.table.options['max']:
        idx = (data.max() / data.mean()).idxmax()
        frow = data.ix[data[idx].idxmax()]
    else:
        idx = (data.min() / data.mean()).idxmin()
        frow = data.ix[data[idx].idxmin()]

    # Get the time value from the extracted row to calculate new
    # start/end times.
    ftime = frow['time']
    duration = parse_timedelta(self.table.options['zoom_duration'])
    resolution = parse_timedelta(self.table.options['zoom_resolution'])

    stime = ftime - (duration / 2)
    etime = ftime + (duration / 2)

    criteria = self.job.criteria

    if 'resolution' in criteria:
        criteria['resolution'] = resolution
    else:
        criteria['granularity'] = resolution

    criteria['duration'] = duration
    criteria['_orig_duration'] = duration
    criteria['starttime'] = stime
    criteria['_orig_starttime'] = stime
    criteria['endtime'] = etime
    criteria['_orig_endtime'] = etime

    logger.debug('Creating FocusedAnalysis job with updated criteria %s'
                 % criteria)

    job = Job.create(basetable, criteria, self.job.update_progress)
    return QueryContinue(self.finish, {'job': job})

def run(self):
    # Collect all dependent tables
    tables = self.table.options.tables
    if not tables:
        return QueryContinue(self._analyze, {})

    logger.debug("%s: dependent tables: %s" % (self, tables))

    jobs = {}
    for (name, ref) in tables.items():
        table = Table.from_ref(ref)
        job = Job.create(table, self.job.criteria,
                         update_progress=self.job.update_progress,
                         parent=self.job)

        logger.debug("%s: dependent job %s" % (self, job))
        jobs[name] = job

    return QueryContinue(self._analyze, jobs)

def analyze(self, jobs):
    tag = Tag.objects.get(id=self.job.criteria.tag).name

    cmd_table = Table.from_ref(
        self.table.options.related_tables['base'])

    dep_jobs = {}
    for sh_db in Device.objects.filter_by_tag(tag, module='steelhead',
                                              enabled=True):
        criteria = copy.copy(self.job.criteria)
        criteria.dev = sh_db
        job = Job.create(table=cmd_table, criteria=criteria,
                         parent=self.job)
        dep_jobs[job.id] = job

    if not dep_jobs:
        return QueryError("No enabled steelhead "
                          "devices found with tag '{}'".format(tag))

    return QueryContinue(self.collect, jobs=dep_jobs)

def analyze(self, jobs=None):
    criteria = self.job.criteria

    if jobs:
        job = list(jobs.values())[0]
        if job.status == Job.ERROR:
            raise AnalysisException("%s for getting pcap file failed: %s"
                                    % (job, job.message))
        criteria.entire_pcap = True
        self.filename = job.data()['filename'][0]
    else:
        self.filename = criteria.pcapfilename

    pcap = PcapFile(self.filename)

    try:
        pcap_info = pcap.info()
    except ValueError:
        raise AnalysisException("No packets in %s" % self.filename)

    logger.debug("%s: File info %s" % (self.__class__.__name__, pcap_info))

    self.pkt_num = int(pcap_info['Number of packets'])
    min_pkt_num = self.table.options.split_threshold

    wt = Table.from_ref(self.table.options.related_tables['wireshark'])

    depjobs = {}

    if self.pkt_num < min_pkt_num:
        # No need to split the pcap file
        criteria.pcapfilename = self.filename
        criteria.entire_pcap = True

        job = Job.create(table=wt, criteria=criteria,
                         update_progress=False, parent=self.job)
        depjobs[job.id] = job

        logger.debug("%s starting single job" % self.__class__.__name__)
        return QueryContinue(self.collect, depjobs)

    self.output_dir = os.path.join(SPLIT_DIR, self.file_handle)
    self.split_pcap()

    split_files = os.listdir(self.output_dir)
    if not split_files:
        raise AnalysisException('No pcap file found after splitting %s'
                                % self.filename)

    for split in split_files:
        # use wireshark table
        ws_criteria = copy.copy(criteria)
        ws_criteria.pcapfilename = os.path.join(self.output_dir, split)

        # for ease of removing the split directory in collect func
        ws_criteria.output_dir = self.output_dir

        job = Job.create(table=wt, criteria=ws_criteria,
                         update_progress=False, parent=self.job)
        depjobs[job.id] = job

    logger.debug("%s starting multiple jobs" % self.__class__.__name__)

    return QueryContinue(self.collect, jobs=depjobs)

def process_options(cls, table_options):
    # Handle a direct id, table reference, or table class
    # in the base option and transform to a simple table reference.
    table_options['base'] = Table.to_ref(table_options['base'])
    return table_options

def run(self):
    args = self._prepare_report_args()
    base_table = Table.from_ref(self.table.options.base)
    base_col = base_table.get_columns()[0]

    # only calculate other when we aren't filtering data
    include_other = self.table.options.include_other
    if self.job.criteria.netprofiler_filterexpr:
        include_other = False

    if self.table.options.groupby not in self.CONFIG:
        raise ValueError('not supported for groupby=%s' %
                         self.table.options.groupby)

    config = self.CONFIG[self.table.options.groupby]

    # num_reports / cur_report are used to compute min/max pct
    num_reports = (1 +
                   (1 if self.table.options.top_n else 0) +
                   (1 if include_other else 0))
    cur_report = 0

    if self.table.options.top_n:
        # Run a top-n report to drive the criteria for each column
        query_column_defs = self.run_top_n(config, args, base_col,
                                           minpct=0,
                                           maxpct=(100 / num_reports))
        cur_report += 1
    else:
        query_column_defs = self.job.criteria.query_columns
        if isinstance(query_column_defs, types.StringTypes):
            query_column_defs = json.loads(query_column_defs)

    query_columns = [col['json'] for col in query_column_defs]

    if not query_columns:
        msg = 'Unable to compute query columns for job %s' % self.job
        logger.error(msg)
        return QueryError(msg)

    with lock:
        report = TrafficTimeSeriesReport(args.profiler)
        columns = [args.columns[0], base_col.name]
        logger.info("Query Columns: %s" % str(query_columns))

        if self.table.options.groupby == 'host_group':
            host_group_type = 'ByLocation'
        else:
            host_group_type = None

        report.run(
            centricity=args.centricity,
            columns=columns,
            timefilter=args.timefilter,
            trafficexpr=args.trafficexpr,
            resolution=args.resolution,
            sync=False,
            host_group_type=host_group_type,
            query_columns_groupby=config.groupby,
            query_columns=query_columns
        )

    data = self._wait_for_data(report,
                               minpct=cur_report * (100 / num_reports),
                               maxpct=(cur_report + 1) * (100 / num_reports))
    cur_report += 1

    df = pandas.DataFrame(data,
                          columns=(['time'] +
                                   [col['name'] for col in query_column_defs]))

    # Create ephemeral columns for all the data based
    # on the related base table
    for col in query_column_defs:
        Column.create(self.job.table, col['name'], col['label'],
                      ephemeral=self.job, datatype=base_col.datatype,
                      formatter=base_col.formatter)

    if include_other:
        # Run a separate timeseries query with no column filters
        # to get "totals" then use that to compute an "other" column
        with lock:
            report = SingleQueryReport(args.profiler)
            report.run(realm='traffic_overall_time_series',
                       centricity=args.centricity,
                       groupby=args.profiler.groupbys['time'],
                       columns=columns,
                       timefilter=args.timefilter,
                       trafficexpr=args.trafficexpr,
                       resolution=args.resolution,
                       sync=False)

        totals = self._wait_for_data(report,
                                     minpct=cur_report * (100 / num_reports),
                                     maxpct=(cur_report + 1) * (100 / num_reports))

        df = df.set_index('time')
        df['subtotal'] = df.sum(axis=1)

        totals_df = (pandas.DataFrame(totals, columns=['time', 'total'])
                     .set_index('time'))

        df = df.merge(totals_df, left_index=True, right_index=True)
        df['other'] = df['total'] - df['subtotal']

        colnames = (['time'] +
                    [col['name'] for col in query_column_defs] +
                    ['other'])

        # Drop the extraneous total and subtotal columns
        df = (df.reset_index().ix[:, colnames])

        Column.create(self.job.table, 'other', 'Other',
                      ephemeral=self.job, datatype=base_col.datatype,
                      formatter=base_col.formatter)

    logger.info("Report %s returned %s rows" % (self.job, len(df)))
    return QueryComplete(df)

def run(self):
    args = self._prepare_report_args()
    base_table = Table.from_ref(self.table.options.base)
    base_col = base_table.get_columns()[0]

    # only calculate other when we aren't filtering data
    include_other = self.table.options.include_other
    if self.job.criteria.netprofiler_filterexpr:
        include_other = False

    if self.table.options.groupby not in self.CONFIG:
        raise ValueError('not supported for groupby=%s' %
                         self.table.options.groupby)

    config = self.CONFIG[self.table.options.groupby]

    # num_reports / cur_report are used to compute min/max pct
    num_reports = (1 +
                   (1 if self.table.options.top_n else 0) +
                   (1 if include_other else 0))
    cur_report = 0

    if self.table.options.top_n:
        # Run a top-n report to drive the criteria for each column
        query_column_defs = self.run_top_n(config, args, base_col,
                                           minpct=0,
                                           maxpct=(100 / num_reports))
        cur_report += 1
    else:
        query_column_defs = self.job.criteria.query_columns
        if isinstance(query_column_defs, types.StringTypes):
            query_column_defs = json.loads(query_column_defs)

    query_columns = [col['json'] for col in query_column_defs]

    with lock:
        report = TrafficTimeSeriesReport(args.profiler)
        columns = [args.columns[0], base_col.name]
        logger.info("Query Columns: %s" % str(query_columns))

        if self.table.options.groupby == 'host_group':
            host_group_type = 'ByLocation'
        else:
            host_group_type = None

        report.run(
            centricity=args.centricity,
            columns=columns,
            timefilter=args.timefilter,
            trafficexpr=args.trafficexpr,
            resolution=args.resolution,
            sync=False,
            host_group_type=host_group_type,
            query_columns_groupby=config.groupby,
            query_columns=query_columns
        )

    data = self._wait_for_data(report,
                               minpct=cur_report * (100 / num_reports),
                               maxpct=(cur_report + 1) * (100 / num_reports))
    cur_report += 1

    df = pandas.DataFrame(data,
                          columns=(['time'] +
                                   [col['name'] for col in query_column_defs]))

    # Create ephemeral columns for all the data based
    # on the related base table
    for col in query_column_defs:
        Column.create(self.job.table, col['name'], col['label'],
                      ephemeral=self.job, datatype=base_col.datatype,
                      formatter=base_col.formatter)

    if include_other:
        # Run a separate timeseries query with no column filters
        # to get "totals" then use that to compute an "other" column
        with lock:
            report = SingleQueryReport(args.profiler)
            report.run(realm='traffic_overall_time_series',
                       groupby=args.profiler.groupbys['time'],
                       columns=columns,
                       timefilter=args.timefilter,
                       trafficexpr=args.trafficexpr,
                       resolution=args.resolution,
                       sync=False)

        totals = self._wait_for_data(report,
                                     minpct=cur_report * (100 / num_reports),
                                     maxpct=(cur_report + 1) * (100 / num_reports))

        df = df.set_index('time')
        df['subtotal'] = df.sum(axis=1)

        totals_df = (pandas.DataFrame(totals, columns=['time', 'total'])
                     .set_index('time'))

        df = df.merge(totals_df, left_index=True, right_index=True)
        df['other'] = df['total'] - df['subtotal']

        colnames = (['time'] +
                    [col['name'] for col in query_column_defs] +
                    ['other'])

        # Drop the extraneous total and subtotal columns
        df = (df.reset_index().ix[:, colnames])

        Column.create(self.job.table, 'other', 'Other',
                      ephemeral=self.job, datatype=base_col.datatype,
                      formatter=base_col.formatter)

    logger.info("Report %s returned %s rows" % (self.job, len(df)))
    return QueryComplete(df)

def get_timestable(biztable):
    return Table.from_ref(biztable.options.tables['times'])