Example #1
            def del_table(tbl):
                related_tables = ((tbl.options or {}).get('related_tables'))
                for ref in (related_tables or {}).values():
                    try:
                        del_table(Table.from_ref(ref))
                    except ObjectDoesNotExist:
                        # already deleted
                        pass

                Column.objects.filter(table=tbl.id).delete()
                Job.objects.filter(table=tbl.id).delete()

                for trigger in TriggerCache.filter(tbl):
                    trigger.delete()

                for handler in ErrorHandlerCache.filter(tbl):
                    handler.delete()

                # delete newly unreferenced routes
                Destination.objects.filter(trigger=None).delete()

                tables = (tbl.options or {}).get('tables')
                for ref in (tables or {}).values():
                    try:
                        del_table(Table.from_ref(ref))
                    except ObjectDoesNotExist:
                        # already deleted
                        pass

                tbl.delete()
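As a point of reference, the `/db/records` handler further down builds the argument to `Table.from_ref` as a plain dict with `sourcefile`, `namespace` and `name` keys; the refs stored under the `related_tables` and `tables` options that `del_table` walks appear to have the same shape. A minimal, hypothetical sketch of resolving one ref and deleting it (all field values are made up):

    # Hypothetical ref values -- the shape (sourcefile / namespace / name)
    # mirrors the dict passed to Table.from_ref in the /db/records handler
    # further down in these examples.
    ref = dict(sourcefile='reports.my_report',  # hypothetical
               namespace='example',             # hypothetical
               name='basetable')                # hypothetical
    tbl = Table.from_ref(ref)
    del_table(tbl)  # recursively removes related tables, columns, jobs, triggers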
    def analyze(self, jobs):
        criteria = self.job.criteria

        sharks_query_table = Table.from_ref(
            self.table.options.related_tables['basetable'])

        depjobs = {}

        # For every (shark, job), we spin off a new job to grab the data, then
        # merge everything into one dataframe at the end.
        for s in Device.objects.filter(module='netshark', enabled=True):
            shark = DeviceManager.get_device(s.id)

            for capjob in shark.get_capture_jobs():
                # Start with criteria from the primary table -- this gives us
                # endtime, duration and netshark_filterexpr.
                bytes_criteria = copy.copy(criteria)
                bytes_criteria.netshark_device = s.id
                bytes_criteria.netshark_source_name = 'jobs/' + capjob.name
                bytes_criteria.resolution = datetime.timedelta(0, 1)
                bytes_criteria.aggregated = True

                job = Job.create(table=sharks_query_table,
                                 criteria=bytes_criteria)

                depjobs[job.id] = job

        return QueryContinue(self.collect, depjobs)
    def analyze(self, jobs=None):

        download_table = Table.from_ref(
            self.table.options.related_tables['download_table'])

        # Create source and destination download jobs
        depjobs = {}

        c = self.job.criteria
        sharks = [
            ('1-source', c.netshark_device_src, c.netshark_source_name_src),
            ('2-dest', c.netshark_device_dst, c.netshark_source_name_dst)
        ]

        for shark in sharks:
            sc = copy.copy(c)
            name, device, source = shark
            sc.netshark_device = device
            sc.netshark_source_name = source
            sc.segment = name

            job = Job.create(table=download_table,
                             criteria=sc,
                             update_progress=True,
                             parent=self.job)
            logger.debug("Created %s: %s download job with criteria %s" %
                         (job, name, sc))
            depjobs[job.id] = job

        return QueryContinue(self.collect, depjobs)
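Both `analyze()` variants above follow the pattern used throughout these examples: fan out dependent `Job`s keyed by job id and hand them to `QueryContinue` together with a callback; the callback later receives that same dict and reads each result with `job.data()` (see the bizhours `collect()` further down). A minimal sketch of such a callback, assuming only the calls visible in these examples:

    import pandas

    def collect(self, jobs=None):
        # 'jobs' is the dict handed to QueryContinue(self.collect, depjobs);
        # each value is a dependent Job whose result is read with job.data().
        dfs = []
        for jid, job in jobs.items():
            subdf = job.data()
            if subdf is not None:
                dfs.append(subdf)
        if not dfs:
            return QueryComplete(None)
        return QueryComplete(pandas.concat(dfs, ignore_index=True))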
Example #5
    def analyze(self, jobs):
        criteria = self.job.criteria

        ar_query_table = Table.from_ref(
            self.table.options.related_tables['basetable']
        )

        depjobs = {}

        # For every (ar, job), we spin off a new job to grab the data, then
        # merge everything into one dataframe at the end.
        for s in Device.objects.filter(module='appresponse', enabled=True):
            ar = DeviceManager.get_device(s.id)

            for job in ar.capture.get_jobs():
                # Start with criteria from the primary table -- this gives us
                # endtime, duration and filterexpr.
                bytes_criteria = copy.copy(criteria)
                bytes_criteria.appresponse_device = s.id
                bytes_criteria.appresponse_source = 'jobs/' + job.id
                bytes_criteria.granularity = datetime.timedelta(0, 1)

                newjob = Job.create(table=ar_query_table,
                                    criteria=bytes_criteria)

                depjobs[newjob.id] = newjob

        return QueryContinue(self.collect, depjobs)
    def analyze(self, jobs=None):

        download_table = Table.from_ref(
            self.table.options.related_tables['download_table']
        )

        # Create source and destination download jobs
        depjobs = {}

        c = self.job.criteria
        sharks = [
            ('1-source', c.netshark_device_src, c.netshark_source_name_src),
            ('2-dest', c.netshark_device_dst, c.netshark_source_name_dst)
        ]

        for shark in sharks:
            sc = copy.copy(c)
            name, device, source = shark
            sc.netshark_device = device
            sc.netshark_source_name = source
            sc.segment = name

            job = Job.create(table=download_table, criteria=sc,
                             update_progress=True, parent=self.job)
            logger.debug("Created %s: %s download job with criteria %s"
                         % (job, name, sc))
            depjobs[job.id] = job

        return QueryContinue(self.collect, depjobs)
    def analyze(self, jobs):
        criteria = self.job.criteria

        tzname = criteria.business_hours_tzname
        tz = pytz.timezone(tzname)

        times = jobs['times'].data()

        if times is None or len(times) == 0:
            return QueryComplete(None)

        basetable = Table.from_ref(
            self.table.options.related_tables['basetable']
        )

        # Create all the jobs
        depjobs = {}

        for i, row in times.iterrows():
            (t0, t1) = (row['starttime'], row['endtime'])
            sub_criteria = copy.copy(criteria)
            sub_criteria.starttime = t0.astimezone(tz)
            sub_criteria.endtime = t1.astimezone(tz)

            job = Job.create(table=basetable, criteria=sub_criteria,
                             update_progress=False, parent=self.job)

            logger.debug("Created %s: %s - %s" % (job, t0, t1))
            depjobs[job.id] = job

        return QueryContinue(self.collect, depjobs)
Example #9
    def analyze(self, jobs):

        df = jobs['overall'].data()

        # First clear all the dynamic columns that were associated with the
        # table the last time the report was run, but do not delete the time
        # column
        for col in self.table.get_columns():
            if col.name == 'time':
                continue
            col.delete()

        # Get the top N values of the value column
        val_col = self.table.options.value_column_name
        pivot_col = self.table.options.pivot_column_name
        n = self.table.options.n

        pivots = list(
            df.sort_values(val_col, ascending=False).head(n)[pivot_col])

        for pivot in pivots:
            # Add pivot column to the table
            AppResponseColumn.create(self.table, pivot, pivot)

        # Create an AppResponseTimeSeries Job
        self.job.criteria.pivot_column_names = ','.join(pivots)
        ts_table_ref = self.table.options.related_tables['ts']
        table = Table.from_ref(ts_table_ref)

        job = Job.create(table=table,
                         criteria=self.job.criteria,
                         update_progress=False,
                         parent=self.job)

        return QueryContinue(self.collect, jobs={'ts': job})
Example #10
    def get(self, request):
        """ Retrieve records from time series data storage.

        The URL is formatted as '/db/records?handle=**&start=**&end=**'.
        Within the URL, required parameters include 'handle' and 'start'.
        The optional parameter is 'end'. Values for 'start' and 'end' in the
        URL should be epoch seconds. The JSON results returned look like:

        [{"avg_bytes": 1617806.0,
          "time": "2017-03-24T18:14:00+00:00"},
          ...
        ]
        """

        request_data = request.GET.dict()

        keys = ['handle', 'start']
        for k in keys:
            if k not in request_data:
                msg = "Missing parameter '{}' in url".format(k)
                raise InvalidRequest(msg)

        handle = request_data['handle']
        try:
            obj = ExistingIntervals.objects.get(table_handle=handle)
        except ObjectDoesNotExist:
            msg = "Handle '{}' does not exist.".format(handle)
            raise NotFoundError(msg)

        tr = {}
        tr['gte'] = sec_string_to_datetime(int(request_data['start']))
        if 'end' in request_data:
            tr['lte'] = sec_string_to_datetime(int(request_data['end']))

        # Getting the time column name
        table = Table.from_ref(
            dict(sourcefile=obj.sourcefile,
                 namespace=obj.namespace,
                 name=obj.table))

        timecols = [
            c for c in table.get_columns(iskey=True)
            if c.datatype == Column.DATATYPE_TIME
        ]

        time_col_name = timecols[0].name

        col_filters = [
            ColumnFilter(query_type='range', query={time_col_name: tr})
        ]

        # allow for override via url param
        index = request_data.get('index', make_index(obj.namespace))

        records = storage.search(index=index,
                                 doc_type=handle,
                                 col_filters=col_filters)

        return Response(records)
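Following the docstring, a request must carry at least 'handle' and 'start' (epoch seconds), while 'end' and 'index' are optional. A hypothetical client call against this endpoint (host, port and handle value are made up):

    import requests  # any HTTP client works; the URL and handle below are made up

    resp = requests.get('http://localhost:8000/db/records',
                        params={'handle': 'abc123',    # hypothetical handle
                                'start': 1490378040,   # epoch seconds (required)
                                'end': 1490378100})    # epoch seconds (optional)
    print(resp.json())  # [{"avg_bytes": ..., "time": "..."}, ...]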
    def collect(self, jobs=None):
        logger.debug("%s: bizhours.collect: %s" % (self, jobs))
        basetable = Table.from_ref(
            self.table.options.related_tables['basetable']
        )

        # collect all key names
        keynames = []
        istime = False
        for key in basetable.get_columns(iskey=True):
            keynames.append(key.name)
            if key.istime():
                istime = True

        # Now collect the data
        total_secs = 0
        dfs = []
        idx = 0
        for jid, job in jobs.iteritems():
            if job.status == Job.ERROR:
                raise AnalysisException("%s for %s-%s failed: %s" %
                                        (job, job.criteria.starttime,
                                         job.criteria.endtime,
                                         job.message))
            subdf = job.data()
            logger.debug("%s: returned %d rows" %
                         (job, len(subdf) if subdf is not None else 0))
            if subdf is None:
                continue

            logger.debug("%s: actual_criteria %s" % (job, job.actual_criteria))
            t0 = job.actual_criteria.starttime
            t1 = job.actual_criteria.endtime
            if not istime:
                subdf['__secs__'] = timedelta_total_seconds(t1 - t0)
            total_secs += timedelta_total_seconds(t1 - t0)
            idx += 1
            dfs.append(subdf)

        if len(dfs) == 0:
            return QueryComplete(None)

        df = pandas.concat(dfs, ignore_index=True)
        if not istime:
            if 'aggregate' in self.table.options:
                ops = self.table.options['aggregate']
                for col in basetable.get_columns(iskey=False):
                    if col.name not in ops:
                        ops[col.name] = 'sum'

            else:
                ops = 'sum'

            df = avg_groupby_aggregate(df, keynames, ops,
                                       '__secs__', total_secs)

        return QueryComplete(df)
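`avg_groupby_aggregate` itself is not shown here, but from the call site the intent appears to be a duration-weighted aggregate: each sub-job contributes in proportion to the seconds it covered. A rough pandas sketch of that idea, not the actual implementation:

    import pandas

    def duration_weighted_sum(df, keynames, value_col,
                              secs_col='__secs__', total_secs=None):
        # Weight each sub-result by the fraction of the total covered seconds,
        # then sum per group -- a rough stand-in for avg_groupby_aggregate.
        total = total_secs if total_secs is not None else df[secs_col].sum()
        out = df.copy()
        out['_weighted'] = out[value_col] * out[secs_col] / float(total)
        return out.groupby(keynames, as_index=False)['_weighted'].sum()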
Example #12
    def analyze(self, jobs):
        # Based on the input pivot column names (e.g. CIFS, RTP, Facebook),
        # take the dataframe keyed by application ID and start time and
        # derive a dataframe keyed by start_time, with each row holding a
        # value for each input pivot column

        df = jobs['base'].data()
        # First clear all the dynamic columns that were associated with the
        # table the last time the report was run, but do not delete the time
        # column
        for col in self.table.get_columns():
            if col.name == 'time':
                continue
            col.delete()

        base_table = Table.from_ref(self.table.options.tables.base)

        time_col_name = None
        for col in base_table.get_columns():
            if col.datatype == Column.DATATYPE_TIME and col.iskey:
                time_col_name = col.name
                break

        if not time_col_name:
            raise AppResponseException("No key 'time' column defined "
                                       "in base table")

        pivot_column = self.table.options.pivot_column_name

        sub_dfs = []
        for pivot in self.job.criteria.pivot_column_names.split(','):
            # Add pivot column to the table
            pivot = pivot.strip()
            AppResponseColumn.create(self.table, pivot, pivot)

            # Add pivot column to the data frame
            sub_df = df[df[pivot_column] == pivot]

            # extract time column and value column
            sub_df = sub_df[[
                time_col_name, self.table.options.value_column_name
            ]]
            # Rename columns to 'time' and the pivot column name
            sub_df.rename(columns={
                time_col_name: u'time',
                self.table.options.value_column_name: pivot
            },
                          inplace=True)

            sub_dfs.append(sub_df)

        df_final = reduce(
            lambda df1, df2: pandas.merge(df1, df2, on=u'time', how='outer'),
            sub_dfs)

        return QueryComplete(df_final)
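The loop above is essentially a long-to-wide pivot on the time key, merging one renamed column per pivot value. A toy illustration of the final merge step with made-up data:

    import pandas
    from functools import reduce  # needed on Python 3; a builtin on Python 2

    cifs = pandas.DataFrame({'time': [1, 2], 'CIFS': [10, 20]})
    rtp = pandas.DataFrame({'time': [2, 3], 'RTP': [5, 7]})

    wide = reduce(lambda a, b: pandas.merge(a, b, on='time', how='outer'),
                  [cifs, rtp])
    # roughly:  time=1 -> CIFS 10, RTP NaN
    #           time=2 -> CIFS 20, RTP 5
    #           time=3 -> CIFS NaN, RTP 7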
Example #13
    def get(self, request):
        """ Retrieve records from time series data storage.

        The URL is formatted as '/db/records?handle=**&start=**&end=**'.
        Within the URL, required parameters include 'handle' and 'start'.
        The optional parameter is 'end'. Values for 'start' and 'end' in the
        URL should be epoch seconds. The JSON results returned look like:

        [{"avg_bytes": 1617806.0,
          "time": "2017-03-24T18:14:00+00:00"},
          ...
        ]
        """

        request_data = request.GET.dict()

        keys = ['handle', 'start']
        for k in keys:
            if k not in request_data:
                msg = "Missing parameter '{}' in url".format(k)
                raise InvalidRequest(msg)

        handle = request_data['handle']
        try:
            obj = ExistingIntervals.objects.get(table_handle=handle)
        except ObjectDoesNotExist:
            msg = "Handle '{}' does not exist.".format(handle)
            raise NotFoundError(msg)

        tr = {}
        tr['gte'] = sec_string_to_datetime(int(request_data['start']))
        if 'end' in request_data:
            tr['lte'] = sec_string_to_datetime(int(request_data['end']))

        # Getting the time column name
        table = Table.from_ref(dict(sourcefile=obj.sourcefile,
                                    namespace=obj.namespace,
                                    name=obj.table))

        timecols = [c for c in table.get_columns(iskey=True)
                    if c.datatype == Column.DATATYPE_TIME]

        time_col_name = timecols[0].name

        col_filters = [ColumnFilter(
                       query_type='range',
                       query={time_col_name: tr})]

        # allow for override via url param
        index = request_data.get('index', make_index(obj.namespace))

        records = storage.search(index=index,
                                 doc_type=handle,
                                 col_filters=col_filters)

        return Response(records)
Example #15
 def post_process_table(self, field_options):
     if field_options['copy_fields']:
         keywords = set()
         for i in ['tables', 'related_tables']:
             refs = self.options[i] or {}
             for ref in refs.values():
                 table = Table.from_ref(ref)
                 for f in table.fields.all():
                     if f.keyword not in keywords:
                         self.fields.add(f)
                         keywords.add(f.keyword)
Example #16
    def __init__(self, *args, **kwargs):
        super(AnalysisQuery, self).__init__(*args, **kwargs)

        self.ds_table = Table.from_ref(self.table.options.related_tables['ds'])

        self.time_col = [
            col.name for col in self.ds_table.get_columns()
            if col.datatype == Column.DATATYPE_TIME
        ][0]

        starttime, endtime, self.resolution = self._round_times()

        self.query_interval = TimeInterval(starttime, endtime)

        self.handle, self.no_time_criteria = self._calc_handle()
Example #17
    def __init__(self, *args, **kwargs):
        super(AnalysisQuery, self).__init__(*args, **kwargs)

        self.ds_table = Table.from_ref(self.table.options.related_tables['ds'])

        self.time_col = [col.name for col in self.ds_table.get_columns()
                         if col.datatype == Column.DATATYPE_TIME][0]

        starttime, endtime, self.resolution = self._round_times()
        self.query_interval = TimeInterval(starttime, endtime)

        self.handle, self.no_time_criteria = self._calc_handle()

        logger.debug('TimeSeriesQuery initialized - job: %s, table: %s, '
                     'interval: %s, handle: %s' %
                     (self.job, self.table, self.query_interval, self.handle))
Example #18
    def analyze(self, jobs):
        logger.debug('%s analyze - received jobs: %s' % (self, jobs))

        basetable = Table.from_ref(
            self.table.options['related_tables']['template']
        )
        data = jobs['source'].data()
        if data is None:
            return QueryError('No data available to analyze')

        # find column whose min/max is largest deviation from mean
        # then take row from that column where min/max occurs
        if self.table.options['max']:
            idx = (data.max() / data.mean()).idxmax()
            frow = data.ix[data[idx].idxmax()]
        else:
            idx = (data.min() / data.mean()).idxmin()
            frow = data.ix[data[idx].idxmin()]

        # get time value from extracted row to calculate new start/end times
        ftime = frow['time']
        duration = parse_timedelta(self.table.options['zoom_duration'])
        resolution = parse_timedelta(self.table.options['zoom_resolution'])
        stime = ftime - (duration / 2)
        etime = ftime + (duration / 2)

        criteria = self.job.criteria

        if 'resolution' in criteria:
            criteria['resolution'] = resolution
        else:
            criteria['granularity'] = resolution

        criteria['duration'] = duration
        criteria['_orig_duration'] = duration
        criteria['starttime'] = stime
        criteria['_orig_starttime'] = stime
        criteria['endtime'] = etime
        criteria['_orig_endtime'] = etime

        logger.debug('Creating FocusedAnalysis job with updated criteria %s'
                     % criteria)

        job = Job.create(basetable, criteria, self.job.update_progress)
        return QueryContinue(self.finish, {'job': job})
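The zoom window is centered on the time of the extracted row, with half of `zoom_duration` on either side. A small worked example with hypothetical values:

    from datetime import datetime, timedelta

    ftime = datetime(2017, 3, 24, 12, 0, 0)   # hypothetical peak time
    duration = timedelta(minutes=10)          # hypothetical zoom_duration
    stime = ftime - duration / 2              # 11:55:00
    etime = ftime + duration / 2              # 12:05:00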
Example #19
    def run(self):
        # Collect all dependent tables
        tables = self.table.options.tables
        if not tables:
            return QueryContinue(self._analyze, {})

        logger.debug("%s: dependent tables: %s" % (self, tables))
        jobs = {}

        for (name, ref) in tables.items():
            table = Table.from_ref(ref)
            job = Job.create(table, self.job.criteria,
                             update_progress=self.job.update_progress,
                             parent=self.job)

            logger.debug("%s: dependent job %s" % (self, job))
            jobs[name] = job

        return QueryContinue(self._analyze, jobs)
    def analyze(self, jobs):

        tag = Tag.objects.get(id=self.job.criteria.tag).name

        cmd_table = Table.from_ref(
            self.table.options.related_tables['base'])

        dep_jobs = {}

        for sh_db in Device.objects.filter_by_tag(tag, module='steelhead',
                                                  enabled=True):
            criteria = copy.copy(self.job.criteria)
            criteria.dev = sh_db
            job = Job.create(table=cmd_table, criteria=criteria,
                             parent=self.job)
            dep_jobs[job.id] = job

        if not dep_jobs:
            return QueryError("No enabled steelhead "
                              "devices found with tag '{}'".format(tag))

        return QueryContinue(self.collect, jobs=dep_jobs)
    def analyze(self, jobs=None):

        criteria = self.job.criteria

        if jobs:
            job = list(jobs.values())[0]
            if job.status == Job.ERROR:
                raise AnalysisException("%s for getting pcap file failed: %s"
                                        % (job, job.message))
            criteria.entire_pcap = True
            self.filename = job.data()['filename'][0]
        else:
            self.filename = criteria.pcapfilename

        pcap = PcapFile(self.filename)

        try:
            pcap_info = pcap.info()
        except ValueError:
            raise AnalysisException("No packets in %s" % self.filename)

        logger.debug("%s: File info %s" % (self.__class__.__name__, pcap_info))

        self.pkt_num = int(pcap_info['Number of packets'])

        min_pkt_num = self.table.options.split_threshold

        wt = Table.from_ref(self.table.options.related_tables['wireshark'])

        depjobs = {}
        if self.pkt_num < min_pkt_num:
            # No need to split the pcap file
            criteria.pcapfilename = self.filename
            criteria.entire_pcap = True
            job = Job.create(table=wt, criteria=criteria,
                             update_progress=False, parent=self.job)

            depjobs[job.id] = job

            logger.debug("%s starting single job" % self.__class__.__name__)
            return QueryContinue(self.collect, depjobs)

        self.output_dir = os.path.join(SPLIT_DIR, self.file_handle)
        self.split_pcap()

        split_files = os.listdir(self.output_dir)

        if not split_files:
            raise AnalysisException('No pcap file found after splitting %s'
                                    % self.filename)

        for split in split_files:
            # use wireshark table
            ws_criteria = copy.copy(criteria)
            ws_criteria.pcapfilename = os.path.join(self.output_dir, split)

            # for ease of removing the split directory in collect func
            ws_criteria.output_dir = self.output_dir

            job = Job.create(table=wt, criteria=ws_criteria,
                             update_progress=False, parent=self.job)

            depjobs[job.id] = job

        logger.debug("%s starting multiple jobs" % self.__class__.__name__)

        return QueryContinue(self.collect, jobs=depjobs)
    def run(self):
        args = self._prepare_report_args()
        base_table = Table.from_ref(self.table.options.base)
        base_col = base_table.get_columns()[0]

        # only calculate other when we aren't filtering data
        include_other = self.table.options.include_other
        if self.job.criteria.netprofiler_filterexpr:
            include_other = False

        if self.table.options.groupby not in self.CONFIG:
            raise ValueError('not supported for groupby=%s' %
                             self.table.options.groupby)

        config = self.CONFIG[self.table.options.groupby]

        # num_reports / cur_report are used to compute min/max pct
        num_reports = (1 +
                       (1 if self.table.options.top_n else 0) +
                       (1 if include_other else 0))
        cur_report = 0

        if self.table.options.top_n:
            # Run a top-n report to drive the criteria for each column
            query_column_defs = self.run_top_n(config, args, base_col,
                                               minpct=0,
                                               maxpct=(100/num_reports))
            cur_report += 1
        else:
            query_column_defs = self.job.criteria.query_columns
            if isinstance(query_column_defs, types.StringTypes):
                query_column_defs = json.loads(query_column_defs)

        query_columns = [col['json'] for col in query_column_defs]

        with lock:
            report = TrafficTimeSeriesReport(args.profiler)
            columns = [args.columns[0], base_col.name]
            logger.info("Query Columns: %s" % str(query_columns))

            if self.table.options.groupby == 'host_group':
                host_group_type = 'ByLocation'
            else:
                host_group_type = None

            report.run(
                centricity=args.centricity,
                columns=columns,
                timefilter=args.timefilter,
                trafficexpr=args.trafficexpr,
                resolution=args.resolution,
                sync=False,
                host_group_type=host_group_type,
                query_columns_groupby=config.groupby,
                query_columns=query_columns
            )

        data = self._wait_for_data(report,
                                   minpct=cur_report * (100/num_reports),
                                   maxpct=(cur_report + 1) * (100/num_reports))
        cur_report += 1

        df = pandas.DataFrame(data,
                              columns=(['time'] + [col['name'] for
                                                   col in query_column_defs]))

        # Create ephemeral columns for all the data based
        # on the related base table
        for col in query_column_defs:
            Column.create(self.job.table, col['name'], col['label'],
                          ephemeral=self.job, datatype=base_col.datatype,
                          formatter=base_col.formatter)

        if include_other:
            # Run a separate timeseries query with no column filters
            # to get "totals" then use that to compute an "other" column

            with lock:
                report = SingleQueryReport(args.profiler)
                report.run(
                    realm='traffic_overall_time_series',
                    groupby=args.profiler.groupbys['time'],
                    columns=columns,
                    timefilter=args.timefilter,
                    trafficexpr=args.trafficexpr,
                    resolution=args.resolution,
                    sync=False
                )

            totals = self._wait_for_data(report,
                                         minpct=cur_report * (100/num_reports),
                                         maxpct=(cur_report + 1) * (100/num_reports))

            df = df.set_index('time')
            df['subtotal'] = df.sum(axis=1)
            totals_df = (pandas.DataFrame(totals, columns=['time', 'total'])
                         .set_index('time'))

            df = df.merge(totals_df, left_index=True, right_index=True)
            df['other'] = df['total'] - df['subtotal']
            colnames = ['time'] + [col['name'] for col in query_column_defs] + ['other']

            # Drop the extraneous total and subtotal columns
            df = (df.reset_index().ix[:, colnames])

            Column.create(self.job.table, 'other', 'Other',
                          ephemeral=self.job, datatype=base_col.datatype,
                          formatter=base_col.formatter)

        logger.info("Report %s returned %s rows" % (self.job, len(df)))
        return QueryComplete(df)
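The 'other' series is whatever the top-N columns do not account for, i.e. the overall total minus the subtotal of the named columns. A toy illustration with made-up numbers:

    import pandas

    df = pandas.DataFrame({'time': [1, 2], 'web': [60, 70], 'dns': [10, 5]})
    df = df.set_index('time')
    df['subtotal'] = df.sum(axis=1)                    # 70, 75
    totals_df = pandas.DataFrame({'time': [1, 2],
                                  'total': [100, 90]}).set_index('time')
    df = df.merge(totals_df, left_index=True, right_index=True)
    df['other'] = df['total'] - df['subtotal']         # 30, 15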
    def analyze(self, jobs=None):

        criteria = self.job.criteria

        if jobs:
            job = list(jobs.values())[0]
            if job.status == Job.ERROR:
                raise AnalysisException("%s for getting pcap file failed: %s" %
                                        (job, job.message))
            criteria.entire_pcap = True
            self.filename = job.data()['filename'][0]
        else:
            self.filename = criteria.pcapfilename

        pcap = PcapFile(self.filename)

        try:
            pcap_info = pcap.info()
        except ValueError:
            raise AnalysisException("No packets in %s" % self.filename)

        logger.debug("%s: File info %s" % (self.__class__.__name__, pcap_info))

        self.pkt_num = int(pcap_info['Number of packets'])

        min_pkt_num = self.table.options.split_threshold

        wt = Table.from_ref(self.table.options.related_tables['wireshark'])

        depjobs = {}
        if self.pkt_num < min_pkt_num:
            # No need to split the pcap file
            criteria.pcapfilename = self.filename
            criteria.entire_pcap = True
            job = Job.create(table=wt,
                             criteria=criteria,
                             update_progress=False,
                             parent=self.job)

            depjobs[job.id] = job

            logger.debug("%s starting single job" % self.__class__.__name__)
            return QueryContinue(self.collect, depjobs)

        self.output_dir = os.path.join(SPLIT_DIR, self.file_handle)
        self.split_pcap()

        split_files = os.listdir(self.output_dir)

        if not split_files:
            raise AnalysisException('No pcap file found after splitting %s' %
                                    self.filename)

        for split in split_files:
            # use wireshark table
            ws_criteria = copy.copy(criteria)
            ws_criteria.pcapfilename = os.path.join(self.output_dir, split)

            # for ease of removing the split directory in collect func
            ws_criteria.output_dir = self.output_dir

            job = Job.create(table=wt,
                             criteria=ws_criteria,
                             update_progress=False,
                             parent=self.job)

            depjobs[job.id] = job

        logger.debug("%s starting multiple jobs" % self.__class__.__name__)

        return QueryContinue(self.collect, jobs=depjobs)
Example #25
    def run(self):
        args = self._prepare_report_args()
        base_table = Table.from_ref(self.table.options.base)
        base_col = base_table.get_columns()[0]

        # only calculate other when we aren't filtering data
        include_other = self.table.options.include_other
        if self.job.criteria.netprofiler_filterexpr:
            include_other = False

        if self.table.options.groupby not in self.CONFIG:
            raise ValueError('not supported for groupby=%s' %
                             self.table.options.groupby)

        config = self.CONFIG[self.table.options.groupby]

        # num_reports / cur_report are used to compute min/max pct
        num_reports = (1 +
                       (1 if self.table.options.top_n else 0) +
                       (1 if include_other else 0))
        cur_report = 0

        if self.table.options.top_n:
            # Run a top-n report to drive the criteria for each column
            query_column_defs = self.run_top_n(config, args, base_col,
                                               minpct=0,
                                               maxpct=(100/num_reports))
            cur_report += 1
        else:
            query_column_defs = self.job.criteria.query_columns
            if isinstance(query_column_defs, types.StringTypes):
                query_column_defs = json.loads(query_column_defs)

        query_columns = [col['json'] for col in query_column_defs]

        if not query_columns:
            msg = 'Unable to compute query columns for job %s' % self.job
            logger.error(msg)
            return QueryError(msg)

        with lock:
            report = TrafficTimeSeriesReport(args.profiler)
            columns = [args.columns[0], base_col.name]
            logger.info("Query Columns: %s" % str(query_columns))

            if self.table.options.groupby == 'host_group':
                host_group_type = 'ByLocation'
            else:
                host_group_type = None

            report.run(
                centricity=args.centricity,
                columns=columns,
                timefilter=args.timefilter,
                trafficexpr=args.trafficexpr,
                resolution=args.resolution,
                sync=False,
                host_group_type=host_group_type,
                query_columns_groupby=config.groupby,
                query_columns=query_columns
            )

        data = self._wait_for_data(report,
                                   minpct=cur_report * (100/num_reports),
                                   maxpct=(cur_report + 1) * (100/num_reports))
        cur_report += 1

        df = pandas.DataFrame(data,
                              columns=(['time'] + [col['name'] for
                                                   col in query_column_defs]))

        # Create ephemeral columns for all the data based
        # on the related base table
        for col in query_column_defs:
            Column.create(self.job.table, col['name'], col['label'],
                          ephemeral=self.job, datatype=base_col.datatype,
                          formatter=base_col.formatter)

        if include_other:
            # Run a separate timeseries query with no column filters
            # to get "totals" then use that to compute an "other" column

            with lock:
                report = SingleQueryReport(args.profiler)
                report.run(
                    realm='traffic_overall_time_series',
                    centricity=args.centricity,
                    groupby=args.profiler.groupbys['time'],
                    columns=columns,
                    timefilter=args.timefilter,
                    trafficexpr=args.trafficexpr,
                    resolution=args.resolution,
                    sync=False
                )

            totals = self._wait_for_data(report,
                                         minpct=cur_report * (100/num_reports),
                                         maxpct=(cur_report + 1) * (100/num_reports))

            df = df.set_index('time')
            df['subtotal'] = df.sum(axis=1)
            totals_df = (pandas.DataFrame(totals, columns=['time', 'total'])
                         .set_index('time'))

            df = df.merge(totals_df, left_index=True, right_index=True)
            df['other'] = df['total'] - df['subtotal']
            colnames = ['time'] + [col['name'] for col in query_column_defs] + ['other']

            # Drop the extraneous total and subtotal columns
            df = (df.reset_index().ix[:, colnames])

            Column.create(self.job.table, 'other', 'Other',
                          ephemeral=self.job, datatype=base_col.datatype,
                          formatter=base_col.formatter)

        logger.info("Report %s returned %s rows" % (self.job, len(df)))
        return QueryComplete(df)
def get_timestable(biztable):
    return Table.from_ref(biztable.options.tables['times'])