Example 1
    def test_get_events_from_data_frame(self):
        """Test getting all events from data frame."""
        lines = [
            {
                '_id': '123',
                '_type': 'manual',
                '_index': 'asdfasdf',
                'tool': 'isskeid'
            },
            {
                '_id': '124',
                '_type': 'manual',
                '_index': 'asdfasdf',
                'tool': 'tong'
            },
            {
                '_id': '125',
                '_type': 'manual',
                '_index': 'asdfasdf',
                'tool': 'klemma'
            },
        ]
        frame = pd.DataFrame(lines)

        events = list(utils.get_events_from_data_frame(frame, None))
        self.assertEqual(len(events), 3)
        ids = [x.event_id for x in events]
        self.assertEqual(set(ids), set(['123', '124', '125']))
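
The assertions expect get_events_from_data_frame to yield one event per frame row, with event_id taken from the '_id' column. A minimal standalone sketch of a generator with that contract (FakeEvent and events_from_frame are hypothetical names used for illustration, not the real implementation):

    import pandas as pd


    class FakeEvent(object):
        """Hypothetical stand-in for the interface event object."""

        def __init__(self, event_id, source):
            self.event_id = event_id
            self.source = source


    def events_from_frame(frame):
        """Yield one FakeEvent per DataFrame row, keyed on the '_id' column."""
        for row in frame.to_dict(orient='records'):
            yield FakeEvent(event_id=row['_id'], source=row)


    frame = pd.DataFrame([
        {'_id': '123', 'tool': 'isskeid'},
        {'_id': '124', 'tool': 'tong'},
    ])
    print([event.event_id for event in events_from_frame(frame)])  # ['123', '124']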
Example 2
    def run(self):
        """Entry point for the analyzer.

        Returns:
            String with summary of the analyzer result
        """
        # TODO: Once we can identify user generated events this should be
        # updated to include all user generated events instead of focusing
        # solely on browser events.
        query = 'source_short:"WEBHIST" OR source:"WEBHIST"'

        return_fields = ['timestamp', 'url', 'tag', '__ts_emojis']

        data_frame = self.event_pandas(
            query_string=query, return_fields=return_fields)

        if not data_frame.shape[0]:
            return 'No browser events discovered.'

        sleeping_emoji = emojis.get_emoji('SLEEPING_FACE')

        data_frame['timestamp'] = pd.to_numeric(data_frame.timestamp)

        # This filters out all events that have a zero timestamp as well as
        # those that occur after 2038-01-01; the cutoff may need to be
        # changed in the future.
        data_frame = data_frame[
            (data_frame.timestamp > 0) & (
                data_frame.timestamp < 2145916800000000)]
        data_frame['datetime'] = pd.to_datetime(
            data_frame.timestamp / 1e6, utc=True, unit='s')
        data_frame['hour'] = pd.to_numeric(
            data_frame.datetime.dt.strftime('%H'))

        total_count = data_frame.shape[0]
        activity_hours, threshold, aggregation = get_active_hours(data_frame)

        if not activity_hours:
            return 'Did not discover any activity hours.'

        hour_count = dict(aggregation.values.tolist())
        data_frame_outside = data_frame[~data_frame.hour.isin(activity_hours)]

        for event in utils.get_events_from_data_frame(
                data_frame_outside, self.datastore):
            event.add_tags(['outside-active-hours'])
            hour = event.source.get('hour')
            this_hour_count = hour_count.get(hour)
            event.add_attributes(
                {'activity_summary': (
                    'Number of events for this hour ({0:d}): {1:d}, with the '
                    'threshold value: {2:0.2f}').format(
                        hour, this_hour_count, threshold),
                 'hour_count': this_hour_count})
            event.add_emojis([sleeping_emoji])
            event.commit()

        return (
            'Tagged {0:d} out of {1:d} events as outside of normal '
            'active hours.').format(data_frame_outside.shape[0], total_count)
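
The timestamp handling in run() can be exercised on its own. A minimal sketch with made-up sample values, showing the conversion to numeric, the zero/2038 cutoff, and the hour-of-day extraction that the per-hour counts are built from:

    import pandas as pd

    frame = pd.DataFrame(
        {'timestamp': ['0', '1500000000000000', '2200000000000000']})

    # Coerce to numbers first so the comparisons below work even if the
    # datastore returned the timestamps as strings.
    frame['timestamp'] = pd.to_numeric(frame.timestamp)

    # Drop zero timestamps and anything at or past 2038-01-01
    # (2145916800000000 microseconds since the epoch).
    frame = frame[(frame.timestamp > 0) & (frame.timestamp < 2145916800000000)]

    # Microseconds since the epoch to a timezone-aware datetime, then the
    # hour of day used for the per-hour activity counts.
    frame['datetime'] = pd.to_datetime(frame.timestamp / 1e6, utc=True, unit='s')
    frame['hour'] = pd.to_numeric(frame.datetime.dt.strftime('%H'))
    print(frame[['timestamp', 'hour']])  # one surviving row, hour == 2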
Example 3
    def test_get_events_from_data_frame(self):
        """Test getting all events from data frame."""
        lines = [
            {"_id": "123", "_type": "manual", "_index": "asdfasdf", "tool": "isskeid"},
            {"_id": "124", "_type": "manual", "_index": "asdfasdf", "tool": "tong"},
            {"_id": "125", "_type": "manual", "_index": "asdfasdf", "tool": "klemma"},
        ]
        frame = pd.DataFrame(lines)

        events = list(utils.get_events_from_data_frame(frame, None))
        self.assertEqual(len(events), 3)
        ids = [x.event_id for x in events]
        self.assertEqual(set(ids), set(["123", "124", "125"]))
Example 4
    def test_get_events_from_data_frame(self):
        """Test getting all events from data frame."""
        lines = [
            {'_id': '123', '_type': 'manual', '_index': 'asdfasdf',
             'tool': 'isskeid'},
            {'_id': '124', '_type': 'manual', '_index': 'asdfasdf',
             'tool': 'tong'},
            {'_id': '125', '_type': 'manual', '_index': 'asdfasdf',
             'tool': 'klemma'},
        ]
        frame = pd.DataFrame(lines)

        events = list(utils.get_events_from_data_frame(frame, None))
        self.assertEqual(len(events), 3)
        ids = [x.event_id for x in events]
        self.assertEqual(set(ids), set(['123', '124', '125']))
Example 5
    def run(self):
        """Entry point for the analyzer.

        Returns:
            String with summary of the analyzer result
        """
        # TODO: Once we can identify user generated events this should be
        # updated to include all user generated events instead of focusing
        # solely on browser events.
        query = 'source_short:"WEBHIST" OR source:"WEBHIST"'

        return_fields = ['datetime', 'timestamp', 'url', 'tag', '__ts_emojis']

        data_frame = self.event_pandas(query_string=query,
                                       return_fields=return_fields)

        if not data_frame.shape[0]:
            return 'No browser events discovered.'

        sleeping_emoji = emojis.get_emoji('SLEEPING_FACE')

        # This filters out all events that have a zero timestamp as well as
        # those that occur after 2038-01-01; the cutoff may need to be
        # changed in the future.
        data_frame['timestamp'] = pd.to_numeric(data_frame.timestamp)
        data_frame = data_frame[(data_frame.timestamp > 0)
                                & (data_frame.timestamp < 2145916800000000)]

        data_frame['datetime'] = pd.to_datetime(data_frame.timestamp / 1e6,
                                                utc=True,
                                                unit='s')
        data_frame['hour'] = pd.to_numeric(
            data_frame.datetime.dt.strftime('%H'))

        total_count = data_frame.shape[0]
        activity_hours, threshold, aggregation = get_active_hours(data_frame)

        if not activity_hours:
            return 'Did not discover any activity hours.'

        hour_count = dict(aggregation.values.tolist())
        data_frame_outside = data_frame[~data_frame.hour.isin(activity_hours)]

        for event in utils.get_events_from_data_frame(data_frame_outside,
                                                      self.datastore):
            event.add_tags(['outside-active-hours'])
            hour = event.source.get('hour')
            this_hour_count = hour_count.get(hour)
            event.add_attributes({
                'activity_summary': (
                    'Number of events for this hour ({0:d}): {1:d}, with the '
                    'threshold value: {2:0.2f}').format(
                        hour, this_hour_count, threshold),
                'hour_count': this_hour_count,
            })
            event.add_emojis([sleeping_emoji])
            event.commit()

        tagged_events, _ = data_frame_outside.shape
        if tagged_events:
            story = self.sketch.add_story('{0:s} - {1:s}'.format(
                utils.BROWSER_STORY_TITLE, self.timeline_name))
            story.add_text(utils.BROWSER_STORY_HEADER, skip_if_exists=True)

            # Compute some statistics about the analyzer results and the
            # span of the active-hour block (which may wrap past midnight).
            percent = (tagged_events / total_count) * 100.0
            last_hour = activity_hours[0]
            end = 0
            for hour in activity_hours[1:]:
                if hour != last_hour + 1:
                    end = hour
                    break
                last_hour = hour

            if not end:
                first = activity_hours[0]
                last = activity_hours[-1]
            else:
                first = end
                index = activity_hours.index(end)
                last = activity_hours[index - 1]

            story.add_text(
                '## Browser Timeframe Analyzer\n\nThe browser timeframe '
                'analyzer discovered {0:d} browser events that occurred '
                'outside of the typical browsing window of this browser '
                'history ({1:s}), or around {2:0.2f}% of the {3:d} total '
                'events.\n\nThe analyzer determines the activity hours by '
                'finding the frequency of browsing events per hour, and then '
                'discovering the longest block of most active hours before '
                'proceeding with flagging all events outside of that time '
                'period. This information can be used by other analyzers '
                'or by manually looking for other activity within the '
                'inactive time period to find unusual actions.\n\n'
                'The hours considered to be active hours are the hours '
                'between {4:02d} and {5:02d} (hours in UTC) and the '
                'threshold used to determine if an hour was considered to be '
                'active was: {6:0.2f}.'.format(tagged_events,
                                               self.timeline_name, percent,
                                               total_count, first, last,
                                               threshold))

            group = self.sketch.add_aggregation_group(
                name='Browser Activity Per Hour',
                description='Created by the browser timeframe analyzer')
            group.set_layered()

            params = {
                'data': aggregation.to_dict(orient='records'),
                'title': 'Browser Activity Per Hour ({0:s})'.format(
                    self.timeline_name),
                'field': 'hour',
                'order_field': 'hour',
            }
            agg_obj = self.sketch.add_aggregation(
                name='Browser Activity Per Hour ({0:s})'.format(
                    self.timeline_name),
                agg_name='manual_feed',
                agg_params=params,
                chart_type='barchart',
                description='Created by the browser timeframe analyzer',
                label='informational')
            group.add_aggregation(agg_obj)

            lines = [{'hour': x, 'count': threshold} for x in range(0, 24)]
            params = {
                'data': lines,
                'title': 'Browser Timeframe Threshold ({0:s})'.format(
                    self.timeline_name),
                'field': 'hour',
                'order_field': 'hour',
                'chart_color': 'red',
            }
            agg_line = self.sketch.add_aggregation(
                name='Browser Activity Per Hour ({0:s})'.format(
                    self.timeline_name),
                agg_name='manual_feed',
                agg_params=params,
                chart_type='linechart',
                description='Created by the browser timeframe analyzer',
                label='informational')
            group.add_aggregation(agg_line)
            story.add_aggregation_group(group)

        return ('Tagged {0:d} out of {1:d} events as outside of normal '
                'active hours.').format(tagged_events, total_count)
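
The first/last computation in run() handles active-hour blocks that wrap past midnight. A small standalone sketch of the same logic (the helper name active_hour_range is made up for illustration):

    def active_hour_range(activity_hours):
        """Return the (first, last) hour of a possibly midnight-wrapping block.

        activity_hours is a sorted list of hours, e.g. [0, 1, 2, 22, 23] for a
        block that runs from 22:00 through 02:59 UTC.
        """
        last_hour = activity_hours[0]
        end = 0
        for hour in activity_hours[1:]:
            if hour != last_hour + 1:
                # A gap means the block wraps past midnight; 'hour' is where
                # the block really starts.
                end = hour
                break
            last_hour = hour

        if not end:
            return activity_hours[0], activity_hours[-1]
        index = activity_hours.index(end)
        return end, activity_hours[index - 1]


    print(active_hour_range([9, 10, 11, 12]))    # (9, 12)
    print(active_hour_range([0, 1, 2, 22, 23]))  # (22, 2)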