def __init__(self,
                 params,
                 agent=Agent,
                 client_ip=ClientIp,
                 referrer=Referrer,
                 request=Request):
        """Configure the event source from track ``params``.

        The agent/client_ip/referrer/request factories are injectable so that
        tests can substitute deterministic implementations.
        """
        self._agent = agent()
        self._clientip = client_ip()
        self._referrer = referrer()
        self._request = request()
        # A single event dict is reused for every generated event. Each field
        # is overwritten per event, which avoids per-event allocations and
        # improves peak indexing throughput.
        self._event = {}

        if "index" in params:
            # Rewrite <yyyy>/<yy>/<mm>/<dd>/<hh> placeholders (case-insensitive,
            # optional inner whitespace) into str.format references against the
            # timestamp struct, e.g. "<yyyy>" -> "{ts[yyyy]}".
            index_spec = params["index"]
            for unit in ("yyyy", "yy", "mm", "dd", "hh"):
                index_spec = re.sub(r"<\s*%s\s*>" % unit,
                                    "{ts[%s]}" % unit,
                                    index_spec,
                                    flags=re.IGNORECASE)
            self._index = index_spec
            self._index_pattern = True
        else:
            self._index = "elasticlogs"
            self._index_pattern = False

        self._type = "doc"
        self._timestamp_generator = TimestampStructGenerator(
            params.get("starting_point", "now"),
            params.get("offset"),
            float(params.get("acceleration_factor", "1.0")),
            # this is only expected to be used in tests
            params.get("__utc_now"))

        self.daily_logging_volume = None
        if "daily_logging_volume" in params and "client_count" in params:
            # per-client share of the configured daily volume, in bytes
            self.daily_logging_volume = convert_to_bytes(params["daily_logging_volume"]) // int(params["client_count"])
        self.current_logging_volume = 0
        # number of simulated days to generate; None means unbounded
        self.total_days = params.get("number_of_days")
        self.remaining_days = self.total_days
        self.record_raw_event_size = params.get("record_raw_event_size", False)
# Example #2
# 0
def test_generate_interval_and_skip():
    """Timestamps advance by acceleration_factor * clock delta; skip() jumps ahead."""
    clock = ReproducibleClock(start=datetime.datetime(2019, 1, 5, 15),
                              delta=datetime.timedelta(seconds=1))

    g = TimestampStructGenerator(starting_point="2018-05-01:00:59:56",
                                 acceleration_factor=3.0,
                                 utcnow=clock)

    def expected(iso):
        # every struct field is a fixed-position slice of the full ISO string
        return {
            "iso": iso,
            "iso_prefix": iso[:19],
            "yyyy": iso[:4],
            "yy": iso[2:4],
            "mm": iso[5:7],
            "dd": iso[8:10],
            "hh": iso[11:13],
        }

    # each call advances the accelerated clock by 3 (= 1s * factor) seconds
    assert g.next_timestamp() == expected("2018-05-01T00:59:59.000Z")
    assert g.next_timestamp() == expected("2018-05-01T01:00:02.000Z")

    g.skip(datetime.timedelta(days=1))

    # after the skip, generation continues exactly one day later
    assert g.next_timestamp() == expected("2018-05-02T00:59:59.000Z")
    assert g.next_timestamp() == expected("2018-05-02T01:00:02.000Z")
    def __init__(self, params):
        """Set up the event source from track ``params``."""
        self._agent = Agent()
        self._clientip = ClientIp()
        self._referrer = Referrer()
        self._request = Request()
        # One event dict is reused across all events: every field is
        # overwritten each time, which reduces object churn and improves
        # peak indexing throughput.
        self._event = {}

        self._index = 'elasticlogs'
        self._index_pattern = False
        if 'index' in params:
            # Rewrite <yyyy>/<yy>/<mm>/<dd>/<hh> date placeholders into
            # str.format references on the timestamp struct.
            index = params['index']
            for unit in ('yyyy', 'yy', 'mm', 'dd', 'hh'):
                index = re.sub(r'<\s*%s\s*>' % unit, '{ts[%s]}' % unit, index, flags=re.IGNORECASE)
            self._index = index
            self._index_pattern = True

        self._type = 'doc'

        sp = params.get('starting_point', "now")

        # An explicit end point selects interval-based generation; otherwise
        # generation runs from the starting point, optionally accelerated.
        if 'end_point' in params:
            self._timestamp_generator = TimestampStructGenerator.Interval(sp, params['end_point'])
        elif 'acceleration_factor' in params:
            self._timestamp_generator = TimestampStructGenerator.StartingPoint(sp, float(params['acceleration_factor']))
        else:
            self._timestamp_generator = TimestampStructGenerator.StartingPoint(sp)

        # fields to strip from each sample, given as dotted paths
        self._delete_fields = [d.split('.') for d in params.get('delete_fields', [])]
# Example #4
# 0
def test_generate_interval_from_now():
    """With starting_point="now", timestamps track the (reproducible) clock."""
    clock = ReproducibleClock(start=datetime.datetime(2019, 1, 5, 15),
                              delta=datetime.timedelta(seconds=5))

    g = TimestampStructGenerator(starting_point="now", utcnow=clock)

    def expected(iso):
        # every struct field is a fixed-position slice of the full ISO string
        return {
            "iso": iso,
            "iso_prefix": iso[:19],
            "yyyy": iso[:4],
            "yy": iso[2:4],
            "mm": iso[5:7],
            "dd": iso[8:10],
            "hh": iso[11:13],
        }

    # first generated timestamp will be one (clock) invocation after the
    # original start; each further call advances by another 5-second tick
    assert g.next_timestamp() == expected("2019-01-05T15:00:05.000Z")
    assert g.next_timestamp() == expected("2019-01-05T15:00:10.000Z")
    assert g.next_timestamp() == expected("2019-01-05T15:00:15.000Z")
def test_generate_interval_from_fixed_starting_point_with_offset():
    """The "+10d" offset shifts the fixed starting point forward by ten days."""
    clock = ReproducibleClock(start=datetime.datetime(2019, 1, 5, 15),
                              delta=datetime.timedelta(seconds=1))

    g = TimestampStructGenerator(starting_point="2018-05-01:00:59:56",
                                 offset="+10d",
                                 acceleration_factor=3.0,
                                 utcnow=clock)

    def expected(iso):
        # NOTE(review): unlike the sibling tests, no "iso_prefix" key is
        # asserted here — confirm against the generator version under test.
        return {
            "iso": iso,
            "yyyy": iso[:4],
            "yy": iso[2:4],
            "mm": iso[5:7],
            "dd": iso[8:10],
            "hh": iso[11:13],
        }

    # 2018-05-01 + 10d = 2018-05-11; each call then advances the accelerated
    # clock by 3 (= 1s * factor) seconds
    assert g.next_timestamp() == expected("2018-05-11T00:59:59.000Z")
    assert g.next_timestamp() == expected("2018-05-11T01:00:02.000Z")
    assert g.next_timestamp() == expected("2018-05-11T01:00:05.000Z")
# Example #6
# 0
def test_simulate_ticks():
    """simulate_tick() only advances the sub-second part of the timestamp."""
    clock = ReproducibleClock(start=datetime.datetime(2019, 1, 5, 15),
                              delta=datetime.timedelta(seconds=1))

    g = TimestampStructGenerator(starting_point="2018-05-01:00:59:56",
                                 acceleration_factor=3.0,
                                 utcnow=clock)

    def expected(iso):
        # every struct field is a fixed-position slice of the full ISO string
        return {
            "iso": iso,
            "iso_prefix": iso[:19],
            "yyyy": iso[:4],
            "yy": iso[2:4],
            "mm": iso[5:7],
            "dd": iso[8:10],
            "hh": iso[11:13],
        }

    assert g.next_timestamp() == expected("2018-05-01T00:59:59.000Z")

    # everything down to the second (and hence iso_prefix) stays frozen;
    # a sub-millisecond tick does not change the rendered millisecond
    assert g.simulate_tick(micros=1.0) == expected("2018-05-01T00:59:59.001Z")
    assert g.simulate_tick(micros=0.1) == expected("2018-05-01T00:59:59.001Z")
    assert g.simulate_tick(micros=10.0) == expected("2018-05-01T00:59:59.011Z")
def test_generate_invalid_time_interval():
    """An unsupported time unit ("w") is rejected with a TimeParsingError."""
    with pytest.raises(TimeParsingError) as ex:
        TimestampStructGenerator(starting_point="now+1w")

    assert str(ex.value) == "Invalid time format: now+1w"
# Example #8
# 0
class RandomEvent:
    """Generates random web-access log events as pre-rendered JSON lines.

    Each call to #generate_event() returns a ``(line, index, type)`` triple.
    When a per-client daily logging volume is configured, the generator
    tracks the generated raw-event bytes and, once the daily budget is
    exhausted, skips the simulated clock ahead one day and decrements the
    remaining-day budget.
    """

    def __init__(self,
                 params,
                 agent=Agent,
                 client_ip=ClientIp,
                 referrer=Referrer,
                 request=Request):
        # agent/client_ip/referrer/request are injectable factories so tests
        # can substitute deterministic implementations
        self._agent = agent()
        self._clientip = client_ip()
        self._referrer = referrer()
        self._request = request()
        # We will reuse the event dictionary. This assumes that each field will be present (and thus overwritten) in each event.
        # This reduces object churn and improves peak indexing throughput.
        self._event = {}

        # Rewrite <yyyy>/<yy>/<mm>/<dd>/<hh> placeholders (case-insensitive,
        # optional inner whitespace) into str.format references against the
        # timestamp struct, e.g. "<yyyy>" -> "{ts[yyyy]}".
        if "index" in params:
            index = re.sub(r"<\s*yyyy\s*>",
                           "{ts[yyyy]}",
                           params["index"],
                           flags=re.IGNORECASE)
            index = re.sub(r"<\s*yy\s*>",
                           "{ts[yy]}",
                           index,
                           flags=re.IGNORECASE)
            index = re.sub(r"<\s*mm\s*>",
                           "{ts[mm]}",
                           index,
                           flags=re.IGNORECASE)
            index = re.sub(r"<\s*dd\s*>",
                           "{ts[dd]}",
                           index,
                           flags=re.IGNORECASE)
            index = re.sub(r"<\s*hh\s*>",
                           "{ts[hh]}",
                           index,
                           flags=re.IGNORECASE)

            self._index = index
            self._index_pattern = True
        else:
            # no index pattern configured; use a fixed default index name
            self._index = "elasticlogs"
            self._index_pattern = False

        self._type = "doc"
        self._timestamp_generator = TimestampStructGenerator(
            params.get("starting_point", "now"),
            params.get("offset"),
            float(params.get("acceleration_factor", "1.0")),
            # this is only expected to be used in tests
            params.get("__utc_now"))
        if "daily_logging_volume" in params and "client_count" in params:
            # in bytes; each client gets an equal share of the daily volume
            self.daily_logging_volume = convert_to_bytes(
                params["daily_logging_volume"]) // int(params["client_count"])
        else:
            self.daily_logging_volume = None
        self.current_logging_volume = 0
        # number of simulated days to generate; None means no day budget
        self.total_days = params.get("number_of_days")
        self.remaining_days = self.total_days
        self.record_raw_event_size = params.get("record_raw_event_size", False)
        self._offset = 0
        # round-robin suffix for generated hostnames
        self._web_host = itertools.cycle([1, 2, 3])
        self._timestruct = None
        self._index_name = None
        self._time_interval_current_bulk = 0

    @property
    def percent_completed(self):
        """Fraction (0..1) of the total configured volume generated so far.

        Returns ``None`` when no daily volume or no day budget is configured,
        since progress is then unbounded.
        """
        if self.daily_logging_volume is None or self.total_days is None:
            return None
        else:
            full_days = self.total_days - self.remaining_days
            already_generated = self.daily_logging_volume * full_days + self.current_logging_volume
            total = self.total_days * self.daily_logging_volume
            return already_generated / total

    def start_bulk(self, bulk_size):
        """Prepares generation of one bulk of ``bulk_size`` events.

        Fixes the bulk's base timestamp and resolves the target index name
        once per bulk rather than once per event.
        """
        # spread simulated sub-second ticks evenly over this bulk's events
        self._time_interval_current_bulk = 1 / bulk_size
        self._timestruct = self._timestamp_generator.next_timestamp()
        self._index_name = self.__generate_index_pattern(self._timestruct)

    def generate_event(self):
        """Generates a single event.

        Returns a tuple of the pre-rendered JSON line, the target index name
        and the document type. Raises ``StopIteration`` once the configured
        number of days is exhausted.
        """
        if self.remaining_days == 0:
            raise StopIteration()

        # advance time by a few micros
        self._timestruct = self._timestamp_generator.simulate_tick(
            self._time_interval_current_bulk)
        # index for the current line - we may cross a date boundary later if we're above the daily logging volume
        index = self._index_name
        event = self._event
        event["@timestamp"] = self._timestruct["iso"]

        # assume a typical event size of 263 bytes but limit the file size to 4GB
        # NOTE(review): self._offset is never advanced anywhere in this class,
        # so this always yields 263 — confirm whether self._offset should be
        # incremented per event.
        event["offset"] = (self._offset + 263) % (4 * 1024 * 1024 * 1024)

        # populate agent / client-ip / referrer / request fields in place
        self._agent.add_fields(event)
        self._clientip.add_fields(event)
        self._referrer.add_fields(event)
        self._request.add_fields(event)

        event["hostname"] = "web-%s-%s.elastic.co" % (
            event["geoip_continent_code"], next(self._web_host))

        if self.record_raw_event_size or self.daily_logging_volume:
            # determine the raw event size (as if this were contained in nginx log file). We do not bother to
            # reformat the timestamp as this is not worth the overhead.
            raw_event = '%s - - [%s] "%s %s HTTP/%s" %s %s "%s" "%s"' % (
                event["clientip"], event["@timestamp"], event["verb"],
                event["request"], event["httpversion"], event["response"],
                event["bytes"], event["referrer"], event["agent"])
            if self.daily_logging_volume:
                self.current_logging_volume += len(raw_event)
                if self.current_logging_volume > self.daily_logging_volume:
                    # daily budget exhausted: consume one day and jump ahead
                    if self.remaining_days is not None:
                        self.remaining_days -= 1
                    self._timestamp_generator.skip(datetime.timedelta(days=1))
                    # advance time now for real (we usually use #simulate_tick() which will keep everything except for
                    # microseconds constant.
                    self._timestruct = self._timestamp_generator.next_timestamp(
                    )
                    self._index_name = self.__generate_index_pattern(
                        self._timestruct)
                    self.current_logging_volume = 0

        if self.record_raw_event_size:
            # we are on the hot code path here and thus we want to avoid conditionally creating strings so we duplicate
            # the event.
            # (raw_event is guaranteed to be bound here because
            # record_raw_event_size also enabled the block above)
            line = '{"@timestamp": "%s", ' \
                   '"_raw_event_size":%d, ' \
                   '"offset":%s, ' \
                   '"source":"/usr/local/var/log/nginx/access.log","fileset":{"module":"nginx","name":"access"},"input":{"type":"log"},' \
                   '"beat":{"version":"6.3.0","hostname":"%s","name":"%s"},' \
                   '"prospector":{"type":"log"},' \
                   '"nginx":{"access":{"user_name": "-",' \
                   '"agent":"%s","user_agent": {"major": "%s","os": "%s","os_major": "%s","name": "%s","os_name": "%s","device": "%s"},' \
                   '"remote_ip": "%s","remote_ip_list":["%s"],' \
                   '"geoip":{"continent_name": "%s","city_name": "%s","country_name": "%s","country_iso_code": "%s","location":{"lat": %s,"lon": %s} },' \
                   '"referrer":"%s",' \
                   '"url": "%s","body_sent":{"bytes": %s},"method":"%s","response_code":%s,"http_version":"%s"} } }' % \
                   (event["@timestamp"],
                    len(raw_event),
                    event["offset"],
                    event["hostname"],event["hostname"],
                    event["agent"], event["useragent_major"], event["useragent_os"], event["useragent_os_major"], event["useragent_name"], event["useragent_os_name"], event["useragent_device"],
                    event["clientip"], event["clientip"],
                    event["geoip_continent_name"], event["geoip_city_name"], event["geoip_country_name"], event["geoip_country_iso_code"], event["geoip_location_lat"], event["geoip_location_lon"],
                    event["referrer"],
                    event["request"], event["bytes"], event["verb"], event["response"], event["httpversion"])
        else:
            line = '{"@timestamp": "%s", ' \
                   '"offset":%s, ' \
                   '"source":"/usr/local/var/log/nginx/access.log","fileset":{"module":"nginx","name":"access"},"input":{"type":"log"},' \
                   '"beat":{"version":"6.3.0","hostname":"%s","name":"%s"},' \
                   '"prospector":{"type":"log"},' \
                   '"nginx":{"access":{"user_name": "-",' \
                   '"agent":"%s","user_agent": {"major": "%s","os": "%s","os_major": "%s","name": "%s","os_name": "%s","device": "%s"},' \
                   '"remote_ip": "%s","remote_ip_list":["%s"],' \
                   '"geoip":{"continent_name": "%s","city_name": "%s","country_name": "%s","country_iso_code": "%s","location":{"lat": %s,"lon": %s} },' \
                   '"referrer":"%s",' \
                   '"url": "%s","body_sent":{"bytes": %s},"method":"%s","response_code":%s,"http_version":"%s"} } }' % \
                   (event["@timestamp"],
                    event["offset"],
                    event["hostname"],event["hostname"],
                    event["agent"], event["useragent_major"], event["useragent_os"], event["useragent_os_major"], event["useragent_name"], event["useragent_os_name"], event["useragent_device"],
                    event["clientip"], event["clientip"],
                    event["geoip_continent_name"], event["geoip_city_name"], event["geoip_country_name"], event["geoip_country_iso_code"], event["geoip_location_lat"], event["geoip_location_lon"],
                    event["referrer"],
                    event["request"], event["bytes"], event["verb"], event["response"], event["httpversion"])

        return line, index, self._type

    def __generate_index_pattern(self, timestruct):
        """Resolves the index name, substituting date fields when the
        configured index contains placeholders."""
        if self._index_pattern:
            return self._index.format(ts=timestruct)
        else:
            return self._index
# Example #9
# 0
    def __init__(self, track, params, **kwargs):
        """Initializes the sample-based bulk parameter source.

        Loads sample records from the configured sample file(s), resolves the
        target index and type (including date placeholders in the index name)
        and sets up the timestamp generator from ``params``.
        """
        self._indices = track.indices
        self._params = params
        self._samples = []
        self._next_index = 0

        # bulk size defaults to 1000 documents unless overridden
        self._bulk_size = 1000
        if 'bulk-size' in params.keys():
            self._bulk_size = params['bulk-size']

        self._default_index = False
        if 'index' not in params.keys():
            # no index given: fall back to the first index of the track
            if len(self._indices) > 1:
                logger.debug(
                    "[bulk] More than one index specified in track configuration. Will use the first one ({})"
                    .format(self._indices[0].name))
            else:
                logger.debug(
                    "[bulk] Using index specified in track configuration ({})".
                    format(self._indices[0].name))

            self._params['index'] = self._indices[0].name
            self._default_index = True

        else:
            logger.debug(
                "[bulk] Index pattern specified in parameters ({}) will be used"
                .format(params['index']))

        if 'type' not in params.keys():
            # fall back to the first type of the first track index
            t = self._indices[0].types[0]
            self._params['type'] = t if isinstance(t, str) else t.name

        # NOTE(review): the if-branch sets self._params['timestamp_field']
        # but never self._timestamp_field, while the else-branch does the
        # opposite — confirm which attribute downstream code actually reads.
        if 'timestamp_field' not in params.keys():
            self._params['timestamp_field'] = []
        else:
            if isinstance(params['timestamp_field'], list):
                self._timestamp_field = params['timestamp_field']
            else:
                self._timestamp_field = [params['timestamp_field']]

        if 'sample_file' not in params.keys():
            raise ConfigurationError(
                'Sample file(s) not supplied through the sample_file configuration parameter.'
            )
        else:
            # normalize to a list of file names
            if isinstance(params['sample_file'], list):
                self._params['sample_file'] = params['sample_file']
            else:
                self._params['sample_file'] = [params['sample_file']]

        records = load_json_file.load_data_files(self._params['sample_file'])

        logger.info("[sample_based_bulk] {} samples loaded.".format(
            len(records)))

        # Split each record into its per-document metadata (_type/_index,
        # removed from the record itself) and the record body.
        for rec in records:
            sample = {}

            if '_type' in rec.keys():
                sample['type'] = rec['_type']
                rec.pop('_type', None)

            if '_index' in rec.keys():
                sample['index'] = rec['_index']
                rec.pop('_index', None)

            sample['record'] = rec
            self._samples.append(sample)

        self._index = 'logs'
        self._index_pattern = False
        if 'index' in params.keys():
            # Here date placeholders use {{yyyy}} syntax (not <yyyy>); rewrite
            # them into str.format references against the timestamp struct.
            index = re.sub(r'{{\s*yyyy\s*}}',
                           '{ts[yyyy]}',
                           params['index'],
                           flags=re.IGNORECASE)
            index = re.sub(r'{{\s*yy\s*}}',
                           '{ts[yy]}',
                           index,
                           flags=re.IGNORECASE)
            index = re.sub(r'{{\s*mm\s*}}',
                           '{ts[mm]}',
                           index,
                           flags=re.IGNORECASE)
            index = re.sub(r'{{\s*dd\s*}}',
                           '{ts[dd]}',
                           index,
                           flags=re.IGNORECASE)
            index = re.sub(r'{{\s*hh\s*}}',
                           '{ts[hh]}',
                           index,
                           flags=re.IGNORECASE)
            self._index = index
            self._index_pattern = True

        self._type = 'logs'
        if 'type' in params.keys():
            self._type = params['type']

        if 'starting_point' in params.keys():
            sp = params['starting_point']
        else:
            sp = "now"

        # an explicit end point selects interval-based generation; otherwise
        # generation runs from the starting point, optionally accelerated
        if 'end_point' in params.keys():
            ep = params['end_point']
            self._timestamp_generator = TimestampStructGenerator.Interval(
                sp, ep)
        else:
            if 'acceleration_factor' in params.keys():
                af = float(params['acceleration_factor'])
                self._timestamp_generator = TimestampStructGenerator.StartingPoint(
                    sp, af)
            else:
                self._timestamp_generator = TimestampStructGenerator.StartingPoint(
                    sp)