Example #1
0
def test_generate_interval_and_skip():
    """Check generated timestamps before and after skipping one day.

    The generator starts at 2018-05-01 00:59:56 with an acceleration factor
    of 3.0 and is driven by a reproducible clock configured with a delta of
    one second. The expected values advance by three seconds per call, and
    after ``skip(days=1)`` the same time-of-day sequence is expected again,
    one calendar day later.
    """

    def expected_struct(iso_prefix):
        # Every field of the timestamp struct is a slice of the
        # "YYYY-MM-DDTHH:MM:SS" prefix, so derive them instead of spelling
        # each dictionary out by hand.
        return {
            "iso": iso_prefix + ".000Z",
            "iso_prefix": iso_prefix,
            "yyyy": iso_prefix[0:4],
            "yy": iso_prefix[2:4],
            "mm": iso_prefix[5:7],
            "dd": iso_prefix[8:10],
            "hh": iso_prefix[11:13],
        }

    wall_clock_start = datetime.datetime(year=2019, month=1, day=5, hour=15)
    tick = datetime.timedelta(seconds=1)
    clock = ReproducibleClock(start=wall_clock_start, delta=tick)

    generator = TimestampStructGenerator(starting_point="2018-05-01:00:59:56",
                                         acceleration_factor=3.0,
                                         utcnow=clock)

    # two consecutive timestamps, crossing the hour boundary
    assert generator.next_timestamp() == expected_struct("2018-05-01T00:59:59")
    assert generator.next_timestamp() == expected_struct("2018-05-01T01:00:02")

    generator.skip(datetime.timedelta(days=1))

    # same sequence again, but on the following day
    assert generator.next_timestamp() == expected_struct("2018-05-02T00:59:59")
    assert generator.next_timestamp() == expected_struct("2018-05-02T01:00:02")
Example #2
0
class RandomEvent:
    """Produces synthetic nginx access-log events rendered as JSON strings.

    Each call to :meth:`generate_event` fills a reused event dictionary via
    the agent, client IP, referrer and request providers and renders it into
    a JSON line together with the target index name and document type.

    Recognised keys in ``params``:

    * ``index``: index name; may contain the placeholders ``<yyyy>``,
      ``<yy>``, ``<mm>``, ``<dd>`` and ``<hh>`` (case-insensitive, inner
      whitespace allowed) which are filled from the current timestamp.
      Defaults to ``"elasticlogs"``.
    * ``starting_point``, ``offset``, ``acceleration_factor``, ``__utc_now``:
      passed on to ``TimestampStructGenerator`` (``__utc_now`` is only
      expected to be used in tests).
    * ``daily_logging_volume`` and ``client_count``: when both are present,
      the per-client daily volume (in bytes) after which the generator
      skips ahead by one day.
    * ``number_of_days``: number of simulated days after which
      :meth:`generate_event` raises ``StopIteration``.
    * ``record_raw_event_size``: when truthy, each document carries a
      ``_raw_event_size`` field.
    """

    def __init__(self,
                 params,
                 agent=Agent,
                 client_ip=ClientIp,
                 referrer=Referrer,
                 request=Request):
        """Set up the field providers, index naming and volume tracking.

        :param params: Dictionary of settings (see the class docstring).
        :param agent: Zero-argument factory for the user agent provider.
        :param client_ip: Zero-argument factory for the client IP provider.
        :param referrer: Zero-argument factory for the referrer provider.
        :param request: Zero-argument factory for the request provider.
        """
        self._agent = agent()
        self._clientip = client_ip()
        self._referrer = referrer()
        self._request = request()
        # We will reuse the event dictionary. This assumes that each field will be present (and thus overwritten) in each event.
        # This reduces object churn and improves peak indexing throughput.
        self._event = {}

        if "index" in params:
            # Translate placeholders such as "<yyyy>" into str.format() lookups
            # on the timestamp struct, e.g. "{ts[yyyy]}".
            index = params["index"]
            for placeholder in ("yyyy", "yy", "mm", "dd", "hh"):
                index = re.sub(r"<\s*%s\s*>" % placeholder,
                               "{ts[%s]}" % placeholder,
                               index,
                               flags=re.IGNORECASE)

            self._index = index
            self._index_pattern = True
        else:
            self._index = "elasticlogs"
            self._index_pattern = False

        self._type = "doc"
        self._timestamp_generator = TimestampStructGenerator(
            params.get("starting_point", "now"),
            params.get("offset"),
            float(params.get("acceleration_factor", "1.0")),
            # this is only expected to be used in tests
            params.get("__utc_now"))
        if "daily_logging_volume" in params and "client_count" in params:
            # in bytes
            self.daily_logging_volume = convert_to_bytes(
                params["daily_logging_volume"]) // int(params["client_count"])
        else:
            self.daily_logging_volume = None
        self.current_logging_volume = 0
        self.total_days = params.get("number_of_days")
        self.remaining_days = self.total_days
        self.record_raw_event_size = params.get("record_raw_event_size", False)
        # simulated position in the log file; advanced by generate_event()
        self._offset = 0
        self._web_host = itertools.cycle([1, 2, 3])
        self._timestruct = None
        self._index_name = None
        self._time_interval_current_bulk = 0

    @property
    def percent_completed(self):
        """Fraction of the total logging volume generated so far.

        :return: ``None`` if either the daily logging volume or the number
                 of days is not configured, otherwise the generated volume
                 divided by the total volume.

        NOTE: the division raises ``ZeroDivisionError`` when the configured
        number of days or the per-client daily volume is zero.
        """
        if self.daily_logging_volume is None or self.total_days is None:
            return None
        else:
            full_days = self.total_days - self.remaining_days
            already_generated = self.daily_logging_volume * full_days + self.current_logging_volume
            total = self.total_days * self.daily_logging_volume
            return already_generated / total

    def start_bulk(self, bulk_size):
        """Prepare the generator for a new bulk of ``bulk_size`` events.

        Sets the tick that is passed to ``simulate_tick`` for every event of
        the bulk to ``1 / bulk_size``, fetches the next timestamp and derives
        the index name from it.

        :param bulk_size: Number of events in the bulk; must be non-zero.
        """
        self._time_interval_current_bulk = 1 / bulk_size
        self._timestruct = self._timestamp_generator.next_timestamp()
        self._index_name = self.__generate_index_pattern(self._timestruct)

    def generate_event(self):
        """Generate the next event.

        :return: A tuple ``(line, index, type)`` where ``line`` is the event
                 rendered as a JSON string, ``index`` is the index name that
                 was current when the event was generated and ``type`` is
                 the document type.
        :raises StopIteration: if the configured number of days has been
                               generated.

        NOTE: field values are interpolated into the JSON line as-is, without
        JSON escaping.
        """
        if self.remaining_days == 0:
            raise StopIteration()

        # advance time by a few micros
        self._timestruct = self._timestamp_generator.simulate_tick(
            self._time_interval_current_bulk)
        # index for the current line - we may cross a date boundary later if we're above the daily logging volume
        index = self._index_name
        event = self._event
        event["@timestamp"] = self._timestruct["iso"]

        # assume a typical event size of 263 bytes but limit the file size to 4GB.
        # The running offset has to be stored back, otherwise every event would
        # report the same offset (263) and the wrap-around would never apply.
        self._offset = (self._offset + 263) % (4 * 1024 * 1024 * 1024)
        event["offset"] = self._offset

        self._agent.add_fields(event)
        self._clientip.add_fields(event)
        self._referrer.add_fields(event)
        self._request.add_fields(event)

        event["hostname"] = "web-%s-%s.elastic.co" % (
            event["geoip_continent_code"], next(self._web_host))

        if self.record_raw_event_size or self.daily_logging_volume:
            # determine the raw event size (as if this were contained in nginx log file). We do not bother to
            # reformat the timestamp as this is not worth the overhead.
            raw_event = '%s - - [%s] "%s %s HTTP/%s" %s %s "%s" "%s"' % (
                event["clientip"], event["@timestamp"], event["verb"],
                event["request"], event["httpversion"], event["response"],
                event["bytes"], event["referrer"], event["agent"])
            if self.daily_logging_volume:
                self.current_logging_volume += len(raw_event)
                if self.current_logging_volume > self.daily_logging_volume:
                    # daily volume exhausted: move on to the next day
                    if self.remaining_days is not None:
                        self.remaining_days -= 1
                    self._timestamp_generator.skip(datetime.timedelta(days=1))
                    # advance time now for real (we usually use #simulate_tick() which will keep everything except for
                    # microseconds constant).
                    self._timestruct = self._timestamp_generator.next_timestamp()
                    self._index_name = self.__generate_index_pattern(
                        self._timestruct)
                    self.current_logging_volume = 0

        if self.record_raw_event_size:
            # we are on the hot code path here and thus we want to avoid conditionally creating strings so we duplicate
            # the event.
            line = '{"@timestamp": "%s", ' \
                   '"_raw_event_size":%d, ' \
                   '"offset":%s, ' \
                   '"source":"/usr/local/var/log/nginx/access.log","fileset":{"module":"nginx","name":"access"},"input":{"type":"log"},' \
                   '"beat":{"version":"6.3.0","hostname":"%s","name":"%s"},' \
                   '"prospector":{"type":"log"},' \
                   '"nginx":{"access":{"user_name": "-",' \
                   '"agent":"%s","user_agent": {"major": "%s","os": "%s","os_major": "%s","name": "%s","os_name": "%s","device": "%s"},' \
                   '"remote_ip": "%s","remote_ip_list":["%s"],' \
                   '"geoip":{"continent_name": "%s","city_name": "%s","country_name": "%s","country_iso_code": "%s","location":{"lat": %s,"lon": %s} },' \
                   '"referrer":"%s",' \
                   '"url": "%s","body_sent":{"bytes": %s},"method":"%s","response_code":%s,"http_version":"%s"} } }' % \
                   (event["@timestamp"],
                    len(raw_event),
                    event["offset"],
                    event["hostname"],event["hostname"],
                    event["agent"], event["useragent_major"], event["useragent_os"], event["useragent_os_major"], event["useragent_name"], event["useragent_os_name"], event["useragent_device"],
                    event["clientip"], event["clientip"],
                    event["geoip_continent_name"], event["geoip_city_name"], event["geoip_country_name"], event["geoip_country_iso_code"], event["geoip_location_lat"], event["geoip_location_lon"],
                    event["referrer"],
                    event["request"], event["bytes"], event["verb"], event["response"], event["httpversion"])
        else:
            line = '{"@timestamp": "%s", ' \
                   '"offset":%s, ' \
                   '"source":"/usr/local/var/log/nginx/access.log","fileset":{"module":"nginx","name":"access"},"input":{"type":"log"},' \
                   '"beat":{"version":"6.3.0","hostname":"%s","name":"%s"},' \
                   '"prospector":{"type":"log"},' \
                   '"nginx":{"access":{"user_name": "-",' \
                   '"agent":"%s","user_agent": {"major": "%s","os": "%s","os_major": "%s","name": "%s","os_name": "%s","device": "%s"},' \
                   '"remote_ip": "%s","remote_ip_list":["%s"],' \
                   '"geoip":{"continent_name": "%s","city_name": "%s","country_name": "%s","country_iso_code": "%s","location":{"lat": %s,"lon": %s} },' \
                   '"referrer":"%s",' \
                   '"url": "%s","body_sent":{"bytes": %s},"method":"%s","response_code":%s,"http_version":"%s"} } }' % \
                   (event["@timestamp"],
                    event["offset"],
                    event["hostname"],event["hostname"],
                    event["agent"], event["useragent_major"], event["useragent_os"], event["useragent_os_major"], event["useragent_name"], event["useragent_os_name"], event["useragent_device"],
                    event["clientip"], event["clientip"],
                    event["geoip_continent_name"], event["geoip_city_name"], event["geoip_country_name"], event["geoip_country_iso_code"], event["geoip_location_lat"], event["geoip_location_lon"],
                    event["referrer"],
                    event["request"], event["bytes"], event["verb"], event["response"], event["httpversion"])

        return line, index, self._type

    def __generate_index_pattern(self, timestruct):
        """Return the index name for ``timestruct``.

        If the configured index contained date placeholders they are filled
        from ``timestruct``; otherwise the static index name is returned.
        """
        if self._index_pattern:
            return self._index.format(ts=timestruct)
        else:
            return self._index