def test_random_events_with_daily_logging_volume_and_maximum_days():
    e = RandomEvent(params={
            "index": "logs-<yyyy><mm><dd>",
            "starting_point": "2019-01-05 15:00:00",
            # 1kB of data per client before we will rollover to the next day
            "daily_logging_volume": "8192",
            "number_of_days": 2,
            "client_count": 8,
            # we need a constant point in time to ensure a stable event size
            "__utc_now": lambda: datetime(year=2019, month=6, day=17)
        },
        agent=StaticAgent, client_ip=StaticClientIp, referrer=StaticReferrer, request=StaticRequest)

    assert e.percent_completed == 0.0

    # 5 events fit into one kilobyte
    for i in range(5):
        doc, index, _ = e.generate_event()
        assert index == "logs-20190105"

    assert e.percent_completed == 0.5

    for i in range(5):
        doc, index, _ = e.generate_event()
        assert index == "logs-20190106"

    # no more events allowed on the next day
    with pytest.raises(StopIteration):
        doc, index, _ = e.generate_event()

    assert e.percent_completed == 1.0
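# Illustrative sketch (an assumption for clarity, not the track's actual implementation): with both a
# daily logging volume and a maximum number of days configured, a progress figure like the one asserted
# above can be derived as the fraction of the total per-client byte budget consumed so far.
def _percent_completed_sketch(bytes_emitted, per_client_daily_volume, number_of_days):
    total_budget = per_client_daily_volume * number_of_days
    return min(bytes_emitted / total_budget, 1.0)

# e.g. _percent_completed_sketch(1024, 1024, 2) == 0.5, matching the assertion after the first day above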
def test_random_events_with_daily_logging_volume():
    e = RandomEvent(params={
            "index": "logs-<yyyy><mm><dd>",
            "starting_point": "2019-01-05 15:00:00",
            # 1kB of data per client before we will rollover to the next day
            "daily_logging_volume": "8kB",
            "client_count": 8,
            # we need a constant point in time to ensure a stable event size
            "__utc_now": lambda: datetime(year=2019, month=6, day=17)
        },
        agent=StaticAgent, client_ip=StaticClientIp, referrer=StaticReferrer, request=StaticRequest)

    assert e.percent_completed is None

    # 5 events fit into one kilobyte
    for i in range(5):
        doc, index, _ = e.generate_event()
        assert index == "logs-20190105"

    for i in range(5):
        doc, index, _ = e.generate_event()
        assert index == "logs-20190106"

    for i in range(5):
        doc, index, _ = e.generate_event()
        assert index == "logs-20190107"

    assert e.percent_completed is None
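# Sketch of the budget arithmetic behind the two tests above (the per-event size is an assumption used
# only for illustration): an "8kB" daily volume shared by 8 clients leaves a 1kB budget per client and
# day, and five ~200-byte events fit into that budget before the generator rolls over to the next day.
def _daily_budget_sketch():
    total_daily_volume = 8 * 1024        # "8kB" expressed in bytes
    client_count = 8
    per_client_budget = total_daily_volume // client_count
    assert per_client_budget == 1024     # 1kB per client, as the comments in the tests state

    assumed_event_size = 200             # hypothetical size of one generated event in bytes
    events_per_day = per_client_budget // assumed_event_size
    assert events_per_day == 5           # matches the "5 events fit into one kilobyte" loops above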
def test_random_event_no_event_size_by_default():
    e = RandomEvent(params={
            "index": "logs",
            "starting_point": "2019-01-05 15:00:00",
        },
        agent=StaticAgent, client_ip=StaticClientIp, referrer=StaticReferrer, request=StaticRequest)

    raw_doc, index, doc_type = e.generate_event()
    doc = json.loads(raw_doc)

    assert "_raw_event_size" not in doc
    assert index == "logs"
    assert doc_type == "doc"
def test_random_event_with_event_size():
    e = RandomEvent(params={
            "index": "logs",
            "starting_point": "2019-01-05 15:00:00",
            "record_raw_event_size": True,
            # we need a constant point in time to ensure a stable event size
            "__utc_now": lambda: datetime(year=2019, month=6, day=17)
        },
        agent=StaticAgent, client_ip=StaticClientIp, referrer=StaticReferrer, request=StaticRequest)

    raw_doc, index, doc_type = e.generate_event()
    doc = json.loads(raw_doc)

    assert doc["_raw_event_size"] == 236
    assert index == "logs"
    assert doc_type == "doc"
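# Minimal sketch of the idea behind "record_raw_event_size" (an assumption for illustration, not the
# track's actual implementation): measure the serialized raw message and attach its size to the document
# before indexing, so that the test above can assert a stable value when all field generators are static.
def _raw_event_size_sketch(message_fields):
    import json
    raw = json.dumps(message_fields)
    doc = dict(message_fields)
    doc["_raw_event_size"] = len(raw)    # hypothetical: size of the serialized raw event
    return json.dumps(doc)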
class ElasticlogsBulkSource:
    """
    Generates a bulk indexing request for elasticlogs data.

    It expects the parameter hash to contain the following keys:
        "bulk-size"           - Integer indicating the number of events generated per bulk request.
        "index"               - Name of the index, index prefix or alias that documents should be indexed into. The index
                                name can be made to generate time-based indices by including date formatting in the name.
                                'test-<yyyy>-<mm>-<dd>-<hh>' will generate an hourly index. (mandatory)
        "starting_point"      - String specifying the starting point for event time generation. It supports absolute or
                                relative values as follows:
                                    'now'                 - Always evaluated to the current timestamp at time of generation.
                                    'now-1h'              - Offset to the current timestamp. Consists of a number and
                                                            either m (minutes), h (hours) or d (days).
                                    '2017-02-20 20:12:32' - Exact timestamp.
                                    '2017-02-20'          - Date. Time will be assumed to be 00:00:00.
                                If a relative starting point (based on now) is provided, this will be used for generation.
                                In case an exact timestamp is provided as starting point, the difference to now will be
                                calculated when the generation starts and this will be used as an offset for all events.
                                Defaults to 'now'.
        "acceleration_factor" - This factor allows the time progression in the timestamp calculation to be altered.
                                A value larger than 1 will accelerate generation and a value lower than 1 will slow it
                                down. If a task is set up to run indexing for one hour with a fixed starting point of
                                '2017-02-20 20:12:32' and an acceleration factor of 2.0, events will be generated in
                                timestamp sequence covering a 2-hour window, '2017-02-20 20:12:32' to
                                '2017-02-20 22:12:32' (approximately).
        "id_type"             - Type of document id to use for generated documents. Defaults to `auto`.
                                    auto - Do not explicitly set an id and let Elasticsearch assign one automatically.
                                    seq  - Assign sequentially incrementing integer ids to each document.
        "id_seq_probability"  - If set, the probability that an existing id will be used to simulate an update.
                                Applied only when `id_type` is seq. Must be in range [0.0, 1.0].
                                Defaults to 0.0, which means no updates.
        "id_seq_low_id_bias"  - If set, favor low ids with a very high bias. Must be True/False. Defaults to False.
    """
    def __init__(self, track, params, **kwargs):
        self.infinite = False
        self.orig_args = [track, params, kwargs]
        self._indices = track.indices
        self._params = params
        # we could also do `kwargs.get("random_event", RandomEvent(params))` but that would call the constructor
        # eagerly which we want to avoid because this can cause significant overhead.
        if "random_event" in kwargs:
            self._randomevent = kwargs["random_event"]
        else:
            self._randomevent = RandomEvent(params)

        self._bulk_size = params["bulk-size"]
        self.seq_id = 0
        self._id_type = params.get("id_type", "auto")
        if self._id_type not in ["auto", "seq"]:
            raise AssertionError("The value [{}] is invalid for the parameter [id_type]".format(self._id_type))

        if self._id_type == "seq":
            self._id_seq_probability = float(params.get("id_seq_probability", 0.0))
            self._low_id_bias = str(params.get("id_seq_low_id_bias", False)).lower() == "true"
            if self._low_id_bias:
                logger.info("Will use low id bias for updates")
            else:
                logger.info("Will use uniform distribution for updates")

        self._default_index = False
        if "index" not in params.keys():
            index_name = self._indices[0].name
            if len(self._indices) > 1:
                logger.debug("[bulk] More than one index specified in track configuration. "
                             "Will use the first one ({})".format(index_name))
            else:
                logger.debug("[bulk] Using index specified in track configuration ({})".format(index_name))

            self._params["index"] = index_name
            self._default_index = True
        else:
            logger.debug("[bulk] Index pattern specified in parameters ({}) will be used".format(params["index"]))

    def partition(self, partition_index, total_partitions):
        if self._params.get("id_type") != "seq":
            seed = partition_index * self._params["seed"] if "seed" in self._params else None
            random.seed(seed)
        new_params = copy.deepcopy(self.orig_args[1])
        new_params["client_id"] = partition_index
        new_params["client_count"] = total_partitions
        return ElasticlogsBulkSource(self.orig_args[0], new_params, **self.orig_args[2])

    @property
    def percent_completed(self):
        # progress is determined either by:
        #
        # * the `time-period` or `iteration` property specified on the corresponding task
        # * `#params()` raising `StopIteration` when `RandomEvent` is exhausted
        return self._randomevent.percent_completed

    def params(self):
        # Build the bulk array
        bulk_array = []
        self._randomevent.start_bulk(self._bulk_size)
        for x in range(0, self._bulk_size):
            try:
                evt, idx, typ = self._randomevent.generate_event()
            except StopIteration:
                if len(bulk_array) > 0:
                    # return any remaining items if there are any (otherwise we'd lose the last bulk request)
                    break
                else:
                    # otherwise stop immediately
                    raise
            if self._id_type == "auto":
                bulk_array.append('{"index": {"_index": "%s"}}' % idx)
            else:
                docid = "%s-%d" % (self.__get_seq_id(), self._params["client_id"])
                bulk_array.append('{"index": {"_index": "%s", "_id": "%s"}}' % (idx, docid))

            bulk_array.append(evt)

        response = {
            "body": "\n".join(bulk_array),
            "action-metadata-present": True,
            # the bulk array contains the action-and-metadata line and the actual document
            "bulk-size": len(bulk_array) // 2,
            "unit": "docs"
        }

        if "pipeline" in self._params.keys():
            response["pipeline"] = self._params["pipeline"]

        return response

    def __get_seq_id(self):
        _id = self.seq_id
        if random.uniform(0, 1) < self._id_seq_probability:
            # conflict
            if self._low_id_bias:
                # update; heavily bias towards older ids
                _p = 10
                _min = 0
                _max = _id
                # _p >> 0: results closer to min, _p ~> 0: results closer to max
                _id = _min + (_max - _min) * pow(random.random(), _p)
            else:
                # update; pick id from a pure uniform distribution
                _id = random.randint(0, _id - 1 if _id > 0 else 0)
        else:
            # new document
            self.__incr_seq_id()
        return "%012d" % _id

    def __incr_seq_id(self):
        self.seq_id += 1
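# Hedged usage sketch: how this parameter source might be driven in isolation. The stub track and stub
# event generator below are stand-ins invented for illustration; in a real benchmark Rally supplies the
# track and RandomEvent generates the documents (the `random_event` kwarg is used here only to avoid
# constructing RandomEvent).
def _example_bulk_source_usage():
    import types

    class _StubRandomEvent:
        percent_completed = None

        def start_bulk(self, bulk_size):
            pass

        def generate_event(self):
            return '{"message": "example"}', "elasticlogs-2017.02.20", "doc"

    stub_track = types.SimpleNamespace(indices=[])
    source = ElasticlogsBulkSource(stub_track,
                                   {"bulk-size": 2, "index": "elasticlogs-<yyyy>.<mm>.<dd>"},
                                   random_event=_StubRandomEvent())
    client_source = source.partition(partition_index=0, total_partitions=1)
    request = client_source.params()
    # every generated event contributes one action-and-metadata line and one document line
    assert request["bulk-size"] == 2
    return request["body"]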
class ElasticlogsBulkSource:
    """
    Generates a bulk indexing request for elasticlogs data.

    It expects the parameter hash to contain the following keys:
        "bulk-size"            - Integer indicating the number of events generated per bulk request. Defaults to 1000.
        "index"                - Name of the index, index prefix or alias that documents should be indexed into. The index
                                 name can be made to generate time-based indices by including date formatting in the name.
                                 'test-<yyyy>-<mm>-<dd>-<hh>' will generate an hourly index. (mandatory)
        "starting_point"       - String specifying the starting point for event time generation. It supports absolute or
                                 relative values as follows:
                                     'now'                 - Always evaluated to the current timestamp at time of generation.
                                     'now-1h'              - Offset to the current timestamp. Consists of a number and
                                                             either m (minutes), h (hours) or d (days).
                                     '2017-02-20 20:12:32' - Exact timestamp.
                                     '2017-02-20'          - Date. Time will be assumed to be 00:00:00.
                                 If a relative starting point (based on now) is provided, this will be used for generation.
                                 In case an exact timestamp is provided as starting point, the difference to now will be
                                 calculated when the generation starts and this will be used as an offset for all events.
                                 If an interval is provided by also specifying an end_point, the range will be calculated
                                 for each bulk request and each event will be assigned a random timestamp within this range.
                                 Defaults to 'now'.
        "end_point"            - String specifying the end point for event time generation. It supports absolute or
                                 relative values as follows:
                                     'now'                 - Always evaluated to the current timestamp at time of generation.
                                     'now-1h'              - Offset to the current timestamp. Consists of a number and
                                                             either m (minutes), h (hours) or d (days).
                                     '2017-02-20 20:12:32' - Exact timestamp.
                                     '2017-02-20'          - Date. Time will be assumed to be 00:00:00.
                                 When specified, the event timestamp will be generated randomly within the interval defined
                                 by the starting_point and end_point parameters. If end_point < starting_point, they will
                                 be swapped.
        "acceleration_factor"  - This factor only applies when an exact timestamp or date has been provided as starting
                                 point and no end_point has been defined. It allows the time progression in the timestamp
                                 calculation to be altered. A value larger than 1 will accelerate generation and a value
                                 lower than 1 will slow it down. If a task is set up to run indexing for one hour with a
                                 fixed starting point of '2017-02-20 20:12:32' and an acceleration factor of 2.0, events
                                 will be generated in timestamp sequence covering a 2-hour window,
                                 '2017-02-20 20:12:32' to '2017-02-20 22:12:32' (approximately).
        "id_type"              - Type of document id to use for generated documents. Defaults to `auto`.
                                     auto       - Do not explicitly set an id and let Elasticsearch assign one automatically.
                                     seq        - Assign sequentially incrementing integer ids to each document.
                                     uuid       - Assign a UUID4 id to each document.
                                     epoch_uuid - Assign a UUID4 identifier prefixed with the hex representation of the
                                                  current timestamp.
                                     epoch_md5  - Assign a base64-encoded MD5 hash of a UUID prefixed with the hex
                                                  representation of the current timestamp.
                                                  (Note: Generating this type of id can be CPU intensive)
                                     md5        - MD5 hash of a UUID in hex representation.
                                                  (Note: Generating this type of id can be CPU intensive)
                                     sha1       - SHA1 hash of a UUID in hex representation.
                                                  (Note: Generating this type of id can be CPU intensive)
                                     sha256     - SHA256 hash of a UUID in hex representation.
                                                  (Note: Generating this type of id can be CPU intensive)
                                     sha384     - SHA384 hash of a UUID in hex representation.
                                                  (Note: Generating this type of id can be CPU intensive)
                                     sha512     - SHA512 hash of a UUID in hex representation.
                                                  (Note: Generating this type of id can be CPU intensive)
        "id_seq_probability"   - If set, the probability that an existing id will be used to simulate an update.
                                 Applied only when `id_type` is seq. Must be in range [0.0, 1.0].
                                 Defaults to 0.0, which means no updates.
        "id_seq_low_id_bias"   - If set, favor low ids with a very high bias. Must be True/False. Defaults to False.
        "id_delay_probability" - If id_type is set to `epoch_uuid` or `epoch_md5`, this parameter determines the
                                 probability that the timestamp prefix will be set in the past. This can be used to
                                 simulate a portion of the events arriving delayed. Must be in range [0.0, 1.0].
                                 Defaults to 0.0.
        "id_delay_secs"        - If an event is delayed, this number of seconds will be deducted from the current timestamp.
    """
    def __init__(self, track, params, **kwargs):
        self._indices = track.indices
        self._params = params
        self._randomevent = RandomEvent(params)

        self._bulk_size = 1000
        if 'bulk-size' in params.keys():
            self._bulk_size = params['bulk-size']

        self._id_type = "auto"
        self.seq_id = 0
        if 'id_type' in params.keys():
            if params['id_type'] in ['auto', 'seq', 'uuid', 'epoch_uuid', 'epoch_md5',
                                     'md5', 'sha1', 'sha256', 'sha384', 'sha512']:
                self._id_type = params['id_type']
            else:
                logger.warning("[bulk] Invalid id_type ({}) specified. Will use default.".format(params['id_type']))

        if self._id_type in ["epoch_uuid", "epoch_md5"]:
            if 'id_delay_probability' in params.keys():
                self._id_delay_probability = float(params['id_delay_probability'])
            else:
                self._id_delay_probability = 0.0

            if 'id_delay_secs' in params.keys():
                self._id_delay_secs = int(params['id_delay_secs'])
            else:
                self._id_delay_secs = 0

        if self._id_type == "seq":
            self.orig_args = [track, params, kwargs]
            self._id_seq_probability = float(params['id_seq_probability']) if 'id_seq_probability' in params else 0.0
            self._low_id_bias = str(params.get('id_seq_low_id_bias', False)).lower() == "true"
            if self._low_id_bias:
                logger.info("Will use low id bias for updates")
            else:
                logger.info("Will use uniform distribution for updates")

        self._default_index = False
        if 'index' not in params.keys():
            if len(self._indices) > 1:
                logger.debug("[bulk] More than one index specified in track configuration. "
                             "Will use the first one ({})".format(self._indices[0].name))
            else:
                logger.debug("[bulk] Using index specified in track configuration ({})".format(self._indices[0].name))

            self._params['index'] = self._indices[0].name
            self._default_index = True
        else:
            logger.debug("[bulk] Index pattern specified in parameters ({}) will be used".format(params['index']))

    def partition(self, partition_index, total_partitions):
        if self._params.get("id_type") == "seq":
            new_params = copy.deepcopy(self.orig_args[1])
            new_params["client_id"] = partition_index
            return ElasticlogsBulkSource(self.orig_args[0], new_params, **self.orig_args[2])
        else:
            seed = partition_index * self._params["seed"] if "seed" in self._params else None
            random.seed(seed)
            return self

    def size(self):
        return 1

    def params(self):
        # Build the bulk array
        bulk_array = []
        for x in range(0, self._bulk_size):
            evt, idx, typ = self._randomevent.generate_event()
            if self._id_type == 'auto':
                bulk_array.append('{"index": {"_index": "%s"}}' % idx)
            else:
                if self._id_type == 'uuid':
                    docid = self.__get_uuid()
                elif self._id_type == "seq":
                    docid = "%s-%d" % (self.__get_seq_id(), self._params["client_id"])
                elif self._id_type == 'sha1':
                    docid = hashlib.sha1(self.__get_uuid().encode('utf8')).hexdigest()
                elif self._id_type == 'sha256':
                    docid = hashlib.sha256(self.__get_uuid().encode('utf8')).hexdigest()
                elif self._id_type == 'sha384':
                    docid = hashlib.sha384(self.__get_uuid().encode('utf8')).hexdigest()
                elif self._id_type == 'sha512':
                    docid = hashlib.sha512(self.__get_uuid().encode('utf8')).hexdigest()
                elif self._id_type == 'md5':
                    docid = hashlib.md5(self.__get_uuid().encode('utf8')).hexdigest()
                elif self._id_type == 'epoch_md5':
                    docid = self.__get_epoch_md5()
                else:
                    docid = self.__get_epoch_uuid()

                bulk_array.append('{"index": {"_index": "%s", "_id": "%s"}}' % (idx, docid))

            bulk_array.append(evt)

        response = {
            "body": "\n".join(bulk_array),
            "action-metadata-present": True,
            "bulk-size": self._bulk_size
        }

        if "pipeline" in self._params.keys():
            response["pipeline"] = self._params["pipeline"]

        return response

    def __get_uuid(self):
        return str(uuid.uuid4()).replace('-', '')

    def __get_epoch_uuid(self):
        u = self.__get_uuid()
        ts = int(time.time())
        # apply the configured delay with probability id_delay_probability
        if self._id_delay_probability > 0 and random.random() < self._id_delay_probability:
            ts = ts - self._id_delay_secs
        return '{:x}{}'.format(ts, u)

    def __get_epoch_md5(self):
        u = self.__get_uuid()
        # base64-encode the MD5 digest; the slice strips the leading "b'" of the string representation
        md5_str = str(base64.urlsafe_b64encode(hashlib.md5(u.encode('utf8')).digest()))[2:24]
        ts = int(time.time())
        # apply the configured delay with probability id_delay_probability
        if self._id_delay_probability > 0 and random.random() < self._id_delay_probability:
            ts = ts - self._id_delay_secs
        return hex(ts)[2:10] + md5_str

    def __get_seq_id(self):
        _id = self.seq_id
        if random.uniform(0, 1) < self._id_seq_probability:
            # conflict
            if self._low_id_bias:
                # update; heavily bias towards older ids
                _p = 10
                _min = 0
                _max = _id
                # _p >> 0: results closer to min, _p ~> 0: results closer to max
                _id = _min + (_max - _min) * pow(random.random(), _p)
            else:
                # update; pick id from a pure uniform distribution
                _id = random.randint(0, _id - 1 if _id > 0 else 0)
        else:
            # new document
            self.__incr_seq_id()
        return "%012d" % _id

    def __incr_seq_id(self):
        self.seq_id += 1
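# Illustrative sketch of the low-id bias used by __get_seq_id above: drawing u in [0, 1) and raising it
# to a large power pushes the result towards 0, so reused (update) ids are heavily skewed towards the
# oldest documents. The sample sizes below are arbitrary demonstration values.
def _low_id_bias_demo(max_id=1000, samples=10_000, p=10):
    import random
    biased = sorted(int(max_id * pow(random.random(), p)) for _ in range(samples))
    uniform = sorted(random.randint(0, max_id - 1) for _ in range(samples))
    # with p=10 the median biased draw sits near id 1 (roughly 0.1% of the range),
    # whereas the uniform median sits near max_id / 2
    print("biased median:", biased[samples // 2])
    print("uniform median:", uniform[samples // 2])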
class ElasticlogsBulkSource:
    """
    Generates a bulk indexing request for elasticlogs data.

    It expects the parameter hash to contain the following keys:
        "bulk-size"           - Integer indicating the number of events generated per bulk request. Defaults to 1000.
        "index"               - Name of the index, index prefix or alias that documents should be indexed into. The index
                                name can be made to generate time-based indices by including date formatting in the name.
                                'test-<yyyy>-<mm>-<dd>-<hh>' will generate an hourly index. (mandatory)
        "type"                - String specifying the event type. Defaults to the type of the index specification or,
                                if this is not present, 'logs'.
        "starting_point"      - String specifying the starting point for event time generation. It supports absolute or
                                relative values as follows:
                                    'now'                 - Always evaluated to the current timestamp at time of generation.
                                    'now-1h'              - Offset to the current timestamp. Consists of a number and
                                                            either m (minutes), h (hours) or d (days).
                                    '2017-02-20 20:12:32' - Exact timestamp.
                                    '2017-02-20'          - Date. Time will be assumed to be 00:00:00.
                                If a relative starting point (based on now) is provided, this will be used for generation.
                                In case an exact timestamp is provided as starting point, the difference to now will be
                                calculated when the generation starts and this will be used as an offset for all events.
                                If an interval is provided by also specifying an end_point, the range will be calculated
                                for each bulk request and each event will be assigned a random timestamp within this range.
                                Defaults to 'now'.
        "end_point"           - String specifying the end point for event time generation. It supports absolute or
                                relative values as follows:
                                    'now'                 - Always evaluated to the current timestamp at time of generation.
                                    'now-1h'              - Offset to the current timestamp. Consists of a number and
                                                            either m (minutes), h (hours) or d (days).
                                    '2017-02-20 20:12:32' - Exact timestamp.
                                    '2017-02-20'          - Date. Time will be assumed to be 00:00:00.
                                When specified, the event timestamp will be generated randomly within the interval defined
                                by the starting_point and end_point parameters. If end_point < starting_point, they will
                                be swapped.
        "acceleration_factor" - This factor only applies when an exact timestamp or date has been provided as starting
                                point and no end_point has been defined. It allows the time progression in the timestamp
                                calculation to be altered. A value larger than 1 will accelerate generation and a value
                                lower than 1 will slow it down. If a task is set up to run indexing for one hour with a
                                fixed starting point of '2017-02-20 20:12:32' and an acceleration factor of 2.0, events
                                will be generated in timestamp sequence covering a 2-hour window,
                                '2017-02-20 20:12:32' to '2017-02-20 22:12:32' (approximately).
    """
    def __init__(self, indices, params):
        self._indices = indices
        self._params = params
        self._randomevent = RandomEvent(params)

        self._bulk_size = 1000
        if 'bulk-size' in params.keys():
            self._bulk_size = params['bulk-size']

        self._default_index = False
        if 'index' not in params.keys():
            if len(indices) > 1:
                logger.debug("[bulk] More than one index specified in track configuration. "
                             "Will use the first one ({})".format(indices[0].name))
            else:
                logger.debug("[bulk] Using index specified in track configuration ({})".format(indices[0].name))

            self._params['index'] = indices[0].name
            self._default_index = True
        else:
            logger.debug("[bulk] Index pattern specified in parameters ({}) will be used".format(params['index']))

        if 'type' not in params.keys():
            self._params['type'] = indices[0].types[0].name

    def partition(self, partition_index, total_partitions):
        return self

    def size(self):
        return 1

    def params(self):
        # Build the bulk array
        bulk_array = []
        for x in range(0, self._bulk_size):
            evt, idx, typ = self._randomevent.generate_event()
            bulk_array.append('{"index": {"_index": "%s", "_type": "%s"}}' % (idx, typ))
            bulk_array.append(evt)

        response = {
            "body": "\n".join(bulk_array),
            "action_metadata_present": True,
            "bulk-size": self._bulk_size
        }

        if "pipeline" in self._params.keys():
            response["pipeline"] = self._params["pipeline"]

        return response
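# Hedged sketch of the bulk body this older variant produces: for every event an action-and-metadata line
# carrying "_index" and "_type" is followed by the document source, all joined with newlines. The index,
# type and documents below are made-up example values.
def _example_bulk_body():
    events = [
        ('{"message": "GET /index.html 200"}', "elasticlogs-2017.02.20", "logs"),
        ('{"message": "GET /about.html 404"}', "elasticlogs-2017.02.20", "logs"),
    ]
    lines = []
    for evt, idx, typ in events:
        lines.append('{"index": {"_index": "%s", "_type": "%s"}}' % (idx, typ))
        lines.append(evt)
    return "\n".join(lines)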
class MetricbeatBulkSource:
    """
    Generates a bulk indexing request for Metricbeat data.

    It expects the parameter hash to contain the following keys:
        "bulk-size"            - Integer indicating the number of events generated per bulk request. Defaults to 1000.
        "index"                - Name of the index, index prefix or alias that documents should be indexed into. The index
                                 name can be made to generate time-based indices by including date formatting in the name.
                                 'test-<yyyy>-<mm>-<dd>-<hh>' will generate an hourly index. (mandatory)
        "starting_point"       - String specifying the starting point for event time generation. It supports absolute or
                                 relative values as follows:
                                     'now'                 - Always evaluated to the current timestamp at time of generation.
                                     'now-1h'              - Offset to the current timestamp. Consists of a number and
                                                             either m (minutes), h (hours) or d (days).
                                     '2017-02-20 20:12:32' - Exact timestamp.
                                     '2017-02-20'          - Date. Time will be assumed to be 00:00:00.
                                 If a relative starting point (based on now) is provided, this will be used for generation.
                                 In case an exact timestamp is provided as starting point, the difference to now will be
                                 calculated when the generation starts and this will be used as an offset for all events.
                                 If an interval is provided by also specifying an end_point, the range will be calculated
                                 for each bulk request and each event will be assigned a random timestamp within this range.
                                 Defaults to 'now'.
        "end_point"            - String specifying the end point for event time generation. It supports absolute or
                                 relative values as follows:
                                     'now'                 - Always evaluated to the current timestamp at time of generation.
                                     'now-1h'              - Offset to the current timestamp. Consists of a number and
                                                             either m (minutes), h (hours) or d (days).
                                     '2017-02-20 20:12:32' - Exact timestamp.
                                     '2017-02-20'          - Date. Time will be assumed to be 00:00:00.
                                 When specified, the event timestamp will be generated randomly within the interval defined
                                 by the starting_point and end_point parameters. If end_point < starting_point, they will
                                 be swapped.
        "acceleration_factor"  - This factor only applies when an exact timestamp or date has been provided as starting
                                 point and no end_point has been defined. It allows the time progression in the timestamp
                                 calculation to be altered. A value larger than 1 will accelerate generation and a value
                                 lower than 1 will slow it down. If a task is set up to run indexing for one hour with a
                                 fixed starting point of '2016-12-20 20:12:32' and an acceleration factor of 2.0, events
                                 will be generated in timestamp sequence covering a 2-hour window,
                                 '2017-02-20 20:12:32' to '2017-02-20 22:12:32' (approximately).
        "id_type"              - Type of document id to use for generated documents. Defaults to `auto`.
                                     auto       - Do not explicitly set an id and let Elasticsearch assign one automatically.
                                     uuid       - Assign a UUID4 id to each document.
                                     epoch_uuid - Assign a UUID4 identifier prefixed with the hex representation of the
                                                  current timestamp.
                                     sha1       - SHA1 hash of a UUID in hex representation.
                                                  (Note: Generating this type of id can be CPU intensive)
                                     sha256     - SHA256 hash of a UUID in hex representation.
                                                  (Note: Generating this type of id can be CPU intensive)
                                     sha384     - SHA384 hash of a UUID in hex representation.
                                                  (Note: Generating this type of id can be CPU intensive)
                                     sha512     - SHA512 hash of a UUID in hex representation.
                                                  (Note: Generating this type of id can be CPU intensive)
        "id_delay_probability" - If id_type is set to `epoch_uuid`, this parameter determines the probability that the
                                 timestamp prefix will be set in the past. This can be used to simulate a portion of the
                                 events arriving delayed. Must be in range [0.0, 1.0]. Defaults to 0.0.
        "id_delay_secs"        - If an event is delayed, this number of seconds will be deducted from the current timestamp.
    """
    def __init__(self, track, params, **kwargs):
        self._indices = track.indices
        self._params = params
        self._randomevent = RandomEvent(params)

        self._bulk_size = 1000
        if 'bulk-size' in params.keys():
            self._bulk_size = params['bulk-size']

        self._id_type = "auto"
        if 'id_type' in params.keys():
            if params['id_type'] in ['auto', 'uuid', 'epoch_uuid', 'sha1', 'sha256', 'sha384', 'sha512']:
                self._id_type = params['id_type']
            else:
                logger.warning("[bulk] Invalid id_type ({}) specified. Will use default.".format(params['id_type']))

        if self._id_type == "epoch_uuid":
            if 'id_delay_probability' in params.keys():
                self._id_delay_probability = float(params['id_delay_probability'])
            else:
                self._id_delay_probability = 0.0

            if 'id_delay_secs' in params.keys():
                self._id_delay_secs = int(params['id_delay_secs'])
            else:
                self._id_delay_secs = 0

        self._default_index = False
        if 'index' not in params.keys():
            if len(self._indices) > 1:
                logger.debug("[bulk] More than one index specified in track configuration. "
                             "Will use the first one ({})".format(self._indices[0].name))
            else:
                logger.debug("[bulk] Using index specified in track configuration ({})".format(self._indices[0].name))

            self._params['index'] = self._indices[0].name
            self._default_index = True
        else:
            logger.debug("[bulk] Index pattern specified in parameters ({}) will be used".format(params['index']))

    def partition(self, partition_index, total_partitions):
        seed = partition_index * self._params["seed"] if "seed" in self._params else None
        random.seed(seed)
        return self

    def size(self):
        return 1

    def params(self):
        # Build the bulk array
        bulk_array = []
        for x in range(0, self._bulk_size):
            evt, idx, typ = self._randomevent.generate_event()
            if self._id_type == 'auto':
                bulk_array.append('{"index": {"_index": "%s"}}' % idx)
            else:
                if self._id_type == 'uuid':
                    docid = self.__get_uuid()
                elif self._id_type == 'sha1':
                    docid = hashlib.sha1(self.__get_uuid().encode()).hexdigest()
                elif self._id_type == 'sha256':
                    docid = hashlib.sha256(self.__get_uuid().encode()).hexdigest()
                elif self._id_type == 'sha384':
                    docid = hashlib.sha384(self.__get_uuid().encode()).hexdigest()
                elif self._id_type == 'sha512':
                    docid = hashlib.sha512(self.__get_uuid().encode()).hexdigest()
                else:
                    docid = self.__get_epoch_uuid()

                bulk_array.append('{"index": {"_index": "%s", "_id": "%s"}}' % (idx, docid))

            bulk_array.append(evt)

        response = {
            "body": "\n".join(bulk_array),
            "action-metadata-present": True,
            "bulk-size": self._bulk_size
        }

        if "pipeline" in self._params.keys():
            response["pipeline"] = self._params["pipeline"]

        return response

    def __get_uuid(self):
        # strip the dashes from the UUID4 string representation
        u = str(uuid.uuid4())
        return u[0:8] + u[9:13] + u[14:18] + u[19:23] + u[24:36]

    def __get_epoch_uuid(self):
        u = self.__get_uuid()
        ts = int(time.time())
        # apply the configured delay with probability id_delay_probability
        if self._id_delay_probability > 0 and random.random() < self._id_delay_probability:
            ts = ts - self._id_delay_secs
        return hex(ts)[2:10] + u
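# Standalone sketch of the epoch_uuid id scheme used above: a hex-encoded epoch-seconds prefix keeps ids
# of documents indexed around the same time close together, followed by a dash-less UUID4 for uniqueness.
# The delay handling mirrors the intent of id_delay_probability/id_delay_secs; the values are examples.
def _example_epoch_uuid(delay_probability=0.1, delay_secs=60):
    import random
    import time
    import uuid
    ts = int(time.time())
    if delay_probability > 0 and random.random() < delay_probability:
        ts -= delay_secs    # simulate an event that arrives late
    return hex(ts)[2:10] + str(uuid.uuid4()).replace('-', '')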
        'raw',
    ])
except getopt.GetoptError as err:
    print('ERROR:', err)
    sys.exit(1)

for opt, arg in options:
    if opt in ('-c', '--count'):
        if arg.isdigit():
            documents_to_generate = int(arg)
            if documents_to_generate < 1:
                print("ERROR: -c/--count must be followed by a positive integer.")
                sys.exit(0)
        else:
            print("ERROR: -c/--count must be followed by a positive integer.")
            sys.exit(0)
    elif opt in ('-r', '--raw'):
        raw_mode = True

randomevent = RandomEvent({})

for k in range(documents_to_generate):
    evt, idx, typ = randomevent.generate_event()
    if raw_mode:
        print(evt['message'])
    else:
        print(json.dumps(evt))
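# Example invocation of the generator script above (the script name is hypothetical; -c/--count controls
# how many documents are generated and -r/--raw prints only the raw message field instead of the full
# JSON document):
#
#     python generate_events.py --count 10 --raw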