def process(self, start_time:datetime, end_time:datetime, input:DataFrame):
    """Query Elasticsearch for matching log entries and add them to ``input``.

    The search runs against the first configured index, is restricted to
    ``@timestamp`` values between ``start_time`` and ``end_time`` (inclusive)
    and adds one ``match`` query per configured field value.

    :param start_time: lower bound of the ``@timestamp`` range
    :param end_time: upper bound of the ``@timestamp`` range
    :param input: frame the hits are added to (it is not modified in place)
    :return: ``input`` itself when there are no hits, otherwise a new frame
        holding the rows of ``input`` followed by one row per hit
    """
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    from pandas import concat

    logger.debug('Start: %s End: %s Log: index=%s fields=%s' % (start_time.isoformat(), end_time.isoformat(), str(self.indices), str(self.fields)))
    search = Search(using=self.client, index=self.indices[0])
    search = search.filter(Range(** {'@timestamp': {'gte': start_time.isoformat(), 'lte': end_time.isoformat()}}))
    for k, v in self.fields.items():
        # A list of values adds one match clause per value.
        values = v if isinstance(v, list) else [v]
        for sv in values:
            search = search.query("match", **{k: sv})
    logger.debug('ES Query: %s' % str(search.to_dict()))
    response = search.execute()
    logger.debug('Results: success:%d failed:%d hits:%d' % (response._shards.successful, response._shards.failed, len(response.hits)))

    rows = []
    for hit in response:
        # filter out the meta key and flatten the values
        row = {k: str(hit[k]) for k in hit if k != 'meta'}
        logger.debug(row)
        rows.append(row)
    if not rows:
        return input
    # One concatenation instead of one full copy of the frame per hit.
    return concat([input, DataFrame(rows)], ignore_index=True)
def list_transactions(self, account_id: str, since: datetime.datetime = None,
                      before: datetime.datetime = None, limit: int = None) -> List[Transaction]:
    """
    Fetch the transactions of an account, merchant data expanded.

    :param account_id: account id
    :param since: only list transactions after that date
    :param before: only list transactions before that date
    :param limit: only show a number of transactions
    :return: A list of Transaction objects
    """
    query = {'account_id': account_id, 'expand[]': 'merchant'}
    if since:
        query['since'] = since.isoformat('T') + 'Z'
    if before:
        query['before'] = before.isoformat('T') + 'Z'
    if limit:
        query['limit'] = limit

    payload = self._make_request('/transactions', query)
    transactions = []
    for raw in payload['transactions']:
        transactions.append(Transaction(client=self, **raw))
    return transactions
def get_eventcount(self, bucket_id: str, limit: int=100, start: datetime=None, end: datetime=None) -> int:
    """Return the event count reported for ``bucket_id``.

    ``start`` and ``end`` are sent as ISO-formatted query parameters when
    given. ``limit`` is accepted but not sent with the request.
    """
    query = {}  # type: Dict[str, str]
    for key, moment in (("start", start), ("end", end)):
        if moment is not None:
            query[key] = moment.isoformat()
    reply = self._get("buckets/{}/events/count".format(bucket_id), params=query)
    return int(reply.text)
def get_events(self, bucket_id: str, limit: int=100, start: datetime=None, end: datetime=None) -> List[Event]:
    """Fetch the events of ``bucket_id`` and wrap each one in an ``Event``.

    ``limit``, ``start`` and ``end`` are each sent as a query parameter only
    when not ``None``; the datetimes are sent in ISO format.
    """
    query = {}  # type: Dict[str, str]
    if limit is not None:
        query["limit"] = str(limit)
    for key, moment in (("start", start), ("end", end)):
        if moment is not None:
            query[key] = moment.isoformat()
    raw_events = self._get("buckets/{}/events".format(bucket_id), params=query).json()
    result = []
    for raw in raw_events:
        result.append(Event(**raw))
    return result
def _to_isoformat(value: datetime) -> str:
    """Return ``value`` in ISO format, dropping a trailing zero-time suffix.

    A falsy ``value`` gives an empty string. When the ISO text ends with
    ``_ZERO_ISO_TIME_PART`` that suffix is cut off.
    """
    if not value:
        return ''
    iso = value.isoformat()
    if not iso.endswith(_ZERO_ISO_TIME_PART):
        return iso
    return iso[0:-len(_ZERO_ISO_TIME_PART)]
def populate_full_water_temperature(request):
    """Fetch water-temperature ('WT') series in 10-day batches and save them.

    Starting on 2016-01-01, every station returned by ``get_station_num`` is
    queried one 10-day window at a time until the window start passes the
    current time. Each returned reading is stored as a ``WT`` row.

    Saving is best effort: a reading that fails to save is reported on
    stdout and skipped, so one bad row does not abort the import.

    :param request: the incoming request (not inspected)
    :return: an ``HttpResponse`` confirming the run
    """
    obj = SoapCalls()
    DataFormat = 'json'
    sensors = obj.GetSensors(DataFormat)
    stationNumbers = get_station_num(sensors, 'WT')

    one_day = datetime.timedelta(days=1)
    sDate = DT(2016, 1, 1)
    eDate = sDate + datetime.timedelta(days=9)
    count = 0
    # Fetch the current window first and advance afterwards, so that the
    # initial 2016-01-01..2016-01-10 window is no longer skipped.
    while sDate <= DT.today():
        count = count + 1
        print ("**** Writing batch: %s" % count)
        for station in stationNumbers:
            temperature_x = obj.GetTimeSeriesData(station, ['WT'], sDate.isoformat(), eDate.isoformat(), DataFormat)
            for reading in temperature_x['TimeSeriesData']:
                wt_datetime = DT.strptime(reading['TimeStamp'], '%m/%d/%Y %I:%M:%S %p')
                try:
                    wt = WT(station_id=reading['StationID'], station_name=reading['StationName'], timestamp=wt_datetime, value=reading['Value'])
                    wt.save()
                except Exception as exc:
                    # Best effort: report the failure and keep going.
                    print ("**** Skipping reading for station %s at %s: %s" % (reading.get('StationID'), wt_datetime, exc))
                    continue
        sDate = eDate + one_day
        eDate = sDate + datetime.timedelta(days=9)
    return HttpResponse('Written to database FULL')
def query(self, query: str, start: datetime, end: datetime, name: str=None, cache: bool=False) -> Union[int, dict]:
    """Post a query for the period ``start``/``end`` and return its result.

    The query text is split into lines before sending. Caching requires a
    query ``name``; without one an ``Exception`` is raised. A reply whose text
    is all digits is returned as ``int``, anything else as decoded JSON.
    """
    params = {}  # type: Dict[str, Any]
    if cache and not name:
        raise Exception("You are not allowed to do caching without a query name")
    if cache:
        params["name"] = name
        params["cache"] = int(cache)

    period = "/".join([start.isoformat(), end.isoformat()])
    data = {
        'timeperiods': [period],
        'query': query.split("\n")
    }
    response = self._post("query/", data, params=params)
    return int(response.text) if response.text.isdigit() else response.json()
def get_datetime_string(dt: datetime = None, with_timezone: bool = True) -> typing.Optional[str]:
    """Return ``dt`` (or the current time when ``dt`` is falsy) in ISO format.

    With ``with_timezone`` the value first goes through ``astimezone()``, so
    the string carries the local UTC offset.
    """
    moment = dt if dt else datetime.now()
    if with_timezone:
        moment = moment.astimezone()
    return moment.isoformat()
def format_datetime(__datetime: datetime.datetime) -> str:
    """Render a datetime as an ISO-8601 string, writing a UTC offset as ``Z``.

    Args:
        __datetime: Datetime to process

    Returns:
        ISO-8601 compatible string
    """
    iso_text = __datetime.isoformat()
    return iso_text.replace('+00:00', 'Z')
def dateTimeAsRFC3339Text(dateTime: DateTime) -> str:
    """
    Render a :class:`DateTime` as an RFC 3339 formatted date-time string.

    :param dateTime: A non-naive :class:`DateTime` to convert.

    :return: The ``isoformat()`` text of :obj:`dateTime`.
    """
    text = dateTime.isoformat()
    return text
async def list_transactions_async(self, account_id: str, since: datetime.datetime = None,
                                  before: datetime.datetime = None, limit: int = None):
    """
    Asynchronously fetch the transactions of an account.

    :param account_id: account id
    :param since: only list transactions after that date
    :param before: only list transactions before that date
    :param limit: only show a number of transactions (sent as a string)
    :return: A list of Transaction objects
    """
    query = {'account_id': account_id}
    if since:
        query['since'] = since.isoformat('T') + 'Z'
    if before:
        query['before'] = before.isoformat('T') + 'Z'
    if limit:
        query['limit'] = str(limit)

    content = await self._make_async_request('/transactions', query)
    transactions = []
    for raw in content['transactions']:
        transactions.append(Transaction(client=self, **raw))
    return transactions
def add_appointment(self, start: datetime_type, end: datetime_type, location: str, description: str,
                    doctor_ids: list, nurse_ids: list, patient_ids: list):
    """
    Queue one appointment record for export.

    :param start: The start date of the appointment
    :param end: The end date of the appointment
    :param location: The physical location of the appointment
    :param description: A description of the appointment
    :param doctor_ids: A list of ids of the doctors attending
    :param nurse_ids: A list of ids of the nurses attending
    :param patient_ids: A list of ids of the patients attending
    :return: None
    """
    record = {
        'start': start.isoformat(),
        'end': end.isoformat(),
        'location': location,
        'description': description,
        'doctor_ids': doctor_ids,
        'nurse_ids': nurse_ids,
        'patient_ids': patient_ids
    }
    self.__export_scheme['appointments'] += [record]
def set_maintenance_overlay(self, next_maintenance_time: datetime) -> None:
    """
    Show the maintenance overlay once per maintenance window and user; the
    user has to dismiss it actively.

    The session stores the maintenance time the user was last warned about.
    A missing, unparsable or already elapsed stored time triggers the overlay
    again and records ``next_maintenance_time`` in the session.
    """
    session = self.request.session
    stored = session.get(self.session_key, '')
    try:
        seen_until = parse_datetime(stored)
    except ValueError:
        seen_until = None

    # Nothing to do while the stored time is still in the future.
    if seen_until and not (seen_until < now()):
        return

    session[self.session_key] = next_maintenance_time.isoformat()
    text = self.get_full_maintenance_text(next_maintenance_time)
    self.overlay.update({'maintenance_warn_overlay': text})
def schedule_nagbot_message(message: str, short_message: str, deadline: datetime,
                            policies: List[dict], uid: str = None) -> str:
    '''
    Register a message that NagBot sends repeatedly.

    Every policy is validated first, then the message is PUT, Hawk-signed, to
    the notification policy service.

    :param message: Long description of message (ie email body)
    :param short_message: Short description of message (ie email subject, IRC message)
    :param deadline: Message expiry date
    :param policies: Notification policies described in dict format
    :param uid: Optionally specify tracking uid. A random uid will be generated if not given
    :return: Tracking uid for the notification
    '''
    for policy in policies:
        verify_policy_structure(policy)

    if uid is None:
        uid = generate_random_uid()

    request_url = current_app.config['RELENG_NOTIFICATION_POLICY_URL'] + '/message/' + uid
    payload = {
        'deadline': deadline.isoformat(),
        'message': message,
        'shortMessage': short_message,
        'policies': policies,
    }
    message_body = json.dumps(payload)

    # The signature covers the exact body that is sent below.
    sender = mohawk.Sender(
        get_current_app_credentials(),
        request_url,
        'put',
        content=message_body,
        content_type='application/json',
    )
    headers = {
        'Authorization': sender.request_header,
        'Content-Type': 'application/json',
    }

    # Support dev ssl ca cert
    ssl_dev_ca = current_app.config.get('SSL_DEV_CA')
    if ssl_dev_ca is not None:
        assert os.path.isdir(ssl_dev_ca), 'SSL_DEV_CA must be a dir with hashed dev ca certs'

    response = put(request_url, headers=headers, data=message_body, verify=ssl_dev_ca)
    response.raise_for_status()
    return uid
def _do_retrieval(self, updates_since: datetime) -> UpdateCollection:
    """
    Retrieve updates through the update mapper, pass them to the listeners
    and record a log entry describing the retrieval.

    :param updates_since: the time from which to retrieve updates since
    :return: the updates retrieved
    """
    logging.debug("Starting update retrieval...")

    # Do retrieve; the duration is measured on the monotonic clock
    started_at_clock_time = RetrievalManager._get_clock_time()
    query_started = RetrievalManager._get_monotonic_time()
    updates = self.update_mapper.get_all_since(updates_since)
    query_duration = RetrievalManager._get_monotonic_time() - query_started
    assert updates is not None
    logging.debug("Retrieved %d updates since %s (query took: %s)"
                  % (len(updates), updates_since, query_duration))

    # Notify listeners of retrieval
    if len(updates) > 0:
        logging.debug("Notifying %d listeners of %d update(s)" % (len(self.get_listeners()), len(updates)))
        self.notify_listeners(updates)

    # Store log of retrieval
    if len(updates) > 0:
        most_recent_retrieved = updates.get_most_recent()[0].timestamp
    else:
        most_recent_retrieved = None
    if most_recent_retrieved is None:
        most_recent_text = None
    else:
        most_recent_text = most_recent_retrieved.isoformat()

    self._logger.record(
        MEASURED_RETRIEVAL,
        {
            MEASURED_RETRIEVAL_UPDATES_SINCE: updates_since.isoformat(),
            MEASURED_RETRIEVAL_STARTED_AT: started_at_clock_time.isoformat(),
            MEASURED_RETRIEVAL_DURATION: query_duration,
            MEASURED_RETRIEVAL_UPDATE_COUNT: len(updates),
            MEASURED_RETRIEVAL_MOST_RECENT_RETRIEVED: most_recent_text
        }
    )
    return updates
def save_plugin_timestamp(self, name: str, timestamp: datetime) -> None:
    """Store ``timestamp`` in ISO format under ``name`` in the workflow's plugin timestamps."""
    iso_text = timestamp.isoformat()
    self.workflow.data.plugins_timestamps[name] = iso_text
def process(self, start_time:datetime, end_time:datetime, input:DataFrame):
    """Log the requested window and the configured fields; return ``input`` unchanged."""
    window = (start_time.isoformat(), end_time.isoformat())
    message = 'Start: %s End: %s Event: fields=%s' % (window[0], window[1], str(self.fields))
    logger.debug(message)
    return input
def _get_datafile_name(self, timestamp: datetime):
    """Build the data file path for a naive ``timestamp``.

    The name is ``<self.path>/<ISO timestamp><FILE_EXT>``; a timezone-aware
    timestamp trips the assertion.
    """
    assert timestamp.tzinfo is None
    stem = timestamp.isoformat()
    return '{}/{}{}'.format(self.path, stem, FILE_EXT)
def zulu_time(dt: datetime.datetime):
    """Return ``dt`` as an ISO 8601 string in UTC with a ``Z`` suffix.

    Aware datetimes are converted to UTC first, so a non-UTC offset is
    honoured instead of being cut off. Naive datetimes are taken to be UTC
    already and formatted as they are.

    :param dt: the datetime to format
    :return: e.g. ``2020-01-02T03:04:05Z``
    """
    if dt.utcoffset() is None:
        # No offset to strip: slicing six characters here would eat the time.
        utc_dt = dt
    else:
        utc_dt = dt.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    return utc_dt.isoformat() + "Z"
def _convert_from_python(self, value: datetime.datetime, state) -> str:
    """Convert a Python datetime to its ISO-format string; ``state`` is unused."""
    serialized = value.isoformat()
    return serialized
def make_time(t: datetime) -> Dict:
    """Build an RFC 3339 time value dict for ``t``.

    Naive datetimes are formatted as they are and suffixed with ``Z``. Aware
    datetimes are shifted to UTC and stripped of their offset first, so the
    result never ends in an invalid ``+HH:MMZ``.

    :param t: the datetime to describe
    :return: ``{'value': '<ISO time>Z', 'format': 'RFC3339'}``
    """
    offset = t.utcoffset()
    if offset is not None:
        # Subtracting the offset gives the UTC wall-clock time.
        t = (t - offset).replace(tzinfo=None)
    return {
        'value': t.isoformat() + 'Z',
        'format': 'RFC3339'
    }
def datetime_to_json(self, t: datetime.datetime) -> Optional[str]:
    """Return the ISO-format string for ``t``, or ``None`` when ``t`` is ``None``."""
    return None if t is None else t.isoformat()
def get_complete_latencies(
    self,
    topology_id: str,
    cluster: str,
    environ: str,
    start: dt.datetime,
    end: dt.datetime,
    **kwargs: Union[str, int, float],
) -> pd.DataFrame:
    """ Gets the complete latencies, as a timeseries, for every instance of
    all the spout components of the specified topology. The start and end
    times define the window over which to gather the metrics. The window
    duration should be less than 3 hours as this is the limit of what the
    Topology master stores.

    Spouts whose metrics request fails with an HTTPError are logged and left
    out of the result.

    Arguments:
        topology_id (str): The topology identification string.
        cluster (str): The cluster the topology is running in.
        environ (str): The environment the topology is running in (eg.
                       prod, devel, test, etc).
        start (datetime): utc datetime instance for the start of the
                          metrics gathering period.
        end (datetime): utc datetime instance for the end of the
                        metrics gathering period.

    Returns:
        pandas.DataFrame: A DataFrame containing the service time
        measurements as a timeseries. Each row represents a measurement
        (aggregated over one minute) with the following columns:

        * timestamp: The UTC timestamp for the metric,
        * component: The component this metric comes from,
        * task: The instance ID number for the instance that the metric
          comes from,
        * container: The ID for the container this metric comes from,
        * stream: The name of the incoming stream from which the tuples
          that lead to this metric came from,
        * latency_ms: The average execute latency measurement in
          milliseconds for that metric time period.

        An empty DataFrame is returned for ATMOST_ONCE topologies and None
        when no spout yielded any data.

    Warns:
        RuntimeWarning: If the specified topology has a reliability mode
                        that does not enable complete latency.
    """
    LOG.info(
        "Getting complete latencies for topology %s over a %d second "
        "period from %s to %s",
        topology_id,
        (end - start).total_seconds(),
        start.isoformat(),
        end.isoformat(),
    )

    logical_plan, start_time, end_time = self._query_setup(
        topology_id, cluster, environ, start, end
    )

    # First we need to check that the supplied topology will actually have
    # complete latencies. Only ATLEAST_ONCE and EXACTLY_ONCE will have
    # complete latency values as acking is disabled for ATMOST_ONCE.
    physical_plan: Dict[str, Any] = tracker.get_physical_plan(
        self.tracker_url, cluster, environ, topology_id
    )
    if physical_plan["config"]["topology.reliability.mode"] == "ATMOST_ONCE":
        rm_msg: str = (
            f"Topology {topology_id} reliability mode is set "
            f"to ATMOST_ONCE. Complete latency is not "
            f"available for these types of topologies"
        )
        LOG.warning(rm_msg)
        warnings.warn(rm_msg, RuntimeWarning)
        return pd.DataFrame()

    frames = []
    spouts: Dict[str, Any] = logical_plan["spouts"]
    for spout_component in spouts:
        try:
            spout_complete_latencies: pd.DataFrame = self.get_spout_complete_latencies(
                topology_id,
                cluster,
                environ,
                spout_component,
                start_time,
                end_time,
                logical_plan,
            )
        except HTTPError as http_error:
            LOG.warning(
                "Fetching complete latencies for component %s "
                "failed with status code %s",
                spout_component,
                str(http_error.response.status_code),
            )
            # Skip this spout: there is no frame to add (previously an
            # unbound or stale frame was used here).
            continue
        frames.append(spout_complete_latencies)

    if not frames:
        return None
    if len(frames) == 1:
        return frames[0]
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(frames, ignore_index=True)
def get_execute_counts(
    self,
    topology_id: str,
    cluster: str,
    environ: str,
    start: dt.datetime,
    end: dt.datetime,
    **kwargs: Union[str, int, float],
) -> pd.DataFrame:
    """ Gets the execute counts, as a timeseries, for every instance of each
    of the bolt components of the specified topology. The start and end
    times define the window over which to gather the metrics. The window
    duration should be less than 3 hours as this is the limit of what the
    Topology master stores.

    Components whose metrics request fails with an HTTPError are logged and
    left out of the result.

    Arguments:
        topology_id (str): The topology identification string.
        cluster (str): The cluster the topology is running in.
        environ (str): The environment the topology is running in (eg.
                       prod, devel, test, etc).
        start (datetime): UTC datetime instance for the start of the
                          metrics gathering period.
        end (datetime): UTC datetime instance for the end of the
                        metrics gathering period.

    Returns:
        pandas.DataFrame: A DataFrame containing the service time
        measurements as a timeseries. Each row represents a measurement
        (aggregated over one minute) with the following columns:

        * timestamp: The UTC timestamp for the metric,
        * component: The component this metric comes from,
        * task: The instance ID number for the instance that the metric
          comes from,
        * container: The ID for the container this metric comes from.
        * stream: The name of the incoming stream from which the tuples
          that lead to this metric came from,
        * source_component: The name of the component the stream's source
          instance belongs to,
        * execute_count: The execute count during the metric time period.

        None is returned when no component yielded any data.
    """
    LOG.info(
        "Getting execute counts for topology %s over a %d second "
        "period from %s to %s",
        topology_id,
        (end - start).total_seconds(),
        start.isoformat(),
        end.isoformat(),
    )

    logical_plan, start_time, end_time = self._query_setup(
        topology_id, cluster, environ, start, end
    )

    frames = []
    for component in logical_plan["bolts"].keys():
        try:
            comp_execute_counts: pd.DataFrame = self.get_component_execute_counts(
                topology_id,
                cluster,
                environ,
                component,
                start_time,
                end_time,
                logical_plan,
            )
        except HTTPError as http_error:
            LOG.warning(
                "Fetching execute counts for component %s failed "
                "with status code %s",
                component,
                str(http_error.response.status_code),
            )
            # Skip this component: there is no frame to add (previously an
            # unbound or stale frame was used here).
            continue
        frames.append(comp_execute_counts)

    if not frames:
        return None
    if len(frames) == 1:
        return frames[0]
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(frames, ignore_index=True)
def _from_date(obj: datetime):
    """Serialize ``obj`` through its ``isoformat()`` method."""
    iso_text = obj.isoformat()
    return iso_text
def log_same(self, t0: datetime.datetime, t1: datetime.datetime, code: str) -> None:
    """Append a YAML document with ``t0``, ``t1`` and ``code`` to the no-change query log."""
    target = os.path.join(self.log_path, '000000_nochange_queries.yaml')
    with open(target, 'a') as f:
        record = {
            't0': t0.isoformat(),
            't1': t1.isoformat(),
            'code': code,
        }
        document = yaml.dump(record, explicit_start=True)
        f.write(document)
def datetime_encoder(date: datetime.datetime):
    """Encode ``date`` in ISO format, writing a trailing ``+00:00`` offset as ``Z``."""
    utc_suffix = '+00:00'
    text = date.isoformat()
    if not text.endswith(utc_suffix):
        return text
    return text[:-6] + 'Z'
def isoformat(datetime: dt.datetime) -> str:
    """Give the ISO8601 text for a datetime object.

    :param datetime datetime: The datetime.
    """
    formatted = datetime.isoformat()
    return formatted
def to_json(self, value: datetime) -> str:
    """Serialize ``value`` in ISO format, always including microseconds."""
    precision = 'microseconds'
    return value.isoformat(timespec=precision)
def _build_token_value(self, user_id: str, timestamp: datetime, secret: str) -> bytes:
    """Concatenate the UTF-8 bytes of the timestamp, the secret and the user id.

    The annotations are corrected: ``secret`` is a string and the result is
    ``bytes``, not ``str``. Characters of ``secret`` and ``user_id`` that
    cannot be encoded are replaced rather than raising.

    :param user_id: id of the user the token belongs to
    :param timestamp: token time, included in ISO format
    :param secret: secret mixed into the value
    :return: ``timestamp`` + ``secret`` + ``user_id`` as bytes, in that order
    """
    time_bytes = timestamp.isoformat().encode('UTF-8')
    secret_bytes = secret.encode('UTF-8', 'replace')
    user_bytes = user_id.encode('UTF-8', 'replace')
    return time_bytes + secret_bytes + user_bytes
def format_time(self, dt: datetime):
    """Format ``dt`` for display.

    Delegates to ``self._system.format_time`` when a system is set; otherwise
    returns ``YYYY-MM-DD HH:MM:SS`` followed by `` UTC``.
    """
    if not self._system:
        return dt.isoformat(sep=" ", timespec="seconds") + " UTC"
    return self._system.format_time(dt)
def get_arrival_rates(
    self,
    topology_id: str,
    cluster: str,
    environ: str,
    start: dt.datetime,
    end: dt.datetime,
    **kwargs: Union[str, int, float],
) -> pd.DataFrame:
    """ Computes the arrival rates, as a timeseries, for every instance of
    each of the bolt components of the specified topology.

    The execute counts for the window are summed per task, component and
    timestamp to give the arrival count, which is then divided by
    DEFAULT_METRIC_PERIOD to give the rate. The window duration should be
    less than 3 hours as this is the limit of what the Topology master
    stores.

    Arguments:
        topology_id (str): The topology identification string.
        cluster (str): The cluster the topology is running in.
        environ (str): The environment the topology is running in (eg.
                       prod, devel, test, etc).
        start (datetime): utc datetime instance for the start of the
                          metrics gathering period.
        end (datetime): utc datetime instance for the end of the
                        metrics gathering period.

    Returns:
        pandas.DataFrame: A DataFrame containing the arrival rate
        measurements as a timeseries. Each row represents a measurement
        (aggregated over one minute) with the following columns:

        * timestamp: The UTC timestamp for the metric,
        * component: The component this metric comes from,
        * task: The instance ID number for the instance that the metric
          comes from,
        * container: The ID for the container this metric comes from,
        * arrival_count: The number of arrivals (across all streams) at
          each instance.
        * arrival_rate_tps: The arrival rate at each instance (across all
          streams) in units of tuples per second.
    """
    window_seconds = (end - start).total_seconds()
    LOG.info(
        "Getting arrival rates for topology %s over a %d second "
        "period from %s to %s",
        topology_id,
        window_seconds,
        start.isoformat(),
        end.isoformat(),
    )

    execute_counts: pd.DataFrame = self.get_execute_counts(
        topology_id, cluster, environ, start, end
    )

    summed = execute_counts.groupby(["task", "component", "timestamp"]).sum()
    flattened = summed.reset_index()
    arrivals: pd.DataFrame = flattened.rename(
        index=str, columns={"execute_count": "arrival_count"}
    )

    arrivals["arrival_rate_tps"] = arrivals["arrival_count"] / DEFAULT_METRIC_PERIOD

    return arrivals
def format_datetime(dt: datetime, with_tz=True, timespec="microseconds") -> str:
    """Normalise ``dt`` and format it in ISO style with the given ``timespec``.

    A ``Z`` is appended when ``with_tz`` is true.
    """
    text = normalise_dt(dt).isoformat(timespec=timespec)
    return text + "Z" if with_tz else text
async def search(self, start: datetime, end: datetime) -> bool:
    """Search the index for documents whose ``@timestamp`` lies in ``[start, end)``.

    ``end`` used to be ignored in favour of a hard-coded ``'now'`` upper
    bound; it is honoured now. Passing ``None`` keeps the previous ``'now'``
    behaviour.

    :param start: inclusive lower bound of the time range
    :param end: exclusive upper bound of the time range, or ``None`` for now
    :return: whatever ``self.es.search`` resolves to
        (NOTE(review): annotated as ``bool`` - confirm against the client)
    """
    upper_bound = 'now' if end is None else end.isoformat()
    body = {
        'query': {
            'range': {
                '@timestamp': {'gte': start.isoformat(), 'lt': upper_bound}
            }
        }
    }
    return await self.es.search(index=self.index, body=body)
def generate_bullets(search_start: datetime, detailed: bool = False):
    """Render a markdown report of GitHub activity for the ASFHyP3 organization.

    For every repository of the organization this collects: open issues
    returned for the period since ``search_start``, releases created at or
    after ``search_start``, closed pull requests against ``develop`` merged
    after both ``search_start`` and the latest release, and all open pull
    requests. The result is rendered through a template and written to
    ``report.md`` (or ``report_detailed.md`` when ``detailed`` is true) in
    the current working directory.

    :param search_start: start of the reporting period; converted to a fixed
        UTC-9 ("AKST") offset before use
    :param detailed: select the detailed template and report file name
    """
    akst = tz.tzoffset('AKST', timedelta(hours=-9))
    aknow = datetime.now(akst)
    search_start = search_start.astimezone(akst)
    meta = {
        'title': 'Tools Team bullets',
        'description': f"Tools team bullets for {search_start.isoformat(timespec='seconds')}"
                       f" through {aknow.isoformat(timespec='seconds')}",
    }
    log.info(f'Generating {meta["description"]}')

    gh = GhApi()
    # Collected details; the keys only serve to de-duplicate entries.
    release_details = {}
    dev_prs = {}
    open_prs = {}
    opened_issues = {}
    for repo in tqdm(gh.repos.list_for_org('ASFHyP3')):
        # FIXME: Returns issues and PRs... simpler to filter this one list or make the three calls?
        for issue in gh.issues.list_for_repo(
                repo.owner.login, repo.name, state='open', sort='created', direction='desc',
                since=search_start.isoformat(timespec='seconds')):
            # Entries carrying a 'pull_request' key are PRs; keep plain issues only.
            if issue.get('pull_request') is None:
                opened_issues[issue.id] = util.get_details(issue)

        try:
            last_release = parse_date(
                gh.repos.get_latest_release(repo.owner.login, repo.name).created_at)
            for release in gh.repos.list_releases(repo.owner.login, repo.name):
                created_at = parse_date(release.created_at)
                if created_at >= search_start:
                    release_details[
                        release.target_commitish] = util.get_details(release)
                else:
                    # Stop at the first release older than the period
                    # (presumably releases are listed newest first - TODO confirm).
                    break
        except HTTP404NotFoundError:
            # A 404 here is treated as "no release": fall back to the period start.
            last_release = search_start

        # FIXME: might be able to use issues.list_for_repo with since=... to simplify logic
        for pull in gh.pulls.list(repo.owner.login, repo.name, state='closed', base='develop', sort='updated', direction='desc'):
            merged_at = pull.get('merged_at')
            # Closed-but-unmerged PRs have no 'merged_at' and are skipped.
            if merged_at and parse_date(merged_at) > max(
                    search_start, last_release):
                dev_prs[pull.merge_commit_sha] = util.get_details(pull)

        for pull in gh.pulls.list(repo.owner.login, repo.name, state='open', sort='created', direction='desc'):
            open_prs[pull.head.sha] = util.get_details(pull)

    template = 'report_detailed.md.j2' if detailed else 'report.md.j2'
    report_name = 'report_detailed.md' if detailed else 'report.md'
    report = util.render_template(
        template,
        releases=release_details,
        meta=meta,
        dev_prs=dev_prs,
        open_prs=open_prs,
        opened_issues=opened_issues,
    )
    with open(report_name, 'w') as f:
        f.write(report)
def dump(self, value: datetime) -> str:
    """Dump ``value`` as its ISO-format string."""
    dumped = value.isoformat()
    return dumped
def datetime_to_json(data: datetime.datetime) -> str:
    """Turn ``data`` into the ISO-format string used in JSON output."""
    as_text = data.isoformat()
    return as_text
def serialize_datetime(value: datetime) -> str:
    """Serialize ``value`` to its ISO-format string."""
    serialized = value.isoformat()
    return serialized
def create_bucket(self, bucket_id: str, type: str, client: str, hostname: str,
                  created: Optional[datetime] = None, name: Optional[str] = None) -> "Bucket":
    """Create a bucket in the storage backend and return it.

    :param bucket_id: id of the new bucket
    :param type: bucket type, passed through to the storage strategy
    :param client: name of the client creating the bucket
    :param hostname: host the bucket belongs to
    :param created: creation time; defaults to the current UTC time *at call
        time* (the previous default was evaluated once at definition time, so
        every bucket got the same stale timestamp)
    :param name: optional bucket name
    :return: the bucket as looked up through ``self[bucket_id]``
    """
    if created is None:
        created = datetime.now(timezone.utc)
    self.logger.info("Creating bucket '{}'".format(bucket_id))
    self.storage_strategy.create_bucket(bucket_id, type, client, hostname, created.isoformat(), name=name)
    return self[bucket_id]
def default(self, to_encode: datetime) -> str:
    """Encode ``to_encode`` as its ISO-format string."""
    encoded = to_encode.isoformat()
    return encoded
def convert_dttm(cls, target_type: str, dttm: datetime) -> str:
    """Return a ``TO_TIMESTAMP`` SQL expression for ``dttm``.

    The format mask ends in ``.ff6``, so the literal must always carry six
    fractional digits. ``isoformat()`` drops the fraction when microseconds
    are zero; ``timespec="microseconds"`` keeps the literal in line with the
    mask.

    :param target_type: target column type (not used)
    :param dttm: the datetime to embed
    :return: the SQL expression as a string
    """
    return (
        """TO_TIMESTAMP('{}', 'YYYY-MM-DD"T"HH24:MI:SS.ff6')"""
    ).format(dttm.isoformat(timespec="microseconds"))
def _datetime_to_iso(self, d: datetime) -> str:
    """Return the ISO-format string of ``d``."""
    return d.isoformat()
def purge_old_data(
    instance: Recorder,
    purge_before: datetime,
    repack: bool,
    apply_filter: bool = False,
    events_batch_size: int = DEFAULT_EVENTS_BATCHES_PER_PURGE,
    states_batch_size: int = DEFAULT_STATES_BATCHES_PER_PURGE,
) -> bool:
    """Purge events and states older than purge_before.

    Cleans up an timeframe of an hour, based on the oldest record.

    One call does one round of work inside a single session scope. It
    returns False as soon as a step reports that more remains to be purged,
    so the caller is expected to call again until True is returned.

    Args:
        instance: recorder whose session and dialect are used.
        purge_before: cut-off passed to every purge/select helper.
        repack: run ``repack_database`` once purging has completed.
        apply_filter: also purge filtered data via ``_purge_filtered_data``.
        events_batch_size: batch size handed to the events purge helper.
        states_batch_size: batch size handed to the states purge helper.

    Returns:
        True when this round found nothing left to purge, False otherwise.
    """
    _LOGGER.debug(
        "Purging states and events before target %s",
        purge_before.isoformat(sep=" ", timespec="seconds"),
    )
    using_sqlite = instance.dialect_name == SupportedDialect.SQLITE

    with session_scope(session=instance.get_session()) as session:
        # Purge a max of MAX_ROWS_TO_PURGE, based on the oldest states or events record
        has_more_to_purge = False
        if _purging_legacy_format(session):
            _LOGGER.debug(
                "Purge running in legacy format as there are states with event_id remaining"
            )
            has_more_to_purge |= _purge_legacy_format(
                instance, session, purge_before, using_sqlite
            )
        else:
            _LOGGER.debug(
                "Purge running in new format as there are NO states with event_id remaining"
            )
            # Once we are done purging legacy rows, we use the new method
            has_more_to_purge |= _purge_states_and_attributes_ids(
                instance, session, states_batch_size, purge_before, using_sqlite
            )
            has_more_to_purge |= _purge_events_and_data_ids(
                instance, session, events_batch_size, purge_before, using_sqlite
            )

        # Statistics are selected first and purged only when something was found;
        # a non-empty selection also counts as "not finished" below.
        statistics_runs = _select_statistics_runs_to_purge(session, purge_before)
        short_term_statistics = _select_short_term_statistics_to_purge(
            session, purge_before
        )
        if statistics_runs:
            _purge_statistics_runs(session, statistics_runs)

        if short_term_statistics:
            _purge_short_term_statistics(session, short_term_statistics)

        if has_more_to_purge or statistics_runs or short_term_statistics:
            # Return false, as we might not be done yet.
            _LOGGER.debug("Purging hasn't fully completed yet")
            return False

        # Only an explicit False from the helper means "not finished".
        if apply_filter and _purge_filtered_data(instance, session) is False:
            _LOGGER.debug("Cleanup filtered data hasn't fully completed yet")
            return False

        # Reached only when every step above reported completion.
        _purge_old_recorder_runs(instance, session, purge_before)
    if repack:
        repack_database(instance)
    return True
def clone_jhu_at_time(checkout_time: datetime.datetime, workdir: os.PathLike):
    """Clone the JHU COVID-19 repository and check out its state at a past time.

    The repository is cloned into ``<workdir>/jhu_repo``; the last commit on
    ``master`` before ``checkout_time`` is then checked out.

    Parameters
    ----------
    checkout_time : datetime.datetime
        A timezone-aware datetime object, representing a time point in the past
    workdir : os.PathLike
        a directory to which the data may be cloned.
        For example a tempfile.TemporaryDirectory

    Returns
    -------
    fp_confirmed : os.PathLike
        path to time_series_covid19_confirmed_global.csv
    fp_deaths : os.PathLike
        path to time_series_covid19_deaths_global.csv
    fp_recovered : os.PathLike
        path to time_series_covid19_recovered_global.csv

    Raises
    ------
    ValueError
        if ``checkout_time`` is not timezone-aware
    Exception
        if no 40-character commit id is found before ``checkout_time``
    """
    if not checkout_time.tzinfo:
        raise ValueError('The [checkout_time] must be timezone-aware!')

    # clone
    repodir = pathlib.Path(workdir, 'jhu_repo')
    repo = 'https://github.com/CSSEGISandData/COVID-19'
    _log.info(f'Cloning "{repo}" to "{repodir}"')
    clone_result = subprocess.run(
        ['git', 'clone', repo, 'jhu_repo'],
        cwd=workdir,
        stdout=subprocess.PIPE,
        encoding='utf8')
    _log.debug(clone_result.stdout)

    # find the commit hash that was relevant at the selected date
    until = checkout_time.isoformat()
    _log.info(f'Finding the last commit before {until}')
    rev_list = subprocess.run(
        ['git', 'rev-list', '-1', f'--until="{until}"', 'master'],
        cwd=repodir,
        stdout=subprocess.PIPE,
        encoding='utf8')
    commit_id = rev_list.stdout.strip()
    if len(commit_id) != 40:
        raise Exception(
            f'Failed to find a valid commit id before the specified checkout_time ({until})'
        )

    _log.info(f'Checking out commit {commit_id}')
    checkout_result = subprocess.run(
        ['git', 'checkout', commit_id],
        cwd=repodir,
        stdout=subprocess.PIPE,
        encoding='utf8',
        # this step is important - we must not fail silently
        check=True)
    _log.debug(checkout_result)

    series_dir = pathlib.Path(repodir, 'csse_covid_19_data', 'csse_covid_19_time_series')
    fp_confirmed, fp_deaths, fp_recovered = (
        pathlib.Path(series_dir, f'time_series_covid19_{kind}_global.csv')
        for kind in ('confirmed', 'deaths', 'recovered')
    )
    return fp_confirmed, fp_deaths, fp_recovered
def get_spout_state(
        metrics_client: HeronMetricsClient, topology_id: str, cluster: str,
        environ: str, tracker_url: str, start: dt.datetime, end: dt.datetime,
        metrics_sample_period: float, summary_method: str = "median",
        **kwargs: Union[str, int, float]) -> Dict[int, Dict[str, float]]:
    """ Helper script that will fetch the median or mean spout emission rates
    and format them into the dictionary structure expected by the topology
    performance prediction methods.

    Arguments:
        metrics_client (HeronMetricsClient):    The client for the metrics
                                                database.
        topology_id (str):  The topology identification string.
        cluster (str):  The cluster that the topology is running on.
        environ (str):  The environment that the topology is running in.
        tracker_url (str):  The URL for the Heron Tracker API.
        start (datetime):   The UTC datetime for the start of the metrics
                            gathering period.
        end (datetime): The UTC datetime for the end of the metrics
                        gathering period.
        metrics_sample_period (float):  The period that metrics are sampled
                                        into. eg 60 secs (1 min), 300 secs
                                        (5 mins).
        summary_method (str):   The method to use to summarise the emit
                                counts. Either "mean" or "median". Defaults
                                to median.
        **kwargs:   Any additional keyword arguments required by the metrics
                    client.

    Returns:
        Dict[int, Dict[str, float]]:    A dictionary mapping from task ID to
        a dict that maps from output stream name to an emission rate in
        tuples per second.

    Raises:
        RuntimeError: If ``summary_method`` is neither "median" nor "mean".
    """

    # The trailing space after "over a" was missing ("over aperiod").
    LOG.info(
        "Getting spout emission state dictionary for topology %s over a "
        "period of %d seconds from %s to %s", topology_id,
        (end - start).total_seconds(), start.isoformat(), end.isoformat())

    lplan: Dict[str, Any] = tracker.get_logical_plan(tracker_url, cluster,
                                                     environ, topology_id)

    emit_counts: pd.DataFrame = metrics_client.get_emit_counts(
        topology_id, cluster, environ, start, end, **kwargs)

    # Keep only rows emitted by spout components, grouped per task and stream.
    spout_groups: pd.core.groupby.DataFrameGroupBy = \
        (emit_counts[emit_counts["component"].isin(lplan["spouts"])]
         .groupby(["task", "stream"]))

    if summary_method == "median":
        spout_emits: pd.Series = spout_groups.emit_count.median()
    elif summary_method == "mean":
        spout_emits = spout_groups.emit_count.mean()
    else:
        msg: str = f"Unknown summary method: {summary_method}"
        LOG.error(msg)
        raise RuntimeError(msg)

    output: DefaultDict[int, Dict[str, float]] = defaultdict(dict)

    # Series.iteritems() was removed in pandas 2.0; items() is equivalent.
    for (task_id, stream), emit_count in spout_emits.items():
        output[task_id][stream] = emit_count / metrics_sample_period

    return dict(output)
def calculate(
        graph_client: GremlinClient, metrics_client: HeronMetricsClient,
        topology_id: str, cluster: str, environ: str, topology_ref: str,
        start: dt.datetime, end: dt.datetime, io_bucket_length: int,
        tracker_url: str, spout_state: Dict[int, Dict[str, float]],
        **kwargs: Union[str, int, float]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """ Propagates the spout output rates through the topology levels to get
    the arrival rate at every instance and the tuple rates of the stream
    managers.

    Arguments:
        graph_client (GremlinClient):   The client instance for the graph
                                        database.
        metrics_client (HeronMetricsClient): The client instance for the
                                             metrics database.
        topology_id (str):  The topology identification string.
        cluster: (str): The cluster the topology is running on.
        environ (str): The environment the topology is running in.
        topology_ref (str): The reference string for the topology physical
                            graph to be used in the calculations.
        start (dt.datetime):    The UTC datetime instance representing the
                                start of the metric gathering window.
        end (dt.datetime):  The UTC datetime instance representing the end of
                            the metric gathering window.
        io_bucket_length (int): The length in seconds that metrics should be
                                aggregated for use in IO ratio calculations.
        tracker_url (str):  The URL for the Heron Tracker API
        spout_state (dict): A dictionary mapping from instance task id to a
                            dictionary that maps from output stream name to
                            the output rate for that spout instance. The units
                            of this rate (TPS, TPM etc) will be the same for
                            the arrival rates.
        **kwargs:   Any additional key word arguments required by the metrics
                    client query methods. NOTE: This is passed to a cached
                    method so all kwargs must be hashable. Un-hashable
                    arguments will be removed before being supplied.

    Returns:
        pd.DataFrame:   A DataFrame containing the arrival rate at each
        instance.
        pd.DataFrame:   A DataFrame containing the input and output rate of
        each stream manager.

    Raises:
        RuntimeError:   If there is no entry in the graph database for the
                        supplied topology id and ref.
    """
    # A physical graph for the supplied reference must exist in the graph
    # database before anything can be calculated
    graph_client.raise_if_missing(topology_id, topology_ref)

    window_seconds = (end - start).total_seconds()
    LOG.info(
        "Calculating arrival rates for topology %s reference %s using "
        "metrics from a %d second period from %s to %s",
        topology_id,
        topology_ref,
        window_seconds,
        start.isoformat(),
        end.isoformat(),
    )

    (i2i_rps, levels, coefficients,
     sending_instances, receiving_instances) = _setup_arrival_calcs(
        metrics_client, graph_client, topology_id, cluster, environ,
        topology_ref, start, end, io_bucket_length, tracker_url, **kwargs)

    topo_traversal: GraphTraversalSource = graph_client.topology_subgraph(
        topology_id, topology_ref)

    arrival_rates: ARRIVAL_RATES = defaultdict(lambda: defaultdict(float))
    output_rates: OUTPUT_RATES = defaultdict(dict)
    output_rates.update(spout_state)

    # Walk the tree level by level: work out what each level emits, then what
    # arrives at the next one. The final level is left out because its
    # arrivals come from the level before it and it has no outputs.
    all_but_last_level = levels[:-1]
    for level_number, level in enumerate(all_but_last_level):

        LOG.debug("Processing topology level %d", level_number)

        is_spout_level = level_number == 0
        if not is_spout_level:
            # Outputs of non-spout levels have to be calculated; spout
            # outputs were supplied through spout_state.
            for source_vertex in level:
                output_rates = _calculate_outputs(
                    topo_traversal,
                    source_vertex,
                    arrival_rates,
                    output_rates,
                    coefficients,
                )

        # Arrival rates at the instances down stream on the next level
        for source_vertex in level:
            arrival_rates = _calculate_arrivals(
                topo_traversal, source_vertex, arrival_rates, output_rates,
                i2i_rps)

    # Output and arrival amounts are now known for all logically connected
    # elements; map them onto the stream managers to get their incoming and
    # outgoing tuple rates.
    strmgr_in_out: pd.DataFrame = _calc_strmgr_in_out(
        sending_instances, receiving_instances, output_rates, arrival_rates)

    return _convert_arrs_to_df(arrival_rates), strmgr_in_out
def transform_date_from_state(dt: datetime.datetime) -> str:
    """Return ``dt`` in ISO format with a trailing ``+00:00`` written as ``Z``.

    Other offsets and naive datetimes are returned as ``isoformat()`` gives
    them.

    :param dt: the datetime to format
    :return: the formatted string
    """
    dt_str = dt.isoformat()
    # Raw string: "\+" in a normal literal is an invalid escape sequence.
    dt_str = re.sub(r"\+00:00$", "Z", dt_str)
    return dt_str
def _datetime_to_json(obj: datetime) -> JsonType:
    """Represent ``obj`` in JSON as its ISO-format string."""
    json_value = obj.isoformat()
    return json_value
def dt_to_str(dt: datetime):
    """Convert ``dt`` to its ISO-format string."""
    text = dt.isoformat()
    return text
def get_dt_jira_format(dt: datetime.datetime):
    """Format ``dt`` as ``YYYY-MM-DDTHH:MM:SS.mmm+0000``.

    Aware datetimes are converted to UTC first, so the ``+0000`` suffix is
    true for every input offset. Naive datetimes are formatted as they are
    (they are taken to be UTC already).

    Splitting the ISO text on ``'+'`` left negative offsets in place
    (``...-05:00+0000``) and labelled positive non-UTC wall times as UTC.

    :param dt: the datetime to format
    :return: the formatted string with millisecond precision
    """
    if dt.utcoffset() is not None:
        dt = dt.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    return dt.isoformat(timespec='milliseconds') + '+0000'
def serialize_datetime(when: dt.datetime) -> str:
    """Serialize ``when`` in ISO format with a space between date and time."""
    separator = " "
    return when.isoformat(separator)
def datetime_to_serializable(dt: datetime.datetime) -> str:
    """Turn ``dt`` into a serializable ISO-format string."""
    serializable = dt.isoformat()
    return serializable
def prettydate(d: datetime.datetime) -> str:
    """Jinja filter rendering a datetime object as pretty text.

    ``TIME_FORMAT`` is filled with the ISO timestamp (``ts``) and the time
    formatted with ``PRETTY_TIME_FORMAT`` (``t``).
    """
    machine_readable = d.isoformat()
    human_readable = d.strftime(PRETTY_TIME_FORMAT)
    return TIME_FORMAT.format(ts=machine_readable, t=human_readable)
def last_boot(self, value: datetime) -> None:
    """Store the last boot datetime in ISO format."""
    boot_time_text = value.isoformat()
    self._data[ATTR_LAST_BOOT] = boot_time_text
def to_str(self, time: datetime.datetime):
    """Return ``time`` as its ISO-format string."""
    as_string = time.isoformat()
    return as_string
def convert_datetime(value: datetime.datetime):
    """Convert ``value`` to a ``bytearray`` holding its encoded ISO-format text."""
    iso_text = value.isoformat()
    encoded = iso_text.encode()
    return bytearray(encoded)