def get_metadata():
    """
    Generate metadata for the decoded MotionSenseHRV stream.

    Returns:
        Metadata: stream metadata object
    """
    stream_name = 'fill in your stream name'
    stream_metadata = Metadata()
    stream_metadata.set_name(stream_name) \
        .set_description("Sequence Alignment, Timestamp Correction and Decoding of MotionsenseHRV") \
        .add_dataDescriptor(
            DataDescriptor().set_name("red").set_type("float")
            .set_attribute("description", "Value of Red LED - PPG")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("infrared").set_type("float")
            .set_attribute("description", "Value of Infrared LED - PPG")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("green").set_type("float")
            .set_attribute("description", "Value of Green LED - PPG")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("aclx").set_type("float")
            .set_attribute("description", "Wrist Accelerometer X-axis")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("acly").set_type("float")
            .set_attribute("description", "Wrist Accelerometer Y-axis")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("aclz").set_type("float")
            .set_attribute("description", "Wrist Accelerometer Z-axis")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("gyrox").set_type("float")
            .set_attribute("description", "Wrist Gyroscope X-axis")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("gyroy").set_type("float")
            .set_attribute("description", "Wrist Gyroscope Y-axis")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("gyroz").set_type("float")
            .set_attribute("description", "Wrist Gyroscope Z-axis")) \
        .add_module(
            ModuleMetadata().set_name("cerebralcortex.algorithms.raw_byte_decode.motionsenseHRV.py")
            .set_attribute("url", "https://md2k.org")
            .set_author("Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
def upload_stream_data(base_url: str, username: str, password: str, stream_name: str, data_file_path: str):
    """
    Upload stream data to CerebralCortex storage using CC-ApiServer.

    Args:
        base_url (str): base url of CerebralCortex-APIServer. For example, http://localhost/
        username (str): username
        password (str): password of the user
        stream_name (str): name of the stream to register and upload data to
        data_file_path (str): stream data file path that needs to be uploaded

    Raises:
        Exception: if stream data upload fails
    """
    login_url = base_url + "api/v3/user/login"
    register_stream_url = base_url + "api/v3/stream/register"

    user_metadata = {
        "username": username,
        "password": password,
        "user_role": "demo-user",
        "user_metadata": {
            "key": "demo-md",
            "value": "demo-vmd"
        },
        "user_settings": {
            "key": "string",
            "value": "string"
        }
    }

    metadata = Metadata().set_name(stream_name).set_description("mobile phone accelerometer sensor data.") \
        .add_dataDescriptor(
            DataDescriptor().set_name("accelerometer_x").set_type("float")
            .set_attribute("description", "acceleration minus gx on the x-axis")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("accelerometer_y").set_type("float")
            .set_attribute("description", "acceleration minus gy on the y-axis")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("accelerometer_z").set_type("float")
            .set_attribute("description", "acceleration minus gz on the z-axis")) \
        .add_module(
            ModuleMetadata().set_name("cerebralcortex.streaming_operation.main").set_version("2.0.7")
            .set_attribute("description", "data is collected using mcerebrum.")
            .set_author("test_user", "test_user@test_email.com"))
    stream_metadata = metadata.to_json()

    user_registration_url = base_url + "api/v3/user/register"
    client.register_user(url=user_registration_url, user_metadata=user_metadata)

    auth = client.login_user(login_url, username, password)
    status = client.register_stream(register_stream_url, auth.get("auth_token"), stream_metadata)

    stream_upload_url = base_url + "api/v3/stream/" + status.get("hash_id")
    result = client.upload_stream_data(stream_upload_url, auth.get("auth_token"), data_file_path)
    print(result)
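# A minimal usage sketch for upload_stream_data(), assuming a CC-ApiServer
# running on localhost. The credentials, stream name, and file path below are
# hypothetical placeholders, not values from this repository.
def _example_upload():
    upload_stream_data(base_url="http://localhost/",
                       username="demo",                            # hypothetical
                       password="demo",                            # hypothetical
                       stream_name="accelerometer--demo--phone",   # hypothetical
                       data_file_path="/path/to/stream_data.gz")   # hypothetical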
def create_windows(self, window_length='hour'):
    """
    Create a 'custom_window' column by bucketing each row's timestamp into
    windows of the given length.

    Args:
        window_length (str): length of the window (e.g., 'hour')

    Returns:
        DataStream: this will return a new datastream object with blank metadata
    """
    windowed_df = self._data.withColumn('custom_window', windowing_udf('timestamp'))
    return DataStream(data=windowed_df, metadata=Metadata())
def get_stream_metadata_by_name(self, stream_name: str, version: int) -> Metadata:
    """
    Get metadata of a specific version of a stream.

    Args:
        stream_name (str): name of a stream
        version (int): version of a stream (e.g., 2)

    Returns:
        Metadata: metadata of the requested stream version, or None if no metadata
        is available for the stream_name/version pair.

    Raises:
        ValueError: stream_name cannot be None or empty.

    Examples:
        >>> CC = CerebralCortex("/directory/path/of/configs/")
        >>> CC.get_stream_metadata_by_name("demo-stream-name", version=1)
        >>> Metadata # Metadata class object
    """
    if stream_name is None or stream_name == "":
        raise ValueError("stream_name cannot be None or empty.")

    rows = self.session.query(Stream.stream_metadata) \
        .filter((Stream.name == stream_name) & (Stream.version == version)
                & (Stream.study_name == self.study_name)).first()

    if rows:
        return Metadata().from_json_file(rows.stream_metadata)
    else:
        return None
def mapInPandas(self, func, schema):
    """
    Maps an iterator of batches in the current :class:`DataFrame` using a Python native
    function that takes and outputs a pandas DataFrame, and returns the result as a
    :class:`DataFrame`.

    The function should take an iterator of `pandas.DataFrame`\\s and return another
    iterator of `pandas.DataFrame`\\s. All columns are passed together as an iterator of
    `pandas.DataFrame`\\s to the function and the returned iterator of
    `pandas.DataFrame`\\s are combined as a :class:`DataFrame`. Each `pandas.DataFrame`
    size can be controlled by `spark.sql.execution.arrow.maxRecordsPerBatch`.

    Args:
        func: function
            a Python native function that takes an iterator of `pandas.DataFrame`, and
            outputs an iterator of `pandas.DataFrame`.
        schema: :class:`pyspark.sql.types.DataType` or str
            the return type of the `func` in PySpark. The value can be either a
            :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.

    Returns:
        DataStream: datastream object with the mapped data and blank metadata

    Examples:
        >>> def filter_func(iterator):
        ...     for pdf in iterator:
        ...         yield pdf[pdf.id == 1]
        >>> ds.mapInPandas(filter_func, ds.schema).show()
    """
    return DataStream(data=self._data.mapInPandas(func=func, schema=schema), metadata=Metadata())
def applyInPandas(self, func, schema):
    """
    The function should take a `pandas.DataFrame` and return another `pandas.DataFrame`.
    For each group, all columns are passed together as a `pandas.DataFrame` to the
    user-function and the returned `pandas.DataFrame` are combined as a `DataFrame`.

    The `schema` should be a `StructType` describing the schema of the returned
    `pandas.DataFrame`. The column labels of the returned `pandas.DataFrame` must either
    match the field names in the defined schema if specified as strings, or match the
    field data types by position if not strings, e.g. integer indices. The length of the
    returned `pandas.DataFrame` can be arbitrary.

    Args:
        func: a Python native function that takes a `pandas.DataFrame`, and outputs a
            `pandas.DataFrame`.
        schema: :class:`pyspark.sql.types.DataType` or str
            the return type of the `func` in PySpark. The value can be either a
            :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.

    Returns:
        DataStream: datastream object with the applied data and blank metadata
    """
    return DataStream(data=self._data.applyInPandas(func=func, schema=schema), metadata=Metadata())
def join_stress_streams(self, dataStream, propagation='forward'):
    """
    Full-join this stream with another stress-related stream on
    user/timestamp/localtime/version, propagate the last known 'data_quality'
    value within each user, and keep only rows with non-null ecg values.

    Args:
        dataStream (DataStream): datastream object to join with this stream
        propagation (str): direction in which 'data_quality' values are propagated (default: 'forward')

    Returns:
        DataStream: this will return a new datastream object with blank metadata
    """
    combined_df = self._data.join(dataStream.data,
                                  on=['user', 'timestamp', 'localtime', 'version'],
                                  how='full').orderBy('timestamp')
    combined_filled = combined_df.withColumn(
        "data_quality",
        F.last('data_quality', True).over(
            Window.partitionBy('user').orderBy('timestamp').rowsBetween(-sys.maxsize, 0)))
    combined_filled_filtered = combined_filled.filter(combined_filled.ecg.isNotNull())

    return DataStream(data=combined_filled_filtered, metadata=Metadata())
def compute(self, udfName, windowDuration: int = None, slideDuration: int = None,
            groupByColumnName: List[str] = [], startTime=None):
    """
    Run an algorithm. This method supports running a UDF on windowed data.

    Args:
        udfName: Name of the algorithm
        windowDuration (int): duration of a window in seconds
        slideDuration (int): slide duration of a window
        groupByColumnName (List[str]): groupby column names, for example, groupby user, col1, col2
        startTime (datetime): The startTime is the offset with respect to 1970-01-01 00:00:00 UTC
            with which to start window intervals. For example, in order to have hourly tumbling
            windows that start 15 minutes past the hour, e.g. 12:15-13:15, 13:15-14:15...,
            provide startTime as 15 minutes. First time of data will be used as startTime if
            none is provided.

    Returns:
        DataStream: this will return a new datastream object with blank metadata
    """
    if slideDuration:
        slideDuration = str(slideDuration) + " seconds"

    if 'custom_window' in self._data.columns:
        data = self._data.groupby('user', 'custom_window').apply(udfName)
    else:
        groupbycols = ["user", "version"]

        if windowDuration:
            windowDuration = str(windowDuration) + " seconds"
            win = F.window("timestamp", windowDuration=windowDuration,
                           slideDuration=slideDuration, startTime=startTime)
            groupbycols.append(win)

        if len(groupByColumnName) > 0:
            groupbycols.extend(groupByColumnName)

        data = self._data.groupBy(groupbycols).apply(udfName)

    return DataStream(data=data, metadata=Metadata())
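# A minimal sketch of a UDF that compute() can run on windowed data. The
# schema, UDF, and column names ('user', 'ecg') are illustrative assumptions,
# not part of this repository.
def _example_compute_usage(ds):
    example_schema = StructType([
        StructField("user", StringType()),
        StructField("mean_ecg", DoubleType())
    ])

    @pandas_udf(example_schema, PandasUDFType.GROUPED_MAP)
    def mean_ecg_udf(pdf):
        # compute() hands this function one pandas.DataFrame per group/window
        return pd.DataFrame([[pdf["user"].iloc[0], pdf["ecg"].mean()]],
                            columns=["user", "mean_ecg"])

    # hourly tumbling windows (3600 seconds), grouped by user and version
    return ds.compute(mean_ecg_udf, windowDuration=3600)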
def ema_logs(ds):
    """
    Convert the json 'log' column of an EMA log stream to multiple columns.

    Args:
        ds (DataStream): Windowed/grouped DataStream object

    Returns:
        DataStream: datastream object with the parsed EMA log columns
    """
    schema = StructType([
        StructField("timestamp", TimestampType()),
        StructField("localtime", TimestampType()),
        StructField("user", StringType()),
        StructField("version", IntegerType()),
        StructField("status", StringType()),
        StructField("ema_id", StringType()),
        StructField("schedule_timestamp", TimestampType()),
        StructField("operation", StringType())
    ])

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def parse_ema_logs(user_data):
        all_vals = []
        for index, row in user_data.iterrows():
            ema = row["log"]
            if not isinstance(ema, dict):
                ema = json.loads(ema)

            operation = ema["operation"].lower()
            if operation != "condition":
                status = ema.get("status", "")
                ema_id = ema["id"]
                schedule_timestamp = ema.get("logSchedule", {}).get("scheduleTimestamp")
                if schedule_timestamp:
                    schedule_timestamp = pd.to_datetime(schedule_timestamp, unit='ms')

                all_vals.append([row["timestamp"], row["localtime"], row["user"], 1,
                                 status, ema_id, schedule_timestamp, operation])

        return pd.DataFrame(all_vals, columns=['timestamp', 'localtime', 'user', 'version',
                                               'status', 'ema_id', 'schedule_timestamp',
                                               'operation'])

    # check if datastream object contains grouped type of DataFrame
    if not isinstance(ds._data, GroupedData):
        raise Exception(
            "DataStream object is not grouped data type. Please use 'window' operation on datastream object before running this algorithm")

    data = ds._data.apply(parse_ema_logs)
    return DataStream(data=data, metadata=Metadata())
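# A tiny illustration of the JSON payload shape that parse_ema_logs() expects
# in the 'log' column. Field values are made-up examples; 'scheduleTimestamp'
# is a unix timestamp in milliseconds, as implied by pd.to_datetime(..., unit='ms').
_example_ema_log = json.dumps({
    "operation": "DELIVER",
    "status": "COMPLETED",
    "id": "ema-1",
    "logSchedule": {"scheduleTimestamp": 1577836800000}
})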
def interpolate(ds, freq=16, method='linear', axis=0, limit=None, inplace=False,
                limit_direction='forward', limit_area=None, downcast=None):
    """
    Interpolate values according to different methods. This method internally uses pandas interpolation.

    Args:
        ds (DataStream): Windowed/grouped DataStream object
        freq (int): Frequency of the signal
        method (str): default 'linear'
            - 'linear': Ignore the index and treat the values as equally spaced. This is the only method supported on MultiIndexes.
            - 'time': Works on daily and higher resolution data to interpolate given length of interval.
            - 'index', 'values': use the actual numerical values of the index.
            - 'pad': Fill in NaNs using existing values.
            - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'spline', 'barycentric', 'polynomial': Passed to scipy.interpolate.interp1d. These methods use the numerical values of the index. Both 'polynomial' and 'spline' require that you also specify an order (int), e.g. df.interpolate(method='polynomial', order=5).
            - 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima': Wrappers around the SciPy interpolation methods of similar names. See Notes.
            - 'from_derivatives': Refers to scipy.interpolate.BPoly.from_derivatives which replaces 'piecewise_polynomial' interpolation method in scipy 0.18.
        axis ({0 or 'index', 1 or 'columns', None}): default None. Axis to interpolate along.
        limit (int): optional. Maximum number of consecutive NaNs to fill. Must be greater than 0.
        inplace (bool): default False. Update the data in place if possible.
        limit_direction ({'forward', 'backward', 'both'}): default 'forward'. If limit is specified, consecutive NaNs will be filled in this direction.
        limit_area ({None, 'inside', 'outside'}): default None. If limit is specified, consecutive NaNs will be filled with this restriction.
            - None: No fill restriction.
            - 'inside': Only fill NaNs surrounded by valid values (interpolate).
            - 'outside': Only fill NaNs outside valid values (extrapolate).
        downcast ('infer' or None): defaults to None

    Returns:
        DataStream: interpolated data
    """
    schema = ds._data.schema
    sample_freq = 1000 / freq

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def interpolate_data(pdf):
        pdf.set_index("timestamp", inplace=True)
        pdf = pdf.resample(str(sample_freq) + "ms").bfill(limit=1) \
            .interpolate(method=method, axis=axis, limit=limit, inplace=inplace,
                         limit_direction=limit_direction, limit_area=limit_area,
                         downcast=downcast)
        pdf.ffill(inplace=True)
        pdf.reset_index(drop=False, inplace=True)
        pdf.sort_index(axis=1, inplace=True)
        return pdf

    data = ds._data.groupby(["user", "version"]).apply(interpolate_data)
    return DataStream(data=data, metadata=Metadata())
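# A small, self-contained sketch of what interpolate_data() does to one user's
# partition, using a made-up 4 Hz series for brevity (the function above
# derives the resample period from `freq` instead).
def _interpolation_demo():
    pdf = pd.DataFrame({
        "timestamp": pd.to_datetime(["2020-01-01 00:00:00.000",
                                     "2020-01-01 00:00:00.750"]),
        "value": [1.0, 4.0]})
    pdf.set_index("timestamp", inplace=True)
    # resample onto a 250 ms grid (4 Hz) and linearly interpolate the gap:
    # the 'value' column becomes 1.0, 2.0, 3.0, 4.0
    return pdf.resample("250ms").mean().interpolate(method="linear")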
def get_metadata():
    """
    generate metadata for the stream

    Returns:
        Metadata: stream metadata object
    """
    stream_metadata = Metadata()
    stream_metadata.set_name(stream_name).set_description("ECG RR interval in milliseconds") \
        .add_input_stream(ecg_data.metadata.get_name()) \
        .add_dataDescriptor(
            DataDescriptor().set_name("rr").set_type("float")
            .set_attribute("description", "rr interval")) \
        .add_module(
            ModuleMetadata().set_name("ecg rr interval")
            .set_attribute("url", "http://md2k.org/")
            .set_attribute('algorithm', 'pan-tomkins').set_attribute('unit', 'ms')
            .set_author("Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
def get_metadata():
    """
    Generate metadata for the stress episode stream.

    Returns:
        Metadata: stream metadata object
    """
    stream_metadata = Metadata()
    stream_metadata.set_name(stream_name).set_description("Stress episodes computed using MACD formula.") \
        .add_input_stream(ecg_stress_probability.metadata.get_name()) \
        .add_dataDescriptor(DataDescriptor().set_name("timestamp").set_type("datetime")) \
        .add_dataDescriptor(DataDescriptor().set_name("localtime").set_type("datetime")) \
        .add_dataDescriptor(DataDescriptor().set_name("version").set_type("int")) \
        .add_dataDescriptor(DataDescriptor().set_name("user").set_type("string")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("stress_probability").set_type("float")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("stress_episode").set_type("string")
            .set_attribute("description", "stress episodes calculated using MACD")) \
        .add_module(
            ModuleMetadata().set_name("cerebralcortex.algorithm.stress_prediction.stress_episodes.compute_stress_episodes")
            .set_attribute("url", "http://md2k.org/")
            .set_author("Anandatirtha Nandugudi", "*****@*****.**"))
    return stream_metadata
def process_save_stream(msg: dict, cc_config_path: str):
    """
    Process one Kafka message, add gaussian noise to its data, and store the
    result as a new stream.

    Args:
        msg (dict): kafka message - {'filename': str, 'metadata_hash': str, "stream_name": str, "user_id": str}
        cc_config_path (str): path of cerebralcortex configs

    Notes:
        This method creates the CC object again because this code runs on a worker node,
        which has no access to the CC object created in run(). The CC object cannot be
        passed to worker nodes because it contains sockets, and sockets cannot be
        serialized in spark to pass as a parameter.
    """
    # Disable pandas warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)

    CC = Kernel(cc_config_path, enable_spark=False)
    cc_config = CC.config

    stream_name = msg.get("stream_name")
    user_id = msg.get("user_id")

    if cc_config["nosql_storage"] == "filesystem":
        file_name = str(cc_config["filesystem"]["filesystem_path"]) + msg.get("filename")
    elif cc_config["nosql_storage"] == "hdfs":
        file_name = str(cc_config["hdfs"]["raw_files_dir"]) + msg.get("filename")
    else:
        raise Exception(str(cc_config["nosql_storage"]) + " is not supported. Please use filesystem or hdfs.")

    if os.path.exists(file_name):
        data = pq.read_table(file_name)
        pdf = data.to_pandas()

        pdf = add_gaussian_noise(pdf)

        new_stream_name = stream_name + "_gaussian_noise"

        metadata = Metadata().set_name(new_stream_name).set_description("Gaussian noise added to the accel sensor stream.") \
            .add_dataDescriptor(
                DataDescriptor().set_attribute("description", "noisy accel x")) \
            .add_dataDescriptor(
                DataDescriptor().set_attribute("description", "noisy accel y")) \
            .add_dataDescriptor(
                DataDescriptor().set_attribute("description", "noisy accel z")) \
            .add_module(
                ModuleMetadata().set_name("cerebralcortex.streaming_operation.main").set_version("0.0.1")
                .set_attribute("description", "Spark streaming example using CerebralCortex. This example adds gaussian noise to a stream data.")
                .set_author("test_user", "test_user@test_email.com"))

        pdf["user"] = user_id
        ds = DataStream(data=pdf, metadata=metadata)
        CC.save_stream(ds)
    else:
        print(file_name, "does not exist.")
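# add_gaussian_noise() is called above but not defined in this snippet. A
# minimal sketch of what it could look like; the column names and noise scale
# are assumptions, not the repository's actual implementation.
def add_gaussian_noise(pdf, columns=("x", "y", "z"), sigma=0.01):
    """Add zero-mean gaussian noise to the given columns of a pandas DataFrame."""
    for col in columns:
        if col in pdf.columns:
            pdf[col] = pdf[col] + np.random.normal(0.0, sigma, pdf.shape[0])
    return pdf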
def count_encounters_per_cluster(ds, multiplier=10):
    """
    Compute per-cluster encounter statistics: number of unique users, total,
    average, and area-normalized encounter counts.

    Args:
        ds (DataStream): encounter datastream with cluster centroid columns
        multiplier (int): area units used to normalize total encounters

    Returns:
        DataStream: this will return a new datastream object with blank metadata
    """
    schema = StructType([
        StructField('timestamp', TimestampType()),
        StructField('localtime', TimestampType()),
        StructField('version', IntegerType()),
        StructField('latitude', DoubleType()),
        StructField('longitude', DoubleType()),
        StructField('n_users', IntegerType()),
        StructField('total_encounters', DoubleType()),
        StructField('avg_encounters', DoubleType()),
        StructField('normalized_total_encounters', DoubleType())
    ])

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def count_encounters(data):
        if data.shape[0] == 0:
            return pd.DataFrame([], columns=['normalized_total_encounters', 'version', 'latitude',
                                             'longitude', 'n_users', 'total_encounters',
                                             'avg_encounters', 'timestamp', 'localtime'])

        data = data.sort_values('localtime').reset_index(drop=True)
        centroid_latitude = data['centroid_latitude'].iloc[0]
        centroid_longitude = data['centroid_longitude'].iloc[0]
        unique_users = np.unique(list(data['user'].unique()) +
                                 list(data['participant_identifier'].unique()))

        data['count'] = 1
        total_encounters = data.groupby('user', as_index=False).sum()['count'].sum() + \
            data.groupby('participant_identifier', as_index=False).sum()['count'].sum()
        average_encounter = total_encounters / len(unique_users)
        total_encounters = data.shape[0]
        normalized_total_encounters = total_encounters * multiplier / data['centroid_area'].iloc[0]

        timestamp = data['timestamp'].iloc[data.shape[0] // 2]
        localtime = data['localtime'].iloc[data.shape[0] // 2]
        version = data['version'].iloc[0]

        return pd.DataFrame([[normalized_total_encounters, version, centroid_latitude,
                              centroid_longitude, len(unique_users), total_encounters,
                              average_encounter, timestamp, localtime]],
                            columns=['normalized_total_encounters', 'version', 'latitude',
                                     'longitude', 'n_users', 'total_encounters',
                                     'avg_encounters', 'timestamp', 'localtime'])

    data = ds._data.groupBy(['centroid_id', 'version']).apply(count_encounters)
    return DataStream(data=data, metadata=Metadata())
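# Usage sketch for count_encounters_per_cluster(). The input DataStream is
# assumed to carry one row per pairwise encounter, with the columns the UDF
# above reads: 'user', 'participant_identifier', 'centroid_id',
# 'centroid_latitude', 'centroid_longitude', 'centroid_area', 'timestamp',
# 'localtime', and 'version'.
# cluster_stats = count_encounters_per_cluster(encounter_ds, multiplier=10)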
def cogroup(self, other):
    """
    Cogroups this group with another group so that we can run cogrouped operations.

    Args:
        other (GroupedData): the other grouped data to cogroup with

    Returns:
        DataStream: cogrouped datastream object with blank metadata
    """
    return DataStream(data=self._data.cogroup(other=other), metadata=Metadata())
def ema_incentive(ds):
    """
    Parse stream name 'incentive--org.md2k.ema_scheduler--phone'. Convert the json
    'incentive' column to multiple columns.

    Args:
        ds: Windowed/grouped DataStream object

    Returns:
        ds: Windowed/grouped DataStream object.
    """
    schema = StructType([
        StructField("timestamp", TimestampType()),
        StructField("localtime", TimestampType()),
        StructField("user", StringType()),
        StructField("version", IntegerType()),
        StructField("incentive", FloatType()),
        StructField("total_incentive", FloatType()),
        StructField("ema_id", StringType()),
        StructField("data_quality", FloatType())
    ])

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def parse_ema_incentive(user_data):
        all_vals = []
        for index, row in user_data.iterrows():
            ema = row["incentive"]
            if not isinstance(ema, dict):
                ema = json.loads(ema)

            incentive = ema["incentive"]
            total_incentive = ema["totalIncentive"]
            ema_id = ema["emaId"]
            data_quality = ema["dataQuality"]

            all_vals.append([row["timestamp"], row["localtime"], row["user"], 1,
                             incentive, total_incentive, ema_id, data_quality])

        return pd.DataFrame(all_vals, columns=['timestamp', 'localtime', 'user', 'version',
                                               'incentive', 'total_incentive', 'ema_id',
                                               'data_quality'])

    # check if datastream object contains grouped type of DataFrame
    if not isinstance(ds._data, GroupedData):
        raise Exception(
            "DataStream object is not grouped data type. Please use 'window' operation on datastream object before running this algorithm")

    data = ds._data.apply(parse_ema_incentive)
    return DataStream(data=data, metadata=Metadata())
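# A tiny illustration of the JSON payload shape that parse_ema_incentive()
# expects in the 'incentive' column (field values are made-up examples):
_example_incentive = json.dumps({
    "incentive": 0.25,
    "totalIncentive": 10.25,
    "emaId": "ema-1",
    "dataQuality": 0.9
})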
def get_metadata():
    """
    Generate metadata for the stress likelihood stream.

    Returns:
        Metadata: stream metadata object
    """
    stream_metadata = Metadata()
    stream_metadata.set_name(stream_name).set_description("stress likelihood computed from ECG") \
        .add_input_stream(stress_features_normalized.metadata.get_name()) \
        .add_dataDescriptor(
            DataDescriptor().set_name("stress_probability").set_type("double")
            .set_attribute("description", "stress likelihood computed from ECG only model")
            .set_attribute("threshold", "0.47")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("window").set_type("struct")
            .set_attribute("description", "window start and end time in UTC")
            .set_attribute('start', 'start of 1 minute window')
            .set_attribute('end', 'end of 1 minute window')) \
        .add_module(
            ModuleMetadata().set_name("ECG Stress Model")
            .set_attribute("url", "http://md2k.org/")
            .set_attribute('algorithm', 'cStress')
            .set_attribute('unit', 'ms')
            .set_author("Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
def withColumn(self, colName, col):
    """
    Returns a new DataStream by adding a column or replacing the existing column that
    has the same name. The column expression must be an expression over this DataStream;
    attempting to add a column from some other datastream will raise an error.

    Args:
        colName (str): name of the new column.
        col: a Column expression for the new column.

    Returns:
        DataStream: datastream object with the added/replaced column and blank metadata

    Examples:
        >>> ds.withColumn('col_name', ds.col_name + 2)
    """
    data = self._data.withColumn(colName=colName, col=col)
    return DataStream(data=data, metadata=Metadata())
def limit(self, num):
    """
    Limits the result count to the number specified.

    Args:
        num (int): maximum number of rows to keep

    Returns:
        DataStream: datastream object with at most `num` rows
    """
    data = self._data.limit(num=num)
    return DataStream(data=data, metadata=Metadata())
def orderBy(self, *cols):
    """
    Order the datastream by the given column name(s).

    Args:
        *cols: column name(s) to sort by

    Returns:
        DataStream: sorted datastream object
    """
    data = self._data.orderBy(*cols)
    return DataStream(data=data, metadata=Metadata())
def generate_metadata_encounter_daily():
    """
    Generate metadata for the daily encounter stream.

    Returns:
        Metadata: stream metadata object
    """
    stream_metadata = Metadata()
    stream_metadata.set_name('mcontain-md2k-encounter-daily--bluetooth-gps') \
        .set_description('Contains each unique encounter between two persons along with the location of the encounter') \
        .add_dataDescriptor(
            DataDescriptor().set_name("start_time").set_type("timestamp")
            .set_attribute("description", "Start time of the encounter in localtime")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("end_time").set_type("timestamp")
            .set_attribute("description", "End time of the encounter in localtime")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("participant_identifier").set_type("string")
            .set_attribute("description", "Participant with whom the encounter happened")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("os").set_type("string")
            .set_attribute("description", "Operating system of the phone belonging to the user")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("latitude").set_type("double")
            .set_attribute("description", "Latitude of the encounter location")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("longitude").set_type("double")
            .set_attribute("description", "Longitude of the encounter location")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("durations").set_type("array")
            .set_attribute("description", "Mean distance between participants in encounter")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("covid").set_type("integer")
            .set_attribute("description", "0, 1 or 2 indicating whether this encounter involved a COVID-19 affected user (0 = neither, 1 = the user, 2 = the participant_identifier)"))
    stream_metadata.add_module(
        ModuleMetadata().set_name('Encounter computation after parsing raw bluetooth-gps data, clustering gps locations and removing double counting')
        .set_attribute("url", "https://mcontain.md2k.org")
        .set_author("Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
def generate_metadata_notif():
    """
    Generate metadata for the user notification stream.

    Returns:
        Metadata: stream metadata object
    """
    stream_metadata = Metadata()
    stream_metadata.set_name('mcontain-md2k--user-notifications') \
        .set_description('Notifications generated for users who encountered a Covid-19 affected participant.') \
        .add_dataDescriptor(
            DataDescriptor().set_name("user").set_type("string")
            .set_attribute("description", "user id")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("timestamp").set_type("timestamp")
            .set_attribute("description", "Unix timestamp when the message was generated")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("localtime").set_type("timestamp")
            .set_attribute("description", "Local timestamp when the message was generated.")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("message").set_type("string")
            .set_attribute("description", "Generated notification message")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("day").set_type("timestamp")
            .set_attribute("description", "day of the encounter")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("version").set_type("int")
            .set_attribute("description", "version"))
    stream_metadata.add_module(
        ModuleMetadata().set_name('Generated notification for a user encountered with Covid-19 participant')
        .set_attribute("url", "https://mcontain.md2k.org")
        .set_author("Md Shiplu Hawlader", "*****@*****.**").set_version(1))
    return stream_metadata
def generate_metadata_hourly():
    """
    Generate metadata for the time-windowed visualization stats stream.

    Returns:
        Metadata: stream metadata object
    """
    stream_metadata = Metadata()
    stream_metadata.set_name('mcontain-md2k--visualization-stats--time-window') \
        .set_description('Computes visualization stats for every time window defined by start time and end time') \
        .add_dataDescriptor(
            DataDescriptor().set_name("start_time").set_type("timestamp")
            .set_attribute("description", "Start time of the time window in localtime")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("end_time").set_type("timestamp")
            .set_attribute("description", "End time of the time window in localtime")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("latitude").set_type("double")
            .set_attribute("description", "Latitude of the centroid location, a gps cluster output grouping encounters in similar locations together")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("longitude").set_type("double")
            .set_attribute("description", "Longitude of the centroid location, a gps cluster output grouping encounters in similar locations together")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("n_users").set_type("integer")
            .set_attribute("description", "Number of unique users in that cluster centroid")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("total_encounters").set_type("double")
            .set_attribute("description", "Total encounters happening in the time window in this specific location")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("normalized_total_encounters").set_type("double")
            .set_attribute("description", "Total encounters normalized by the centroid area (encounters per 10 square meters)")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("avg_encounters").set_type("double")
            .set_attribute("description", "Average encounters per participant (participants who had at least one encounter)"))
    stream_metadata.add_module(
        ModuleMetadata().set_name('Visualization stats computation in a time window between start time and end time')
        .set_attribute("url", "https://mcontain.md2k.org")
        .set_author("Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
def generate_metadata_dailystats():
    """
    Generate metadata for the daily stats stream shown on the website.

    Returns:
        Metadata: stream metadata object
    """
    stream_metadata = Metadata()
    stream_metadata.set_name('mcontain-md2k--daily-stats').set_description('Daily stats for website') \
        .add_dataDescriptor(
            DataDescriptor().set_name("start_time").set_type("timestamp")
            .set_attribute("description", "Start time of the day in localtime")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("end_time").set_type("timestamp")
            .set_attribute("description", "End time of the day in localtime")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("number_of_app_users").set_type("double")
            .set_attribute("description", "Total number of app users")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("encounter_per_user").set_type("double")
            .set_attribute("description", "Average encounter per user")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("total_covid_encounters").set_type("double")
            .set_attribute("description", "Total covid encounters on the day")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("maximum_concurrent_encounters").set_type("double")
            .set_attribute("description", "Maximum concurrent encounters"))
    stream_metadata.add_module(
        ModuleMetadata().set_name('Daily encounter stats for all the users to be shown in website')
        .set_attribute("url", "https://mcontain.md2k.org")
        .set_author("Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
def combine_base_encounters(base_encounters, time_threshold=10 * 60):
    """
    Merge the per-window base encounters of each (user, participant_identifier)
    pair into a single encounter, discarding merged encounters shorter than
    `time_threshold` seconds.

    Args:
        base_encounters (DataFrame): spark DataFrame of base encounters
        time_threshold (int): minimum encounter duration in seconds (default: 10 minutes)

    Returns:
        DataStream: this will return a new datastream object with blank metadata
    """
    schema = StructType([
        StructField('timestamp', TimestampType()),
        StructField('localtime', TimestampType()),
        StructField('start_time', TimestampType()),
        StructField('end_time', TimestampType()),
        StructField('user', StringType()),
        StructField('version', IntegerType()),
        StructField('latitude', DoubleType()),
        StructField('distances', ArrayType(DoubleType())),
        StructField('longitude', DoubleType()),
        StructField('average_count', DoubleType()),
        StructField('participant_identifier', StringType()),
        StructField('os', StringType())
    ])
    columns = [a.name for a in schema.fields]

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def get_encounters(data):
        if data.shape[0] == 1:
            if (pd.Timestamp(data['end_time'].values[0]) -
                    pd.Timestamp(data['start_time'].values[0])).total_seconds() < time_threshold:
                return pd.DataFrame([], columns=columns)
            return data[columns]

        data = data.sort_values('start_time').reset_index(drop=True)
        ts = data['timestamp'].astype('datetime64[ns]').quantile(.5)
        local_ts = data['localtime'].astype('datetime64[ns]').quantile(.5)
        st = data['start_time'].min()
        et = data['end_time'].max()
        if (pd.Timestamp(et) - pd.Timestamp(st)).total_seconds() < time_threshold:
            return pd.DataFrame([], columns=columns)

        user = data['user'].values[0]
        version = 1
        latitude = data['latitude'].mean()
        longitude = data['longitude'].mean()
        distances = []
        for i, row in data.iterrows():
            distances.extend(list(row['distances']))
        average_count = data['average_count'].mean()
        os = data['os'].values[0]
        participant_identifier = data['participant_identifier'].values[0]

        return pd.DataFrame([[ts, local_ts, st, et, user, version, latitude, distances,
                              longitude, average_count, participant_identifier, os]],
                            columns=columns)

    data_result = base_encounters.groupBy(['user', 'participant_identifier']).apply(get_encounters)
    return DataStream(data=data_result, metadata=Metadata())
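# Usage sketch: merge the per-window base encounters of each
# (user, participant_identifier) pair and drop merged encounters shorter than
# 10 minutes. `base_encounters` is assumed to be a Spark DataFrame with the
# columns listed in `schema` above.
# combined = combine_base_encounters(base_encounters, time_threshold=10 * 60)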
def freqItems(self, cols, support=None):
    """
    Finding frequent items for columns, possibly with false positives, using the
    frequent element count algorithm proposed by Karp, Schenker, and Papadimitriou
    (http://dx.doi.org/10.1145/762471.762473).

    Args:
        cols: names of the columns to search frequent items in
        support (float): frequency threshold for an item to be considered frequent

    Returns:
        DataStream:

    Examples:
        >>> ds.freqItems("col-name")
    """
    data = self._data.freqItems(cols=cols, support=support)
    return DataStream(data=data, metadata=Metadata())
def distinct(self):
    """
    Returns a new DataStream containing the distinct rows in this DataStream.

    Returns:
        DataStream: this will return a new datastream object with blank metadata

    Examples:
        >>> ds.distinct().count()
    """
    data = self._data.distinct()
    return DataStream(data=data, metadata=Metadata())
def filter_user(self, user_ids: List):
    """
    Filter data to get only the selected users' data.

    Args:
        user_ids (List[str]): list of users' UUIDs

    Returns:
        DataStream: this will return a new datastream object with blank metadata
    """
    if not isinstance(user_ids, list):
        user_ids = [user_ids]
    data = self._data.where(self._data["user"].isin(user_ids))
    return DataStream(data=data, metadata=Metadata())
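# Usage sketch (the UUIDs are hypothetical placeholders):
# ds_subset = ds.filter_user(["00000000-0000-0000-0000-000000000001",
#                             "00000000-0000-0000-0000-000000000002"])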
def colRegex(self, colName):
    """
    Selects column(s) based on the column name specified as a regex and returns the
    result as a DataStream.

    Args:
        colName (str): column name specified as a regex.

    Returns:
        DataStream:

    Examples:
        >>> ds.colRegex("colName")
    """
    return DataStream(data=self._data.colRegex(colName=colName), metadata=Metadata())
def map_stream(self, window_ds):
    """
    Map/join a stream to a windowed stream.

    Args:
        window_ds (Datastream): windowed datastream object

    Returns:
        Datastream: joined/mapped stream
    """
    window_ds = window_ds.data.drop("version", "user")
    df = window_ds.join(self.data,
                        self.data.timestamp.between(F.col("window.start"), F.col("window.end")))
    return DataStream(data=df, metadata=Metadata())
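# Usage sketch: map_stream() expects `window_ds` to carry a struct column named
# 'window' with 'start' and 'end' fields (e.g., the output of
# pyspark.sql.functions.window), plus 'user' and 'version' columns that it
# drops before the join. The stream variables below are hypothetical.
# windowed_ds = some_ds.window(windowDuration=60)
# mapped_ds = raw_ds.map_stream(windowed_ds)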