def save_point(user, value, start_time, end_time, offset, metadata,
               stream_name_suffix):
    """
    Wrap one value in a single-point DataStream and save it through ``CC``.

    The stream is named ``metadata['name'] + stream_name_suffix``. Its
    identifier is a UUIDv3 (DNS namespace) of that name, the user and the
    string form of ``metadata``, so the same inputs always give the same id.

    :param user: stream owner; concatenated with the stream name, so it must
        be a string
    :param value: sample value, stored as the one-element list ``[value]``
    :param start_time: start time of the data point
    :param end_time: end time of the data point
    :param offset: offset handed to the DataPoint unchanged
    :param metadata: mapping with the keys ``name``, ``data_descriptor``,
        ``execution_context`` and ``annotations``
    :param stream_name_suffix: string appended to ``metadata['name']``
    """
    point = DataPoint(start_time=start_time,
                      end_time=end_time,
                      offset=offset,
                      sample=[value])

    stream_name = metadata['name'] + stream_name_suffix
    id_seed = str(stream_name + user + str(metadata))
    stream_id = str(uuid.uuid3(uuid.NAMESPACE_DNS, id_seed))

    stream = DataStream(identifier=stream_id,
                        owner=user,
                        name=stream_name,
                        data_descriptor=metadata['data_descriptor'],
                        execution_context=metadata['execution_context'],
                        annotations=metadata['annotations'],
                        stream_type=1,
                        data=[point])
    try:
        CC.save_stream(stream, localtime=True)
    except Exception as err:
        # Best effort: a failed save is reported and otherwise ignored.
        print(err)
def filter_bad_ecg(ecg: DataStream, ecg_quality: DataStream) -> DataStream:
    """
    Keep only the raw ECG datapoints that lie inside a quality window marked
    ``Quality.ACCEPTABLE``.

    A datapoint is kept when its start time is within
    ``[window.start_time, window.end_time]`` (both ends inclusive). The scan
    for each acceptable window starts at the last raw index matched by the
    previous acceptable window.

    :param ecg: raw ecg datastream
    :param ecg_quality: ecg quality datastream
    :return: filtered ecg datastream
    """
    ecg_filtered = DataStream.from_datastream([ecg])
    raw_points = ecg.data
    raw_timestamps = np.array(
        [point.start_time.timestamp() for point in raw_points])

    kept_points = []
    scan_from = 0
    for quality_window in ecg_quality.data:
        if not (quality_window.sample == Quality.ACCEPTABLE):
            continue
        last_match = scan_from
        for idx in range(scan_from, len(raw_points)):
            if (quality_window.start_time.timestamp()
                    <= raw_timestamps[idx]
                    <= quality_window.end_time.timestamp()):
                kept_points.append(raw_points[idx])
                last_match = idx
        # The next window resumes from the last point this one matched.
        scan_from = last_match

    ecg_filtered.data = kept_points
    return ecg_filtered
def store(data: OrderedDict, input_streams: dict, output_streams: dict,
          metadata, CC_obj: CerebralCortex, config: dict):
    """
    Store diagnostic results with their metadata in the data-store.

    Nothing is stored when ``data`` is empty.

    :param data: datapoints to store
    :param input_streams: indexable collection; the owner is read from
        ``input_streams[0]["owner_id"]``
    :param output_streams: mapping providing the output stream ``id`` and
        ``name``
    :param metadata: mapping with the keys ``dd`` (data descriptor), ``ec``
        (execution context) and ``anno`` (annotations)
    :param CC_obj: object whose ``save_datastream`` persists the stream
    :param config: not used by this function
    """
    if not data:
        return

    owner = input_streams[0]["owner_id"]
    stream = DataStream(identifier=output_streams["id"],
                        owner=owner,
                        name=output_streams["name"],
                        data_descriptor=metadata["dd"],
                        execution_context=metadata["ec"],
                        annotations=metadata["anno"],
                        stream_type="ds",
                        data=data)
    CC_obj.save_datastream(stream, "datastream")
def store(self, identifier, owner, name, data_descriptor, execution_context,
          annotations, stream_type=StreamTypes.DATASTREAM, data=None,
          localtime=True):
    """
    All store operations MUST be through this method.

    Builds a DataStream from the arguments and saves it with
    ``self.CC.save_stream``. When ``data`` is empty or None a MISSING_DATA
    entry is logged and nothing is saved. An exception raised while saving
    (or while logging the success message) is logged with its traceback and
    not re-raised.
    """
    source = self.__class__.__name__

    if not data:
        self.CC.logging.log(
            error_type=LogTypes.MISSING_DATA,
            error_message='Null data received for saving stream from ' + source)
        return

    stream = DataStream(identifier=identifier,
                        owner=owner,
                        name=name,
                        data_descriptor=data_descriptor,
                        execution_context=execution_context,
                        annotations=annotations,
                        stream_type=stream_type,
                        data=data)
    try:
        self.CC.save_stream(datastream=stream, localtime=localtime)
        saved_message = 'Saved %d data points stream id %s user_id %s from %s' % (
            len(data), str(identifier), str(owner), source)
        self.CC.logging.log(saved_message)
    except Exception as exp:
        self.CC.logging.log(
            source + str(exp) + "\n" + str(traceback.format_exc()))
def autosense_sequence_align(datastreams: List[DataStream],
                             sampling_frequency: float) -> DataStream:
    """
    Align several datastreams sample-by-sample into one datastream.

    Alignment starts one sample period (``1 / sampling_frequency`` seconds)
    before the latest first-sample time among the inputs. From each stream
    only the points after that instant are used, and the output is cut to
    the shortest of those per-stream runs. Output point ``i`` carries the
    start time of the first stream's ``i``-th point and a sample list with
    the ``i``-th sample of every stream, in input order.

    :param datastreams: datastreams to align
    :param sampling_frequency: sampling frequency used to derive one sample
        period
    :return: aligned datastream; its data is empty when there are no input
        streams or when any input stream has no data
    """
    result = DataStream.from_datastream(input_streams=datastreams)
    result.data = []

    # Nothing can be aligned without streams, or against an empty stream.
    if not datastreams or any(not ds.data for ds in datastreams):
        return result

    # Start from the stream that begins last, minus one sample period so its
    # first point passes the strict ">" filter below.
    start_time = max(ds.data[0].start_time for ds in datastreams)
    start_time -= datetime.timedelta(seconds=1.0 / sampling_frequency)

    data_block = [[dp for dp in ds.data if dp.start_time > start_time]
                  for ds in datastreams]
    # Plain lists are indexed directly: the per-stream runs can differ in
    # length, which a rectangular array cannot represent.
    max_index = min(len(block) for block in data_block)

    for i in range(max_index):
        sample = [block[i].sample for block in data_block]
        result.data.append(
            DataPoint.from_tuple(data_block[0][i].start_time, sample))

    return result
def filter_bad_rip(rip: DataStream, rip_quality: DataStream) -> DataStream:
    """
    Keep only the raw respiration DataPoints that lie inside a quality
    window marked ``Quality.ACCEPTABLE``.

    A DataPoint is kept when its start time is within the window's
    ``[start_time, end_time]`` range (both ends inclusive). Each acceptable
    window is scanned starting from the last raw index the previous
    acceptable window matched.

    :param rip: raw respiration datastream
    :param rip_quality: respiration quality datastream
    :return: filtered respiration datastream
    """
    rip_filtered = DataStream.from_datastream([rip])
    quality_windows = rip_quality.data
    sample_times = np.array([dp.start_time.timestamp() for dp in rip.data])

    accepted = []
    cursor = 0
    for window_dp in quality_windows:
        if window_dp.sample == Quality.ACCEPTABLE:
            resume_at = cursor
            position = cursor
            while position < len(rip.data):
                in_window = (window_dp.start_time.timestamp()
                             <= sample_times[position]
                             <= window_dp.end_time.timestamp())
                if in_window:
                    accepted.append(rip.data[position])
                    resume_at = position
                position += 1
            cursor = resume_at

    rip_filtered.data = accepted
    return rip_filtered
def interpolate(ds, freq=16, method='linear', axis=0, limit=None, inplace=False, limit_direction='forward',
                limit_area=None, downcast=None):
    """
    Interpolate values according to different methods. This method internally uses pandas interpolation.

    Each (user, version) group is indexed by ``timestamp``, resampled to a
    ``1000 / freq`` millisecond grid with a one-step back-fill, interpolated
    with ``DataFrame.interpolate``, and finally forward-filled.

    Args:
        ds (DataStream): DataStream whose ``_data`` provides ``schema`` and ``groupby``
        freq (int): Frequency of the signal
        method (str): default 'linear'
            - 'linear': Ignore the index and treat the values as equally spaced. This is the only method supported on MultiIndexes.
            - 'time': Works on daily and higher resolution data to interpolate given length of interval.
            - 'index', 'values': use the actual numerical values of the index.
            - 'pad': Fill in NaNs using existing values.
            - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'spline', 'barycentric', 'polynomial': Passed to scipy.interpolate.interp1d. These methods use the numerical values of the index.
            - 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima': Wrappers around the SciPy interpolation methods of similar names.
            - 'from_derivatives': Refers to scipy.interpolate.BPoly.from_derivatives.
        axis {0 or 'index', 1 or 'columns', None}: default 0. Axis to interpolate along.
        limit (int): optional. Maximum number of consecutive NaNs to fill. Must be greater than 0.
        inplace (bool): default False. Accepted for backward compatibility only and ignored: the
            resampled frame is a new object whose interpolated result is what gets returned, and
            pandas returns None from an in-place interpolation.
        limit_direction {'forward', 'backward', 'both'}: default 'forward'. If limit is specified, consecutive NaNs will be filled in this direction.
        limit_area {None, 'inside', 'outside'}: default None. If limit is specified, consecutive NaNs will be filled with this restriction.
            - None: No fill restriction.
            - 'inside': Only fill NaNs surrounded by valid values (interpolate).
            - 'outside': Only fill NaNs outside valid values (extrapolate).
        downcast optional, 'infer' or None: defaults to None

    Returns:
        DataStream: interpolated data
    """
    schema = ds._data.schema
    sample_freq = 1000 / freq  # resampling period in milliseconds

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def interpolate_data(pdf):
        pdf.set_index("timestamp", inplace=True)
        resampled = pdf.resample(str(sample_freq) + "ms").bfill(limit=1)
        # inplace is forced to False: with inplace=True pandas returns None,
        # which would replace the frame and break the ffill below.
        pdf = resampled.interpolate(method=method, axis=axis, limit=limit, inplace=False,
                                    limit_direction=limit_direction, limit_area=limit_area,
                                    downcast=downcast)
        pdf.ffill(inplace=True)
        pdf.reset_index(drop=False, inplace=True)
        pdf.sort_index(axis=1, inplace=True)
        return pdf

    data = ds._data.groupby(["user", "version"]).apply(interpolate_data)
    return DataStream(data=data, metadata=Metadata())
def accelerometer_features(
        accel: DataStream,
        window_length: float = 10.0,
        activity_threshold: float = 0.21,
        percentile_low: int = 1,
        percentile_high: int = 99
) -> Tuple[DataStream, DataStream, DataStream]:
    """
    Derive magnitude, windowed deviation and activity streams from an
    accelerometer stream.

    References:
        Figure 3: http://www.cs.memphis.edu/~santosh/Papers/Timing-JIT-UbiComp-2014.pdf

    A window counts as active when its deviation sample is greater than
    ``low + activity_threshold * (high - low)``, where ``low`` and ``high``
    are the requested percentiles of the magnitude samples.

    :param accel: accelerometer datastream
    :param window_length: window length passed to ``window``
    :param activity_threshold: fraction of the percentile range added to the
        low percentile to form the activity cut-off
    :param percentile_low: lower percentile of the magnitude samples
    :param percentile_high: upper percentile of the magnitude samples
    :return: (magnitude stream, windowed deviation stream, activity stream)
    """
    accelerometer_magnitude = magnitude(normalize(accel))

    windows = window(accelerometer_magnitude.data, window_length)
    deviation_points = [
        window_std_dev(points, key[0]) for key, points in windows.items()
    ]

    deviation_stream = DataStream.from_datastream([accel])
    deviation_stream.data = deviation_points

    magnitudes = np.array([dp.sample for dp in accelerometer_magnitude.data])
    low_limit = np.percentile(magnitudes, percentile_low)
    high_limit = np.percentile(magnitudes, percentile_high)
    value_range = high_limit - low_limit

    activity_points = [
        DataPoint.from_tuple(
            dp.start_time,
            dp.sample > (low_limit + activity_threshold * value_range))
        for dp in deviation_points
    ]

    activity_stream = DataStream.from_datastream([accel])
    activity_stream.data = activity_points

    return accelerometer_magnitude, deviation_stream, activity_stream
def process_save_stream(msg: dict, cc_config_path: str):
    """
    Process one of kafka messages, add gaussian noise to data and store data as a new stream

    Args:
        msg (dict): kafka message - {'filename': str, 'metadata_hash': str, "stream_name": str, "user_id": str}
        cc_config_path (str): path of cerebralcortex configs

    Raises:
        Exception: if the configured ``nosql_storage`` is neither ``filesystem`` nor ``hdfs``

    Notes:
        This method creates CC object again. This code is running on worker node. Thus, it won't have access to CC object created in run()
        CC object cannot be passed to worker nodes because it contains sockets and sockets cannot be serialized in spark to pass as a parameter
    """
    # Disable pandas warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)

    CC = Kernel(cc_config_path, enable_spark=False)
    cc_config = CC.config
    stream_name = msg.get("stream_name")
    user_id = msg.get("user_id")

    # Pick the base directory that matches the configured storage backend.
    if cc_config["nosql_storage"] == "filesystem":
        base_dir = cc_config["filesystem"]["filesystem_path"]
    elif cc_config["nosql_storage"] == "hdfs":
        base_dir = cc_config["hdfs"]["raw_files_dir"]
    else:
        raise Exception(
            str(cc_config["nosql_storage"]) + " is not supported. Please use filesystem or hdfs.")
    file_name = str(base_dir) + msg.get("filename")

    if not os.path.exists(file_name):
        print(file_name, "does not exist.")
        return

    pdf = add_gaussian_noise(pq.read_table(file_name).to_pandas())

    new_stream_name = stream_name + "_gaussian_noise"
    metadata = Metadata().set_name(new_stream_name).set_description(
        "Gaussian noise added to the accel sensor stream.")
    for descriptor_text in ("noisy accel x", "noisy accel y", "noisy accel z"):
        metadata = metadata.add_dataDescriptor(
            DataDescriptor().set_attribute("description", descriptor_text))
    module_info = ModuleMetadata().set_name("cerebralcortex.streaming_operation.main").set_version(
        "0.0.1").set_attribute(
        "description",
        "Spark streaming example using CerebralCortex. This example adds gaussian noise to a stream data."
    ).set_author("test_user", "test_user@test_email.com")
    metadata = metadata.add_module(module_info)

    pdf["user"] = user_id
    CC.save_stream(DataStream(data=pdf, metadata=metadata))
def ecg_data_quality(datastream: DataStream,
                     window_size: float = 2.0,
                     acceptable_outlier_percent: float = .34,
                     outlier_threshold_high: float = .9769,
                     outlier_threshold_low: float = .004884,
                     ecg_threshold_band_loose: float = .01148,
                     ecg_threshold_slope: float = .02443,
                     buffer_length: int = 3) -> DataStream:
    """
    Label the ECG stream window by window and merge consecutive windows that
    received the same label into one annotated time range.

    :param datastream: Input ECG datastream
    :param window_size: Window size specifying the number of seconds the
        datastream is divided to check for data quality
    :param acceptable_outlier_percent: The acceptable outlier percentage in a
        window, default is 34 percent
    :param outlier_threshold_high: The percentage of ADC range above which
        any value is considered an outlier
    :param outlier_threshold_low: The percentage of ADC range below which any
        value is considered an outlier
    :param ecg_threshold_band_loose: The Band Loose Threshold for ECG signal
        expressed in the percentage of ADC range
    :param ecg_threshold_slope: The Slope threshold of ECG signal - no
        consecutive DataPoints can have this difference in values (expressed
        as percentage of ADC range)
    :param buffer_length: This specifies the memory of the data quality
        computation, meaning this number of past windows will also have a
        role to decide the quality of the current window
    :return: An annotated Datastream of ECG data quality specifying the time
        ranges when data quality was acceptable/non-acceptable
    """
    ecg_quality_stream = DataStream.from_datastream(input_streams=[datastream])
    windows = window(datastream.data, window_size=window_size)

    labelled_ranges = []
    ecg_range = []  # shared across windows; handed to compute_data_quality every call
    for points in windows.values():
        if len(points) == 0:
            continue

        label = compute_data_quality(points, ecg_range, True,
                                     ecg_threshold_band_loose,
                                     ecg_threshold_slope,
                                     acceptable_outlier_percent,
                                     outlier_threshold_high,
                                     outlier_threshold_low, buffer_length)
        window_end = points[-1].start_time

        if labelled_ranges and labelled_ranges[-1].sample == label:
            # Same label as the previous range: stretch that range instead of
            # opening a new one.
            labelled_ranges[-1] = DataPoint.from_tuple(
                labelled_ranges[-1].start_time, label, window_end)
        else:
            labelled_ranges.append(
                DataPoint.from_tuple(points[0].start_time, label, window_end))

    ecg_quality_stream.data = labelled_ranges
    return ecg_quality_stream
def compute_outlier_ecg(ecg_rr: DataStream) -> DataStream:
    """
    Annotate an RR-interval stream with acceptable/unacceptable time ranges.

    Reference - Berntson, Gary G., et al. "An approach to artifact
    identification: Application to heart period data." Psychophysiology 27.5
    (1990): 586-598.

    Only RR intervals strictly between 0.3 and 2 are considered. A beat
    difference criterion is derived from them (never below 0.2), the
    intervals are labelled by ``outlier_computation``, and consecutive
    equal labels are merged into a single range.

    :param ecg_rr: RR interval datastream
    :return: An annotated datastream specifying when the ECG RR interval
        datastream is acceptable. Its data is empty when the input has no
        data or when no RR interval lies in the valid range.
    """
    ecg_rr_outlier_stream = DataStream.from_datastream(input_streams=[ecg_rr])
    if not ecg_rr.data:
        ecg_rr_outlier_stream.data = []
        return ecg_rr_outlier_stream

    valid_points = [dp for dp in ecg_rr.data if .3 < dp.sample < 2]
    if not valid_points:
        # No usable interval: there is nothing to seed the quality array with.
        ecg_rr_outlier_stream.data = []
        return ecg_rr_outlier_stream

    valid_rr_interval_sample = [dp.sample for dp in valid_points]
    valid_rr_interval_time = [dp.start_time for dp in valid_points]
    valid_rr_interval_difference = abs(np.diff(valid_rr_interval_sample))

    # Quartile Deviation (QD) = half the inter-quartile range of the
    # successive RR differences.
    quartile_deviation = 0.5 * iqr(valid_rr_interval_difference)

    # Maximum Expected Difference (MED). The code multiplies QD by 4.5.
    # NOTE(review): the original comment quoted 3.32 * QD - confirm which
    # factor is intended.
    maximum_expected_difference = 4.5 * quartile_deviation

    # Shortest Expected Beat (SEB) = Median Beat - 2.9 * Quartile Deviation
    # Minimal Artifact Difference (MAD) = SEB / 3
    minimal_artifact_difference = (
        np.median(valid_rr_interval_sample) - 2.9 * quartile_deviation) / 3

    # The criterion is midway between MED and MAD, floored at 0.2
    criterion_beat_difference = (
        maximum_expected_difference + minimal_artifact_difference) / 2
    if criterion_beat_difference < .2:
        criterion_beat_difference = .2

    # Seed with the first valid beat marked acceptable.
    ecg_rr_quality_array = [
        DataPoint.from_tuple(valid_rr_interval_time[0], Quality.ACCEPTABLE,
                             valid_rr_interval_time[0])
    ]

    for data in outlier_computation(valid_rr_interval_time,
                                    valid_rr_interval_sample,
                                    criterion_beat_difference):
        if ecg_rr_quality_array[-1].sample == data.sample:
            # Same label as the running range: extend it up to this beat.
            ecg_rr_quality_array[-1] = DataPoint.from_tuple(
                ecg_rr_quality_array[-1].start_time, data.sample,
                data.start_time)
        else:
            ecg_rr_quality_array.append(data)

    ecg_rr_outlier_stream.data = ecg_rr_quality_array
    return ecg_rr_outlier_stream
def setUpClass(self):
    """
    Prepare the shared fixtures: a PhoneFeatures instance, nine DataPoints
    with samples 10 down to 2 placed a matching number of hours in the past,
    and empty phone and SMS datastreams that share one owner.
    """
    self.pp = PhoneFeatures()

    self.data = []
    for hours_back in range(10, 1, -1):
        now = datetime.datetime.now()
        first_time = now - datetime.timedelta(hours=hours_back - .1)
        second_time = now - datetime.timedelta(hours=hours_back - .9)
        self.data.append(DataPoint(first_time, second_time, hours_back))

    owner_uuid = uuid.uuid4()

    self.phoneDataStream = DataStream(identifier=uuid.uuid4(),
                                      owner=owner_uuid)
    self.phoneDataStream.data = []

    self.smsDataStream = DataStream(identifier=uuid.uuid4(),
                                    owner=owner_uuid)
    self.smsDataStream.data = []
def save(identifier, owner, name, data_descriptor, execution_context,
         annotations, stream_type, data):
    """
    Build a DataStream from the given fields and save it through ``CC``.

    On success the number of saved data points is printed. If saving (or
    printing the count) raises, the traceback is printed and the exception
    is not propagated.
    """
    stream_fields = dict(identifier=identifier,
                         owner=owner,
                         name=name,
                         data_descriptor=data_descriptor,
                         execution_context=execution_context,
                         annotations=annotations,
                         stream_type=stream_type,
                         data=data)
    stream = DataStream(**stream_fields)
    try:
        CC.save_stream(stream)
        print("Saved %d data points" % (len(data)))
    except Exception:
        print(traceback.format_exc())
def magnitude(datastream: DataStream) -> DataStream:
    """
    Replace every sample vector by its norm.

    :param datastream: datastream whose samples are equal-length vectors
    :return: datastream derived from the input with one norm per DataPoint,
        at the original start times; data is empty when the input has no data
    """
    result = DataStream.from_datastream(input_streams=[datastream])

    points = datastream.data
    if points is None or len(points) == 0:
        result.data = []
        return result

    sample_matrix = np.array([dp.sample for dp in points])
    # One norm per row, i.e. per DataPoint.
    norms = norm(sample_matrix, axis=1).tolist()

    result.data = [
        DataPoint.from_tuple(start_time=dp.start_time, sample=value)
        for dp, value in zip(points, norms)
    ]
    return result
def complementary_filter(ds, freq: int = 16, accelerometer_x: str = "accelerometer_x",
                         accelerometer_y: str = "accelerometer_y", accelerometer_z: str = "accelerometer_z",
                         gyroscope_x: str = "gyroscope_x", gyroscope_y: str = "gyroscope_y",
                         gyroscope_z: str = "gyroscope_z"):
    """
    Compute complementary filter on gyro and accel data.

    Adds the columns ``roll``, ``pitch`` and ``yaw``. Each combines the
    previous row's accelerometer angle (per user, ordered by timestamp) plus
    the gyroscope reading times the sample period, weighted 0.90, with the
    current accelerometer angle weighted 0.10. Rows containing nulls are
    dropped from the result.

    Args:
        ds (DataStream ): Non-Windowed/grouped dataframe
        freq (int): frequency of accel/gryo. Assumption is that frequency is equal for both gyro and accel.
        accelerometer_x (str): name of the column
        accelerometer_y (str): name of the column
        accelerometer_z (str): name of the column
        gyroscope_x (str): name of the column
        gyroscope_y (str): name of the column
        gyroscope_z (str): name of the column

    Returns:
        DataStream: input columns plus ``roll``, ``pitch`` and ``yaw``
    """
    dt = 1.0 / freq  # sample period in seconds
    hpf = 0.90
    lpf = 0.10

    per_user_by_time = Window.partitionBy(ds._data['user']).orderBy(ds._data['timestamp'])

    # (temporary angle column, output column, atan2 numerator column,
    #  atan2 denominator column, gyroscope column)
    axis_specs = (
        ("thetaX_accel", "roll", accelerometer_z, accelerometer_y, gyroscope_x),
        ("thetaY_accel", "pitch", accelerometer_x, accelerometer_z, gyroscope_y),
        ("thetaZ_accel", "yaw", accelerometer_y, accelerometer_x, gyroscope_z),
    )

    data = ds._data
    for theta_col, angle_col, numerator_col, denominator_col, gyro_col in axis_specs:
        accel_angle = (F.atan2(-F.col(numerator_col), F.col(denominator_col)) * 180 / math.pi) * lpf
        fused_angle = (F.lag(theta_col).over(per_user_by_time) + F.col(gyro_col) * dt) * hpf + F.col(theta_col)
        data = data.withColumn(theta_col, accel_angle) \
            .withColumn(angle_col, fused_angle) \
            .drop(theta_col)

    return DataStream(data=data.dropna(), metadata=Metadata())
def magnitude(ds, col_names=None):
    """
    Compute magnitude of columns

    Adds a ``magnitude`` column holding the square root of the sum of the
    squares of the given columns.

    Args:
        ds (DataStream): DataStream object whose ``_data`` supports ``withColumn``
        col_names (list[str]): column names

    Returns:
        DataStream

    Raises:
        Exception: if ``col_names`` is empty or not provided
    """
    if not col_names:
        raise Exception("col_names param cannot be empty list.")

    # Build the column expression directly rather than eval()-ing a string,
    # so column names containing quotes or other code cannot break or alter
    # the expression.
    sum_of_squares = None
    for col_name in col_names:
        squared = F.col(col_name) * F.col(col_name)
        sum_of_squares = squared if sum_of_squares is None else sum_of_squares + squared

    data = ds._data.withColumn("magnitude", F.sqrt(sum_of_squares))
    return DataStream(data=data, metadata=Metadata())
def json_to_datastream(json_obj, stream_type):
    """
    Build a DataStream from a decoded JSON object.

    ``json_obj["metadata"]`` supplies identifier, owner, name,
    data_descriptor, execution_context and annotations. The stream's start
    and end times are the ``starttime`` of the first and last entries of
    ``json_obj["data"]``, and every entry is converted with
    ``json_to_datapoints``.

    Raises KeyError for a missing key and IndexError when ``data`` is empty.
    """
    data = json_obj["data"]
    metadata = json_obj["metadata"]

    metadata_keys = ("identifier", "owner", "name", "data_descriptor",
                     "execution_context", "annotations")
    (identifier, owner, name, data_descriptor, execution_context,
     annotations) = (metadata[key] for key in metadata_keys)

    first_start = data[0]["starttime"]
    last_start = data[-1]["starttime"]
    datapoints = [json_to_datapoints(entry) for entry in data]

    return DataStream(identifier, owner, name, data_descriptor,
                      execution_context, annotations, stream_type,
                      first_start, last_start, datapoints)
def compute_FFT_features(ds, exclude_col_names: list = None,
                         feature_names=None):
    """
    Transforms data from time domain to frequency domain.

    For every group of the windowed DataStream and every remaining data
    column, the magnitude of the FFT (first half of the spectrum) is
    computed and the requested spectral features are derived from it.

    Args:
        ds (DataStream): Windowed/grouped DataStream object
        exclude_col_names list(str): name of the columns on which features should not be computed.
            The list passed in is not modified.
        feature_names list(str): names of the features. Supported features are fft_centroid, fft_spread,
            spectral_entropy, spectral_entropy_old, fft_flux, spectral_falloff (the legacy spelling
            spectral_folloff is also accepted). Defaults to fft_centroid, fft_spread, spectral_entropy,
            fft_flux, spectral_falloff.

    Returns:
        DataStream object with one row per group: timestamp, localtime, user, version, start_time,
        end_time and one ``<column>_<feature>`` column per data column and feature

    Raises:
        Exception: if ``ds`` is not grouped, or if a feature name is not supported
    """
    eps = 0.00000001
    rolloff_names = ("spectral_falloff", "spectral_folloff")
    supported_features = ("fft_centroid", "fft_spread", "spectral_entropy",
                          "spectral_entropy_old", "fft_flux") + rolloff_names

    # check if datastream object contains grouped type of DataFrame
    if not isinstance(ds._data, GroupedData):
        raise Exception(
            "DataStream object is not grouped data type. Please use 'window' operation on datastream object before running this algorithm")

    if feature_names is None:
        feature_names = ["fft_centroid", 'fft_spread', 'spectral_entropy', 'fft_flux', 'spectral_falloff']
    else:
        feature_names = list(feature_names)

    unsupported = [str(name) for name in feature_names if name not in supported_features]
    if unsupported:
        raise Exception("Unsupported feature name(s): " + ", ".join(unsupported))

    # Work on a copy so neither the caller's list nor a shared default grows
    # from one call to the next.
    excluded_columns = list(exclude_col_names or []) + ["timestamp", "localtime", "user", "version"]

    # Column names come from the DataFrame behind the grouped object, the
    # same way statistical_features reads them.
    data = ds._data._df.drop(*excluded_columns)

    df_column_names = data.columns

    basic_schema = StructType([
        StructField("timestamp", TimestampType()),
        StructField("localtime", TimestampType()),
        StructField("user", StringType()),
        StructField("version", IntegerType()),
        StructField("start_time", TimestampType()),
        StructField("end_time", TimestampType())
    ])

    features_list = []
    for cn in df_column_names:
        for sf in feature_names:
            features_list.append(StructField(cn + "_" + sf, FloatType(), True))

    features_schema = StructType(basic_schema.fields + features_list)

    def stSpectralCentroidAndSpread(X, fs):
        """Computes spectral centroid of frame (given abs(FFT))"""
        ind = (np.arange(1, len(X) + 1)) * (fs / (2.0 * len(X)))

        Xt = X.copy()
        Xt = Xt / Xt.max()
        NUM = np.sum(ind * Xt)
        DEN = np.sum(Xt) + eps

        # Centroid:
        C = (NUM / DEN)

        # Spread:
        S = np.sqrt(np.sum(((ind - C) ** 2) * Xt) / DEN)

        # Normalize:
        C = C / (fs / 2.0)
        S = S / (fs / 2.0)

        return (C, S)

    def stSpectralFlux(X, Xprev):
        """
        Computes the spectral flux feature of the current frame
        ARGUMENTS:
            X:        the abs(fft) of the current frame
            Xpre:        the abs(fft) of the previous frame
        """
        # compute the spectral flux as the sum of square distances:
        sumX = np.sum(X + eps)
        sumPrevX = np.sum(Xprev + eps)
        F = np.sum((X / sumX - Xprev / sumPrevX) ** 2)

        return F

    def stSpectralRollOff(X, c, fs):
        """Computes spectral roll-off"""
        totalEnergy = np.sum(X ** 2)
        fftLength = len(X)
        Thres = c * totalEnergy
        # Find the spectral rolloff as the frequency position where the respective spectral energy is equal to c*totalEnergy
        CumSum = np.cumsum(X ** 2) + eps
        [a, ] = np.nonzero(CumSum > Thres)
        if len(a) > 0:
            mC = np.float64(a[0]) / (float(fftLength))
        else:
            mC = 0.0
        return (mC)

    def stSpectralEntropy(X, numOfShortBlocks=10):
        """Computes the spectral entropy"""
        L = len(X)  # number of frame samples
        Eol = np.sum(X ** 2)  # total spectral energy

        subWinLength = int(np.floor(L / numOfShortBlocks))  # length of sub-frame
        if L != subWinLength * numOfShortBlocks:
            X = X[0:subWinLength * numOfShortBlocks]

        subWindows = X.reshape(subWinLength, numOfShortBlocks,
                               order='F').copy()  # define sub-frames (using matrix reshape)
        s = np.sum(subWindows ** 2, axis=0) / (Eol + eps)  # compute spectral sub-energies
        En = -np.sum(s * np.log2(s + eps))  # compute spectral entropy

        return En

    def spectral_entropy(data, sampling_freq, bands=None):
        psd = np.abs(np.fft.rfft(data)) ** 2
        psd /= np.sum(psd)  # psd as a pdf (normalised to one)

        if bands is None:
            power_per_band = psd[psd > 0]
        else:
            freqs = np.fft.rfftfreq(data.size, 1 / float(sampling_freq))
            bands = np.asarray(bands)

            freq_limits_low = np.concatenate([[0.0], bands])
            freq_limits_up = np.concatenate([bands, [np.inf]])

            # An array (not a list) is needed for the boolean mask below.
            power_per_band = np.asarray(
                [np.sum(psd[np.bitwise_and(freqs >= low, freqs < up)])
                 for low, up in zip(freq_limits_low, freq_limits_up)])

            power_per_band = power_per_band[power_per_band > 0]

        return -np.sum(power_per_band * np.log2(power_per_band))

    def fourier_features_pandas_udf(data, frequency: float = 16.0):
        Fs = frequency  # the sampling freq (in Hz)

        # fourier transform: keep the magnitudes of the first half of the spectrum
        X = abs(np.fft.fft(data))
        nFFT = int(len(X) / 2) + 1

        X = X[0:nFFT]  # normalize fft
        X = X / len(X)

        computed = {}
        if "fft_centroid" in feature_names or "fft_spread" in feature_names:
            # spectral centroid and spread come from one computation
            computed["fft_centroid"], computed["fft_spread"] = stSpectralCentroidAndSpread(X, Fs)
        if "spectral_entropy" in feature_names:
            computed["spectral_entropy"] = stSpectralEntropy(X)
        if "spectral_entropy_old" in feature_names:
            computed["spectral_entropy_old"] = spectral_entropy(X, frequency)
        if "fft_flux" in feature_names:
            # The frame is compared with a copy of itself, as before.
            # NOTE(review): this makes the flux 0 - confirm whether the
            # previous window's spectrum was intended.
            computed["fft_flux"] = stSpectralFlux(X, X.copy())
        for rolloff_name in rolloff_names:
            if rolloff_name in feature_names:
                computed[rolloff_name] = stSpectralRollOff(X, 0.90, frequency)

        # Emit values in feature_names order so they line up with the column
        # labels assigned in get_fft_features.
        return pd.Series([computed[name] for name in feature_names])

    @pandas_udf(features_schema, PandasUDFType.GROUPED_MAP)
    def get_fft_features(df):
        timestamp = df['timestamp'].iloc[0]
        localtime = df['localtime'].iloc[0]
        user = df['user'].iloc[0]
        version = df['version'].iloc[0]
        start_time = timestamp
        end_time = df['timestamp'].iloc[-1]

        df.drop(excluded_columns, axis=1, inplace=True)

        df_ff = df.apply(fourier_features_pandas_udf)
        df3 = df_ff.T
        df3.columns = feature_names

        # multiple rows to one row
        output = df3.unstack().to_frame().sort_index(level=1).T
        output.columns = [f'{j}_{i}' for i, j in output.columns]

        basic_df = pd.DataFrame([[timestamp, localtime, user, int(version), start_time, end_time]],
                                columns=['timestamp', 'localtime', 'user', 'version', 'start_time', 'end_time'])
        return basic_df.assign(**output)

    data = ds._data.apply(get_fft_features)
    return DataStream(data=data, metadata=Metadata())
def statistical_features(ds, exclude_col_names: list = None,
                         feature_names=None):
    """
    Compute statistical features.

    Args:
        ds (DataStream): Windowed/grouped DataStream object
        exclude_col_names list(str): name of the columns on which features should not be computed.
            The list passed in is not modified.
        feature_names list(str): names of the features. Supported features are ['mean', 'median', 'stddev',
            'variance', 'max', 'min', 'skew', 'kurt', 'sqr', 'zero_cross_rate']. Defaults to all of them
            except 'zero_cross_rate'.

    Returns:
        DataStream object with one row per group: timestamp, localtime, user, version, start_time,
        end_time and one ``<column>_<feature>`` column per data column and feature

    Raises:
        Exception: if ``ds`` is not grouped, or if a feature name is not supported
    """
    # check if datastream object contains grouped type of DataFrame
    # (done first: the ._df access below only exists on grouped data)
    if not isinstance(ds._data, GroupedData):
        raise Exception(
            "DataStream object is not grouped data type. Please use 'window' operation on datastream object before running this algorithm"
        )

    if feature_names is None:
        feature_names = ['mean', 'median', 'stddev', 'variance', 'max', 'min', 'skew', 'kurt', 'sqr']
    else:
        feature_names = list(feature_names)

    def calculate_zero_cross_rate(series):
        """
        How often the signal changes sign (+/-)
        """
        series_mean = np.mean(series)
        series = [v - series_mean for v in series]
        zero_cross_count = (np.diff(np.sign(series)) != 0).sum()
        return zero_cross_count / len(series)

    def get_sqr(series):
        sqr = np.mean([v * v for v in series])
        return sqr

    # Feature name -> function computing it for every column of a frame.
    # The order here is the order in which features are computed.
    feature_functions = (
        ("mean", lambda frame: frame.mean()),
        ("median", lambda frame: frame.median()),
        ("stddev", lambda frame: frame.std()),
        ("variance", lambda frame: frame.var()),
        ("max", lambda frame: frame.max()),
        ("min", lambda frame: frame.min()),
        ("skew", lambda frame: frame.skew()),
        ("kurt", lambda frame: frame.kurt()),
        ("sqr", lambda frame: frame.apply(get_sqr)),
        ("zero_cross_rate", lambda frame: frame.apply(calculate_zero_cross_rate)),
    )

    supported_features = [name for name, _ in feature_functions]
    unsupported = [str(name) for name in feature_names if name not in supported_features]
    if unsupported:
        raise Exception("Unsupported feature name(s): " + ", ".join(unsupported))

    # Work on a copy so neither the caller's list nor a shared default grows
    # from one call to the next.
    excluded_columns = list(exclude_col_names or []) + ["timestamp", "localtime", "user", "version"]

    data = ds._data._df.drop(*excluded_columns)

    df_column_names = data.columns

    basic_schema = StructType([
        StructField("timestamp", TimestampType()),
        StructField("localtime", TimestampType()),
        StructField("user", StringType()),
        StructField("version", IntegerType()),
        StructField("start_time", TimestampType()),
        StructField("end_time", TimestampType())
    ])

    features_list = []
    for cn in df_column_names:
        for sf in feature_names:
            features_list.append(StructField(cn + "_" + sf, FloatType(), True))

    features_schema = StructType(basic_schema.fields + features_list)

    @pandas_udf(features_schema, PandasUDFType.GROUPED_MAP)
    def get_stats_features_udf(df):
        results = []
        timestamp = df['timestamp'].iloc[0]
        localtime = df['localtime'].iloc[0]
        user = df['user'].iloc[0]
        version = df['version'].iloc[0]
        start_time = timestamp
        end_time = df['timestamp'].iloc[-1]

        df.drop(excluded_columns, axis=1, inplace=True)

        for feature, compute in feature_functions:
            if feature in feature_names:
                values = compute(df)
                # Turn the column labels into "<column>_<feature>".
                values.index += '_' + feature
                results.append(values)

        output = pd.DataFrame(pd.concat(results)).T

        basic_df = pd.DataFrame(
            [[timestamp, localtime, user, int(version), start_time, end_time]],
            columns=[
                'timestamp', 'localtime', 'user', 'version', 'start_time',
                'end_time'
            ])
        return basic_df.assign(**output)

    data = ds._data.apply(get_stats_features_udf)
    return DataStream(data=data, metadata=Metadata())
def compute_peak_valley(
        rip: DataStream,
        rip_quality: DataStream,
        fs: float = 21.33,
        smoothing_factor: int = 5,
        time_window: int = 8,
        expiration_amplitude_threshold_perc: float = 0.10,
        threshold_expiration_duration: float = 0.312,
        inspiration_amplitude_threshold_perc: float = 0.10,
        max_amplitude_change_peak_correction: float = 30,
        min_neg_slope_count_peak_correction: int = 4,
        minimum_peak_to_valley_time_diff=0.31) -> [DataStream, DataStream]:
    """
    Compute peak and valley from rip data and filter peak and valley.

    Steps, in order: drop datapoints outside acceptable quality windows,
    smooth, build the moving average curve, find up/down intercepts and
    filter intercept outliers, generate peaks and valleys, correct valley
    and then peak positions, and finally drop pairs that are too close in
    time, have too short an expiration, or have too small an inspiration or
    expiration amplitude.

    :param rip: raw respiration datastream
    :param rip_quality: respiration quality datastream
    :param fs: sampling frequency; with ``time_window`` it sets the moving
        average window length in samples
    :param smoothing_factor: span passed to ``smooth``
    :param time_window: moving average window in seconds
    :param expiration_amplitude_threshold_perc: threshold passed to
        ``filter_small_amp_expiration_peak_valley``
    :param threshold_expiration_duration: threshold passed to
        ``filter_expiration_duration_outlier``
    :param inspiration_amplitude_threshold_perc: threshold passed to
        ``filter_small_amp_inspiration_peak_valley``
    :param max_amplitude_change_peak_correction: passed to
        ``correct_peak_position``
    :param min_neg_slope_count_peak_correction: passed to
        ``correct_peak_position``
    :param minimum_peak_to_valley_time_diff: passed to
        ``remove_close_valley_peak_pair``
    :return peak_datastream, valley_datastream:
    """
    good_rip = filter_bad_rip(rip=rip, rip_quality=rip_quality)
    smoothed = smooth(data=good_rip.data, span=smoothing_factor)

    mac_window = int(round(time_window * fs))
    mac = moving_average_curve(smoothed, window_length=mac_window)

    # start time -> position in the smoothed signal
    index_by_start_time = {
        dp.start_time: position for position, dp in enumerate(smoothed)
    }

    ups, downs = up_down_intercepts(
        data=smoothed,
        mac=mac,
        data_start_time_to_index=index_by_start_time)

    ups_ok, downs_ok = filter_intercept_outlier(
        up_intercepts=ups, down_intercepts=downs)

    peaks, valleys = generate_peak_valley(
        up_intercepts=ups_ok, down_intercepts=downs_ok, data=smoothed)

    valleys_fixed = correct_valley_position(
        peaks=peaks,
        valleys=valleys,
        up_intercepts=ups_ok,
        data=smoothed,
        data_start_time_to_index=index_by_start_time)

    peaks_fixed = correct_peak_position(
        peaks=peaks,
        valleys=valleys_fixed,
        up_intercepts=ups_ok,
        data=smoothed,
        max_amplitude_change_peak_correction=max_amplitude_change_peak_correction,
        min_neg_slope_count_peak_correction=min_neg_slope_count_peak_correction,
        data_start_time_to_index=index_by_start_time)

    # remove too close valley peak pair.
    peaks_kept, valleys_kept = remove_close_valley_peak_pair(
        peaks=peaks_fixed,
        valleys=valleys_fixed,
        minimum_peak_to_valley_time_diff=minimum_peak_to_valley_time_diff)

    # Remove expiration durations below the threshold
    peaks_kept, valleys_kept = filter_expiration_duration_outlier(
        peaks=peaks_kept,
        valleys=valleys_kept,
        threshold_expiration_duration=threshold_expiration_duration)

    # filter out peak valley pair of inspiration of small amplitude.
    peaks_kept, valleys_kept = filter_small_amp_inspiration_peak_valley(
        peaks=peaks_kept,
        valleys=valleys_kept,
        inspiration_amplitude_threshold_perc=inspiration_amplitude_threshold_perc)

    # filter out peak valley pair of expiration of small amplitude.
    peaks_kept, valleys_kept = filter_small_amp_expiration_peak_valley(
        peaks=peaks_kept,
        valleys=valleys_kept,
        expiration_amplitude_threshold_perc=expiration_amplitude_threshold_perc)

    peak_datastream = DataStream.from_datastream([rip])
    peak_datastream.data = peaks_kept

    valley_datastream = DataStream.from_datastream([rip])
    valley_datastream.data = valleys_kept

    return peak_datastream, valley_datastream
def timestamp_correct(datastream: DataStream,
                      sampling_frequency: float,
                      min_available_gaps: int = 3600,  # TODO: Does this matter anymore?
                      min_split_gap: datetime.timedelta = datetime.timedelta(seconds=30),
                      max_data_points_per_segment: int = 100000000) -> DataStream:
    """
    Re-time the datapoints of a stream onto a regular sampling grid.

    The stream is cut into segments wherever two consecutive start times are
    more than ``min_split_gap`` apart (and whenever a segment reaches
    ``max_data_points_per_segment`` points). Each segment goes through
    ``interpolate_gaps`` and is then aligned with ``fastdtw`` against an
    evenly spaced time axis running from its first to its last start time
    in steps of ``1 / sampling_frequency``.

    :param datastream: input datastream
    :param sampling_frequency: frequency of the regular grid
    :param min_available_gaps: not used by this function
    :param min_split_gap: gap between consecutive datapoints that starts a
        new segment
    :param max_data_points_per_segment: upper bound on the points collected
        before a segment is closed
    :return: datastream derived from the input, holding the re-timed points
    """
    result = DataStream.from_datastream([datastream])
    result.data = []

    data = datastream.data
    if len(data) == 0:
        return result

    time_deltas = np.diff([dp.start_time for dp in data])

    # Segment boundaries: the first point, every point that is followed by a
    # gap larger than min_split_gap, and the last point.
    gap_points = [data[0]]
    gap_points.extend(data[index]
                      for index, delta in enumerate(time_deltas)
                      if delta > min_split_gap)
    gap_points.append(data[-1])

    segments = []

    def close_segment(points):
        # An empty segment has no first/last timestamp to align against,
        # so it is never queued.
        if points:
            segments.append(interpolate_gaps(points, sampling_frequency))

    segment_data = []
    gap_index = 0
    low_time = gap_points[gap_index].start_time
    high_time = gap_points[gap_index + 1].start_time
    for dp in data:
        if len(segment_data) >= max_data_points_per_segment:
            close_segment(segment_data)
            segment_data = []

        if low_time <= dp.start_time <= high_time:
            segment_data.append(dp)
        else:
            close_segment(segment_data)
            gap_index += 1
            low_time = gap_points[gap_index].start_time
            high_time = gap_points[gap_index + 1].start_time
            # The point that crossed the boundary opens the next segment;
            # starting from an empty list would silently drop it.
            segment_data = [dp]
    close_segment(segment_data)

    step = 1.0 / sampling_frequency
    for segment in segments:
        begin_time = segment[0].start_time.timestamp()
        end_time = segment[-1].start_time.timestamp()

        # x: ideal evenly spaced timestamps, y: observed timestamps.
        x = np.array([t for t in frange(begin_time, end_time, step)],
                     dtype='float')
        y = np.array([dp.start_time.timestamp() for dp in segment],
                     dtype='float')

        _, path = fastdtw(x, y, radius=1)

        # For every observed index keep the last ideal timestamp the warping
        # path pairs it with.
        corrected = [0] * len(y)
        for ideal_index, observed_index in path:
            corrected[observed_index] = x[ideal_index]

        result.data.extend(
            DataPoint.from_tuple(
                datetime.datetime.fromtimestamp(corrected[index],
                                                tz=dp.start_time.tzinfo),
                dp.sample)
            for index, dp in enumerate(segment))

    return result
def process_feature(file_path, metadata_path):
    """
    Read a qualtrics CSV file and save one single-valued stream per user.

    Each data row yields one DataPoint whose start/end times come from
    columns 3 and 4 and whose sample is the value in column 6. Timestamps
    are localized by the first digit of a four character user id
    (5 -> central, 1 -> eastern, 9 -> pacific, anything else -> central).
    Rows whose id is not in ``user_id_mappings`` are skipped.

    Values containing 'yes', 'no' or 'NA', and blank values, are stored as
    NaN.

    :param file_path: path of the CSV file; also part of the stream UUID
    :param metadata_path: path of the JSON file describing the output stream
    :return: None
    """
    f = open_data_file(file_path)
    if f is None:
        # Nothing was opened besides f, so there is nothing to release.
        return

    time_format = '%m/%d/%Y %H:%M'
    feature_data = {}

    try:
        with open(metadata_path) as mf:
            reader = csv.reader(f)
            next(reader, None)  # the first row is the header

            for row in reader:
                # handling corrupt data, some user id's are NA
                if row[0] not in user_id_mappings:
                    continue
                user_id = user_id_mappings[row[0]]

                qualtrics_start_time = datetime.strptime(row[3], time_format)
                qualtrics_end_time = datetime.strptime(row[4], time_format)

                region = int(user_id[0]) if len(user_id) == 4 else None
                if region == 1:  # all 1xxx users are east
                    user_tz = easterntz
                elif region == 9:  # all 9xxx users are west
                    user_tz = pacifictz
                else:  # 5xxx users and every other id are central
                    user_tz = centraltz
                qualtrics_start_time = user_tz.localize(qualtrics_start_time)
                qualtrics_end_time = user_tz.localize(qualtrics_end_time)

                # The original note says DataPoint expects the offset in
                # milliseconds.
                utc_offset = \
                    qualtrics_start_time.utcoffset().total_seconds() * 1000

                val = row[6]
                if ('yes' in val or 'no' in val or 'NA' in val
                        or not val.strip()):
                    value = float('nan')
                else:
                    value = float(val)

                # Store the parsed value; previously the sample was always
                # None because an unrelated variable was passed.
                q_dp = DataPoint(start_time=qualtrics_start_time,
                                 end_time=qualtrics_end_time,
                                 offset=utc_offset,
                                 sample=[value])
                feature_data.setdefault(user_id, []).append(q_dp)

            metadata = json.loads(mf.read())
            metadata_name = metadata['name']

            for user, q_dps in feature_data.items():
                output_stream_id = str(
                    uuid.uuid3(uuid.NAMESPACE_DNS,
                               str(metadata_name + user + file_path)))
                q_ds = DataStream(
                    identifier=output_stream_id,
                    owner=user,
                    name=metadata_name,
                    data_descriptor=metadata['data_descriptor'],
                    execution_context=metadata['execution_context'],
                    annotations=metadata['annotations'],
                    stream_type=1,
                    data=q_dps)
                # Best effort: a failed save must not stop the other users.
                try:
                    CC.save_stream(q_ds, localtime=True)
                except Exception as e:
                    print(e)
    finally:
        f.close()
def analyze_user(self, userid, alldays, config_path):
    """
    Summarise, per day and per phone stream, how much data a user has.

    For every day in ``alldays`` and every stream in ``phone_input_streams``
    a single DataPoint covering that day is built. Its sample is
    ``[number of datapoints, corrupt datapoints]`` (``[0, []]`` when the day
    has no data). The point is saved as the stream
    ``<phone_stream>_corrupt_data``.

    :param userid: id of the user being analysed
    :param alldays: iterable of day strings in ``%Y%m%d`` format
    :param config_path: configuration path handed to CerebralCortex
    """
    print(userid, alldays)
    self.CC = CerebralCortex(config_path)
    self.window_size = 3600

    # NOTE: this text is hashed into the stream UUID below, so it must not
    # be reformatted.
    metadata = """ { "annotations":[], "data_descriptor":[ { "name":"total_datapoints", "type":"int", "description":"Total number of data points that are present in the input stream followed by an array of the corrupt datapoints", "stream_type": "sparse" } ], "execution_context":{ "processing_module":{ "name":"core.admission_control_marker.phone_stream_analyzer", "input_streams":[ { "name":"name", "identifier" : "id" } ] }, "algorithm":{ "method":"core.admission_control_marker", "authors":[ { "name":"Anand", "email":"*****@*****.**" } ], "version":"0.0.4", "description":"Analyzer for the phone input streams" } }, "name":"NAME_dynamically_generated" } """

    date_format = '%Y%m%d'

    for day in alldays:
        for phone_stream in phone_input_streams:
            current_date = datetime.strptime(day, date_format)
            day_data = self.get_day_data(userid, day, phone_stream)

            if len(day_data):
                corrupt_data = self.get_corrupt_data(
                    day_data, phone_input_streams[phone_stream])
                utc_offset = \
                    day_data[0].start_time.utcoffset().total_seconds() * 1000
                summary = [len(day_data), corrupt_data]
            else:
                utc_offset = 0
                summary = [0, []]

            data_quality_analysis = [
                DataPoint(start_time=current_date,
                          end_time=current_date + timedelta(days=1),
                          offset=utc_offset,
                          sample=summary)
            ]

            metadata_json = json.loads(metadata)
            metadata_name = phone_stream + '_corrupt_data'
            output_stream_id = str(
                uuid.uuid3(uuid.NAMESPACE_DNS,
                           str(metadata_name + userid + str(metadata))))

            # Record every input stream id that carries this stream name.
            input_streams = [
                {'name': phone_stream, 'identifier': found['identifier']}
                for found in self.CC.get_stream_id(userid, phone_stream)
            ]
            processing_module = \
                metadata_json["execution_context"]["processing_module"]
            processing_module["input_streams"] = input_streams

            quality_ds = DataStream(
                identifier=output_stream_id,
                owner=userid,
                name=metadata_name,
                data_descriptor=metadata_json['data_descriptor'],
                execution_context=metadata_json['execution_context'],
                annotations=metadata_json['annotations'],
                stream_type=1,
                data=data_quality_analysis)
            try:
                self.CC.save_stream(quality_ds)
            except Exception as e:
                print(e)
def process_feature(file_path, metadata_path):
    """
    Read a feature CSV file and save one multi-valued stream per user.

    Each data row yields one DataPoint: the start time comes from column 1,
    the end time from column 2, and the sample is every value from column 5
    onwards. Values containing 'yes' or 'no' are skipped and values
    containing 'NA' are stored as NaN. Rows whose id is not in
    ``user_id_mappings`` are skipped. All times are localized to central
    time.

    Files whose path contains 'IGTB' have no end time column; the end time
    is rebuilt from the start time fields instead.

    :param file_path: path of the CSV file; also part of the stream UUID
    :param metadata_path: path of the JSON file describing the output stream
    :return: None
    """
    f = open_data_file(file_path)
    if f is None:
        # Return before opening the metadata file so no handle is leaked.
        return

    time_format = '%m/%d/%Y %H:%M'
    is_igtb = 'IGTB' in file_path
    feature_data = {}

    try:
        with open(metadata_path) as mf:
            reader = csv.reader(f)
            next(reader, None)  # the first row is the header

            for row in reader:
                # handling corrupt data, some user id's are NA
                if row[0] not in user_id_mappings:
                    continue
                user_id = user_id_mappings[row[0]]

                start_time = centraltz.localize(
                    datetime.strptime(row[1], time_format))

                # handling the different format of the IGTB file
                if not is_igtb:
                    end_time = centraltz.localize(
                        datetime.strptime(row[2], time_format))
                else:
                    # NOTE(review): this end_time carries no tzinfo while
                    # start_time does - confirm that this is intended.
                    end_time = datetime(year=start_time.year,
                                        month=start_time.month,
                                        day=start_time.day,
                                        hour=start_time.hour,
                                        minute=start_time.minute)

                # The original note says DataPoint expects the offset in
                # milliseconds.
                utc_offset = start_time.utcoffset().total_seconds() * 1000

                values = []
                for val in row[5:]:
                    # Check for Daily.tob.d.mitre.csv
                    if 'yes' in val or 'no' in val:
                        continue
                    values.append(
                        float('nan') if 'NA' in val else float(val))

                dp = DataPoint(start_time=start_time,
                               end_time=end_time,
                               offset=utc_offset,
                               sample=values)
                feature_data.setdefault(user_id, []).append(dp)

            metadata = json.loads(mf.read())
            metadata_name = metadata['name']

            for user, user_points in feature_data.items():
                output_stream_id = str(
                    uuid.uuid3(uuid.NAMESPACE_DNS,
                               str(metadata_name + user + file_path)))
                ds = DataStream(
                    identifier=output_stream_id,
                    owner=user,
                    name=metadata_name,
                    data_descriptor=metadata['data_descriptor'],
                    execution_context=metadata['execution_context'],
                    annotations=metadata['annotations'],
                    stream_type=1,
                    data=user_points)
                # Best effort: a failed save must not stop the other users.
                try:
                    CC.save_stream(ds, localtime=True)
                except Exception as e:
                    print(e)
    finally:
        f.close()
def detect_rpeak(ecg: DataStream,
                 fs: float = 64,
                 threshold: float = 0.5,
                 blackman_win_len_range: float = 0.2) -> DataStream:
    """
    Detect R peaks in an ECG signal with the Pan Tomkins algorithm.

    Since the ecg array can have discontinuities in its timestamps, the
    rr-interval used inside the algorithm is expressed in sample indices.

    The algorithm consists of these major steps:

    1. compute the moving window integration of the signal using a blackman
       window of a prescribed length
    2. compute all the peaks of the moving window integration signal
    3. apply adaptive thresholding with dynamic signal and noise thresholds
       to filter out the R peak locations
    4. confirm the R peaks through differentiation from the nearby peaks and
       remove the false peaks

    An empty result is returned when the input is empty or when fewer than
    two candidate peaks are found, because no interval can be formed.

    :param ecg: ecg datastream
    :param fs: sampling frequency
    :param threshold: initial threshold to detect the R peak in a signal
        normalized by the 90th percentile. .5 is default.
    :param blackman_win_len_range: the range to calculate blackman window
        length
    :return: datastream whose datapoints are (R peak timestamp, interval in
        seconds since the previous R peak)
    """
    data = ecg.data
    result = DataStream.from_datastream([ecg])
    if len(data) == 0:
        result.data = []
        return result

    sample = np.array([i.sample for i in data])
    timestamp = np.array([i.start_time for i in data])

    # computes the moving window integration of the signal
    blackman_win_len = np.ceil(fs * blackman_win_len_range)
    y = compute_moving_window_int(sample, fs, blackman_win_len)

    peak_location_values = [(i, y[i]) for i in range(2, len(y) - 1)
                            if check_peak(y[i - 2:i + 3])]
    peak_location = [location for location, _ in peak_location_values]

    # The initial RR average divides by (number of peaks - 1); with a single
    # candidate that is a division by zero, and with none it is meaningless.
    if len(peak_location) < 2:
        result.data = []
        return result

    # initial RR interval average
    running_rr_avg = sum(np.diff(peak_location)) / (len(peak_location) - 1)

    rpeak_temp1 = compute_r_peaks(threshold, running_rr_avg, y,
                                  peak_location_values)
    rpeak_temp2 = remove_close_peaks(rpeak_temp1, sample, fs)
    index = confirm_peaks(rpeak_temp2, sample, fs)

    rpeak_timestamp = timestamp[index]
    rpeak_value = np.diff(rpeak_timestamp)
    # The first peak has no predecessor and therefore no interval.
    rpeak_timestamp = rpeak_timestamp[1:]

    # Create resulting datastream to be returned
    result.data = [
        DataPoint.from_tuple(
            peak_time, interval.seconds + interval.microseconds / 1e6)
        for peak_time, interval in zip(rpeak_timestamp, rpeak_value)
    ]
    return result
def process_feature(file_path, metadata_path):
    """
    Read a qualtrics CSV file and save two streams per user.

    Each data row yields two DataPoints that share the same sample (every
    value from column 6 onwards) and the same end time:

    * a qualtrics point starting at the time in column 3, and
    * an EMS point starting at noon of the ``%Y%m%d`` date in column 1.

    Start times are localized by the first digit of a four character user
    id (5 -> central, 1 -> eastern, 9 -> pacific, anything else -> central).
    Values containing 'yes' or 'no' are skipped and values containing 'NA'
    are stored as NaN. Rows whose id is not in ``user_id_mappings`` are
    skipped.

    The EMS stream name is the metadata name with 'data_qualtrics' replaced
    by 'data_qualtrics_ems'.

    :param file_path: path of the CSV file; also part of the stream UUIDs
    :param metadata_path: path of the JSON file describing the output stream
    :return: None
    """
    f = open_data_file(file_path)
    if f is None:
        # Return before opening the metadata file so no handle is leaked.
        return

    time_format = '%m/%d/%Y %H:%M'
    is_igtb = 'IGTB' in file_path
    feature_data = {}

    try:
        with open(metadata_path) as mf:
            reader = csv.reader(f)
            next(reader, None)  # the first row is the header

            for row in reader:
                # handling corrupt data, some user id's are NA
                if row[0] not in user_id_mappings:
                    continue
                user_id = user_id_mappings[row[0]]

                ems_start_time = datetime.strptime(row[1] + ' 12:00:00',
                                                   '%Y%m%d %H:%M:%S')
                qualtrics_start_time = datetime.strptime(row[3], time_format)

                region = int(user_id[0]) if len(user_id) == 4 else None
                if region == 1:  # all 1xxx users are east
                    user_tz = easterntz
                elif region == 9:  # all 9xxx users are west
                    user_tz = pacifictz
                else:  # 5xxx users and every other id are central
                    user_tz = centraltz
                ems_start_time = user_tz.localize(ems_start_time)
                qualtrics_start_time = user_tz.localize(qualtrics_start_time)

                # handling the different format of the IGTB file
                if not is_igtb:
                    # NOTE(review): the end time is always localized to
                    # central time, even for eastern/pacific users - confirm.
                    end_time = centraltz.localize(
                        datetime.strptime(row[4], time_format))
                else:
                    # This branch used an undefined name and raised
                    # NameError; the qualtrics start time is the start that
                    # belongs to this end time.
                    end_time = datetime(year=qualtrics_start_time.year,
                                        month=qualtrics_start_time.month,
                                        day=qualtrics_start_time.day,
                                        hour=qualtrics_start_time.hour,
                                        minute=qualtrics_start_time.minute)

                # The original note says DataPoint expects the offset in
                # milliseconds.
                utc_offset = ems_start_time.utcoffset().total_seconds() * 1000

                values = []
                for val in row[6:]:
                    # Check for Daily.tob.d.mitre.csv
                    if 'yes' in val or 'no' in val:
                        continue
                    values.append(
                        float('nan') if 'NA' in val else float(val))

                ems_dp = DataPoint(start_time=ems_start_time,
                                   end_time=end_time,
                                   offset=utc_offset,
                                   sample=values)
                q_dp = DataPoint(start_time=qualtrics_start_time,
                                 end_time=end_time,
                                 offset=utc_offset,
                                 sample=values)
                feature_data.setdefault(user_id, []).append((q_dp, ems_dp))

            metadata = json.loads(mf.read())
            metadata_name = metadata['name']
            ems_stream_name = \
                metadata_name.replace('data_qualtrics', 'data_qualtrics_ems')

            for user, pairs in feature_data.items():
                q_ds = DataStream(
                    identifier=str(
                        uuid.uuid3(uuid.NAMESPACE_DNS,
                                   str(metadata_name + user + file_path))),
                    owner=user,
                    name=metadata_name,
                    data_descriptor=metadata['data_descriptor'],
                    execution_context=metadata['execution_context'],
                    annotations=metadata['annotations'],
                    stream_type=1,
                    data=[pair[0] for pair in pairs])

                ems_ds = DataStream(
                    identifier=str(
                        uuid.uuid3(uuid.NAMESPACE_DNS,
                                   str(ems_stream_name + user + file_path))),
                    owner=user,
                    name=ems_stream_name,
                    data_descriptor=metadata['data_descriptor'],
                    execution_context=metadata['execution_context'],
                    annotations=metadata['annotations'],
                    stream_type=1,
                    data=[pair[1] for pair in pairs])

                # Best effort: one failed save must not block the other
                # stream or the remaining users.
                try:
                    CC.save_stream(q_ds, localtime=True)
                except Exception as e:
                    print(e)
                try:
                    CC.save_stream(ems_ds, localtime=True)
                except Exception as e:
                    print(e)
    finally:
        f.close()
def store(data: OrderedDict, input_streams: dict, output_streams: dict,
          CC_obj: CerebralCortex):
    """
    Store LED data-quality results together with their metadata.

    Nothing is stored when ``data`` is empty.

    :param data: diagnostic results to store
    :param input_streams: input stream descriptions; the owner is read from
        ``input_streams[0]["owner_id"]``
    :param output_streams: output stream description providing "id" and
        "name"
    :param CC_obj: CerebralCortex object used to save the datastream
    """
    if not data:
        return

    led_quality_description = "measures the Data Quality of LED. Values= GOOD(0), BAND_OFF(1), NOT_WORN(2), BAND_LOOSE(3), NOISE(4)"

    # basic output stream info
    owner = input_streams[0]["owner_id"]
    dd_stream_id = output_streams["id"]
    dd_stream_name = output_streams["name"]

    data_descriptor = [{
        "NAME": "Data Quality (LED)",
        "DATA_TYPE": "int",
        "FREQUENCY": "0.33",
        "MAX_VALUE": "4",
        "MIN_VALUE": "0",
        "DESCRIPTION": led_quality_description
    }]

    platform_metadata = {"NAME": "MotionSense HRV", "DEVICE_ID": ""}

    algorithm = {
        "method": "",
        "authors": ["Nasir Ali", " Md Azim Ullah"],
        "version": "0.0.1",
        "reference": {
            "url": "http://md2k.org/"
        },
        "description": ""
    }
    processing_module = {
        "name": "",
        "environment": "cerebralcortex",
        "algorithm": [algorithm],
        "description": "",
        "input_streams": input_streams,
        "output_streams": output_streams,
        "input_parameters": {}
    }

    datasource_metadata = {
        "NAME": "Data Quality (LED)",
        "DATA_TYPE": "org.md2k.datakitapi.datatype.DataTypeInt",
        "FREQUENCY": "0.33",
        "DESCRIPTION": led_quality_description
    }

    application_metadata = {
        "NAME": "MotionSense",
        "DESCRIPTION": "Collects data from the motion sense. Sensors supported: [Accelerometer, Gyroscope, Battery, LED, DataQuality]",
        "VERSION_NAME": "0.0.1",
        "VERSION_NUMBER": "2000500"
    }

    execution_context = {
        "platform_metadata": platform_metadata,
        "processing_module": processing_module,
        "datasource_metadata": datasource_metadata,
        "application_metadata": application_metadata
    }

    ds = DataStream(identifier=dd_stream_id,
                    owner=owner,
                    name=dd_stream_name,
                    data_descriptor=data_descriptor,
                    execution_context=execution_context,
                    annotations=[],
                    stream_type="ds",
                    data=data)

    CC_obj.save_datastream(ds, "datastream")
def compute_zero_cross_rate(ds, exclude_col_names: list = [], feature_names=['zero_cross_rate']):
    """
    Compute the zero cross rate of every data column of a grouped stream.

    Args:
        ds (DataStream): Windowed/grouped dataframe
        exclude_col_names list(str): name of the columns on which features should not be computed.
            "timestamp", "localtime", "user" and "version" are always excluded. The list passed
            in is not modified.
        feature_names list(str): names of the features. Only 'zero_cross_rate' is computed here.

    Returns:
        DataStream object with one row per group, holding timestamp, localtime, user, version,
        start_time, end_time and one ``<column>_<feature>`` column per data column

    Raises:
        Exception: if ``ds`` does not hold grouped data
    """
    # Work on a copy: extending the argument in place would grow the shared
    # default list on every call and alter a list owned by the caller.
    excluded_columns = list(exclude_col_names) + ["timestamp", "localtime", "user", "version"]

    # NOTE(review): drop/columns are used on ds._data before the GroupedData
    # check further down - confirm the grouped object supports them.
    data = ds._data.drop(*excluded_columns)

    df_column_names = data.columns

    basic_schema = StructType([
        StructField("timestamp", TimestampType()),
        StructField("localtime", TimestampType()),
        StructField("user", StringType()),
        StructField("version", IntegerType()),
        StructField("start_time", TimestampType()),
        StructField("end_time", TimestampType())
    ])

    features_list = [
        StructField(cn + "_" + sf, FloatType(), True)
        for cn in df_column_names
        for sf in feature_names
    ]

    features_schema = StructType(basic_schema.fields + features_list)

    def calculate_zero_cross_rate(series):
        """
        How often the signal changes sign (+/-) around its mean
        """
        series_mean = np.mean(series)
        series = [v - series_mean for v in series]
        zero_cross_count = (np.diff(np.sign(series)) != 0).sum()
        return zero_cross_count / len(series)

    @pandas_udf(features_schema, PandasUDFType.GROUPED_MAP)
    def get_features_udf(df):
        results = []
        timestamp = df['timestamp'].iloc[0]
        localtime = df['localtime'].iloc[0]
        user = df['user'].iloc[0]
        version = df['version'].iloc[0]
        # A group is described by its first and last timestamps.
        start_time = timestamp
        end_time = df['timestamp'].iloc[-1]

        df.drop(excluded_columns, axis=1, inplace=True)

        if "zero_cross_rate" in feature_names:
            df_zero_cross_rate = df.apply(calculate_zero_cross_rate)
            df_zero_cross_rate.index += '_zero_cross_rate'
            results.append(df_zero_cross_rate)

        output = pd.DataFrame(pd.concat(results)).T

        basic_df = pd.DataFrame([[timestamp, localtime, user, int(version), start_time, end_time]],
                                columns=['timestamp', 'localtime', 'user', 'version', 'start_time', 'end_time'])
        return basic_df.assign(**output)

    # check if datastream object contains grouped type of DataFrame
    if not isinstance(ds._data, GroupedData):
        raise Exception(
            "DataStream object is not grouped data type. Please use 'window' operation on datastream object before running this algorithm")

    data = ds._data.apply(get_features_udf)
    return DataStream(data=data, metadata=Metadata())
def process(self):
    """
    Build and save a weather datastream for every user location stream.

    For each user, each 'LOCATION--org.md2k.phonesensor--PHONE' stream and
    each day of that stream, the median latitude/longitude is matched to a
    known location, the weather records of that location and day are
    turned into DataPoints, and the resulting stream is saved. Days without
    location data, with a (0, 0) median, without a matching location, or
    without weather records are skipped.

    Stream metadata is read from "weather_data.json".
    """
    user_ids = self.filter_user_ids()

    # get all locations lats/longs
    all_locations = self.sqlData.get_latitude_llongitude()

    with open("weather_data.json", "r") as metadata_file:
        metadata = json.loads(metadata_file.read())

    input_stream_name = 'LOCATION--org.md2k.phonesensor--PHONE'

    for uid in user_ids:
        stream_ids = self.CC.get_stream_id(uid, input_stream_name)
        if len(stream_ids) == 0:
            continue

        print("Processing:", uid)
        for stream_entry in stream_ids:
            sid = stream_entry["identifier"]

            for day in self.CC.get_stream_days(sid):
                print("User ID, Stream ID, Day", uid, sid, day)

                # get gps data from stream-name
                # 'LOCATION--org.md2k.phonesensor--PHONE'
                location_stream = self.CC.get_stream(stream_id=sid, day=day)
                if len(location_stream.data) == 0:
                    continue

                # compute median on lat. and long. vals
                user_loc = self.compute_lat_long_median(location_stream.data)
                if user_loc == (0, 0):
                    continue

                offset = location_stream.data[0].offset

                # get weather data for the matching lat/long values
                location_id = self.get_location_id(user_loc, all_locations)
                if location_id is None:
                    continue

                formated_day = datetime.strptime(
                    day, "%Y%m%d").strftime("%Y-%m-%d")
                weather_data = self.sqlData.get_weather_data_by_city_id(
                    location_id, formated_day)

                # convert data into datastream; execution_context is the
                # same object as metadata["execution_context"]
                execution_context = metadata["execution_context"]
                execution_context["processing_module"]["input_streams"] = [{
                    "id": sid,
                    "name": input_stream_name
                }]

                dps = []
                for record in weather_data:
                    record["temperature"] = json.loads(record["temperature"])
                    record["wind"] = json.loads(record["wind"])

                    # difference in hours
                    day_light_duration = (
                        (record["sunset"] - record["sunrise"]).seconds) / 3600

                    dp_sample = [
                        record["sunrise"],
                        record["sunset"],
                        day_light_duration,
                        record.get("wind", float('nan')).get(
                            "deg", float('nan')),
                        record.get("wind", float('nan')).get(
                            "speed", float('nan')),
                        record["temperature"]["temp"],
                        record["temperature"]["temp_max"],
                        record["temperature"]["temp_min"],
                        int(record["humidity"]),
                        int(record["clouds"]),
                        record["other"]
                    ]
                    dps.append(
                        DataPoint(record["start_time"], None, offset,
                                  dp_sample))

                if len(dps) == 0:
                    continue

                # generate UUID for stream; the marker text appears twice in
                # the seed, which existing stream ids depend on
                uuid_seed = "".join([
                    str(metadata["data_descriptor"]),
                    str(execution_context),
                    str(metadata["annotations"]),
                    "weather-data-stream",
                    "weather-data-stream",
                    str(uid),
                    str(sid),
                ])
                output_stream_id = str(
                    uuid.uuid3(uuid.NAMESPACE_DNS, uuid_seed))

                ds = DataStream(
                    identifier=output_stream_id,
                    owner=uid,
                    name=metadata["name"],
                    data_descriptor=metadata["data_descriptor"],
                    execution_context=execution_context,
                    annotations=metadata["annotations"],
                    stream_type=metadata["type"],
                    data=dps)

                # store data stream
                self.CC.save_stream(ds)