def save_point(user, value, start_time, end_time, offset, metadata,
               stream_name_suffix):
    dp = DataPoint(start_time=start_time,
                   end_time=end_time,
                   offset=offset,
                   sample=[value])

    metadata_name = metadata['name']
    metadata_name = metadata_name + stream_name_suffix

    output_stream_id = str(
        uuid.uuid3(uuid.NAMESPACE_DNS,
                   str(metadata_name + user + str(metadata))))
    ds = DataStream(identifier=output_stream_id,
                    owner=user,
                    name=metadata_name,
                    data_descriptor=metadata['data_descriptor'],
                    execution_context=metadata['execution_context'],
                    annotations=metadata['annotations'],
                    stream_type=1,
                    data=[dp])
    #print(str(user),str(output_stream_id),len(feature_data[user]))
    try:
        CC.save_stream(ds, localtime=True)
    except Exception as e:
        print(e)
# Example 2
def filter_bad_ecg(ecg: DataStream, ecg_quality: DataStream) -> DataStream:
    """
    Combine the raw ECG datastream with the ECG data-quality datastream and keep only the DataPoints whose quality is marked acceptable

    :param ecg: raw ecg datastream
    :param ecg_quality: ecg quality datastream

    :return:  filtered ecg datastream
    """
    ecg_filtered = DataStream.from_datastream([ecg])
    ecg_quality_array = ecg_quality.data
    ecg_raw_timestamp_array = np.array(
        [i.start_time.timestamp() for i in ecg.data])
    ecg_filtered_array = []

    initial_index = 0
    for item in ecg_quality_array:
        if item.sample == Quality.ACCEPTABLE:
            final_index = initial_index
            for i in range(initial_index, len(ecg.data)):
                if (item.start_time.timestamp() <= ecg_raw_timestamp_array[i]
                        <= item.end_time.timestamp()):
                    ecg_filtered_array.append(ecg.data[i])
                    final_index = i
            initial_index = final_index

    ecg_filtered.data = ecg_filtered_array

    return ecg_filtered
def store(data: OrderedDict, input_streams: dict, output_streams: dict, metadata, CC_obj: CerebralCortex, config: dict):
    """
    Store diagnostic results with its metadata in the data-store
    :param input_streams:
    :param data:
    :param CC_obj:
    :param config:
    :param algo_type:
    """
    if data:
        # basic output stream info
        owner = input_streams[0]["owner_id"]
        dd_stream_id = output_streams["id"]
        dd_stream_name = output_streams["name"]
        stream_type = "ds"

        data_descriptor = metadata["dd"]
        execution_context = metadata["ec"]
        annotations = metadata["anno"]

        ds = DataStream(identifier=dd_stream_id, owner=owner, name=dd_stream_name, data_descriptor=data_descriptor,
                        execution_context=execution_context, annotations=annotations,
                        stream_type=stream_type, data=data)

        CC_obj.save_datastream(ds, "datastream")
def store(self, identifier, owner, name, data_descriptor, execution_context,
          annotations, stream_type=StreamTypes.DATASTREAM, data=None,
          localtime=True):
    '''
    All store operations MUST be through this method.
    '''
    if not data:
        self.CC.logging.log(
            error_type=LogTypes.MISSING_DATA,
            error_message='Null data received for saving stream from ' +
                          self.__class__.__name__)
        return

    ds = DataStream(identifier=identifier, owner=owner, name=name,
                    data_descriptor=data_descriptor,
                    execution_context=execution_context,
                    annotations=annotations,
                    stream_type=stream_type, data=data)
    try:
        self.CC.save_stream(datastream=ds, localtime=localtime)
        self.CC.logging.log('Saved %d data points stream id %s user_id %s from %s' %
                            (len(data), str(identifier), str(owner),
                             self.__class__.__name__))
    except Exception as exp:
        self.CC.logging.log(self.__class__.__name__ + str(exp) + "\n" +
                            str(traceback.format_exc()))
def autosense_sequence_align(datastreams: List[DataStream],
                             sampling_frequency: float) -> DataStream:
    result = DataStream.from_datastream(input_streams=datastreams)
    result.data = []

    if len(datastreams) == 0:
        return result

    start_time = None
    for ds in datastreams:
        ts = ds.data[0].start_time
        if not start_time:
            start_time = ts
        elif start_time < ts:
            start_time = ts

    start_time -= datetime.timedelta(seconds=1.0 / sampling_frequency)

    data_block = []
    max_index = np.inf
    for ds in datastreams:
        d = [i for i in ds.data if i.start_time > start_time]
        if len(d) < max_index:
            max_index = len(d)
        data_block.append(d)

    # streams may have different lengths; index the trimmed python lists directly
    dimensions = len(data_block)
    for i in range(0, max_index):
        sample = [data_block[d][i].sample for d in range(0, dimensions)]
        result.data.append(DataPoint.from_tuple(data_block[0][i].start_time, sample))

    return result
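
# Tiny stand-alone illustration (toy tuples, no DataStream) of the alignment rule
# used above: the common start is the latest first-sample time across the streams,
# minus one sample period, and every stream is truncated to the shortest remaining
# length before samples are combined index-wise. All values here are made up.
import datetime

fs = 10.0
t0 = datetime.datetime(2020, 1, 1)
stream_a = [(t0 + datetime.timedelta(seconds=i / fs), i) for i in range(10)]
stream_b = [(t0 + datetime.timedelta(seconds=0.25 + i / fs), 10 * i) for i in range(8)]

start = max(s[0][0] for s in (stream_a, stream_b)) - datetime.timedelta(seconds=1.0 / fs)
trimmed = [[p for p in s if p[0] > start] for s in (stream_a, stream_b)]
n = min(len(s) for s in trimmed)
aligned = [(trimmed[0][i][0], [trimmed[0][i][1], trimmed[1][i][1]]) for i in range(n)]
print(len(aligned), aligned[0])
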
# Example 6
def filter_bad_rip(rip: DataStream, rip_quality: DataStream) -> DataStream:
    """
    Combine the raw respiration (RIP) datastream with the RIP data-quality datastream and keep only the DataPoints whose quality is marked acceptable

    :param rip: raw respiration datastream
    :param rip_quality: respiration quality datastream

    :return:  filtered respiration datastream
    """
    rip_filtered = DataStream.from_datastream([rip])
    rip_quality_array = rip_quality.data
    rip_raw_timestamp_array = np.array(
        [i.start_time.timestamp() for i in rip.data])
    rip_filtered_array = []

    initial_index = 0
    for item in rip_quality_array:
        if item.sample == Quality.ACCEPTABLE:
            final_index = initial_index
            for i in range(initial_index, len(rip.data)):
                if (item.start_time.timestamp() <= rip_raw_timestamp_array[i]
                        <= item.end_time.timestamp()):
                    rip_filtered_array.append(rip.data[i])
                    final_index = i
            initial_index = final_index

    rip_filtered.data = rip_filtered_array

    return rip_filtered
# Example 7
def interpolate(ds,
                freq=16,
                method='linear',
                axis=0,
                limit=None,
                inplace=False,
                limit_direction='forward',
                limit_area=None,
                downcast=None):
    """
    Interpolate values according to different methods. This method internally uses pandas interpolation.

    Args:
        ds (DataStream): Windowed/grouped DataStream object
        freq (int): Frequency of the signal
        method (str): default ‘linear’
            - ‘linear’: Ignore the index and treat the values as equally spaced. This is the only method supported on MultiIndexes.
            - ‘time’: Works on daily and higher resolution data to interpolate given length of interval.
            - ‘index’, ‘values’: use the actual numerical values of the index.
            - ‘pad’: Fill in NaNs using existing values.
            - ‘nearest’, ‘zero’, ‘slinear’, ‘quadratic’, ‘cubic’, ‘spline’, ‘barycentric’, ‘polynomial’: Passed to scipy.interpolate.interp1d. These methods use the numerical values of the index. Both ‘polynomial’ and ‘spline’ require that you also specify an order (int), e.g. df.interpolate(method='polynomial', order=5).
            - ‘krogh’, ‘piecewise_polynomial’, ‘spline’, ‘pchip’, ‘akima’: Wrappers around the SciPy interpolation methods of similar names. See Notes.
            - ‘from_derivatives’: Refers to scipy.interpolate.BPoly.from_derivatives which replaces ‘piecewise_polynomial’ interpolation method in scipy 0.18.
        axis {0 or ‘index’, 1 or ‘columns’, None}: default 0. Axis to interpolate along.
        limit (int): optional. Maximum number of consecutive NaNs to fill. Must be greater than 0.
        inplace (bool): default False. Update the data in place if possible.
        limit_direction {‘forward’, ‘backward’, ‘both’}: default ‘forward’. If limit is specified, consecutive NaNs will be filled in this direction.
        limit_area  {None, ‘inside’, ‘outside’}: default None. If limit is specified, consecutive NaNs will be filled with this restriction.
            - None: No fill restriction.
            - ‘inside’: Only fill NaNs surrounded by valid values (interpolate).
            - ‘outside’: Only fill NaNs outside valid values (extrapolate).
        downcast optional, ‘infer’ or None: defaults to None

    Returns DataStream: interpolated data

    """
    schema = ds._data.schema
    sample_freq = 1000 / freq

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def interpolate_data(pdf):
        pdf.set_index("timestamp", inplace=True)
        pdf = pdf.resample(str(sample_freq) + "ms").bfill(limit=1).interpolate(
            method=method,
            axis=axis,
            limit=limit,
            inplace=inplace,
            limit_direction=limit_direction,
            limit_area=limit_area,
            downcast=downcast)
        pdf.ffill(inplace=True)
        pdf.reset_index(drop=False, inplace=True)
        pdf.sort_index(axis=1, inplace=True)
        return pdf

    data = ds._data.groupby(["user", "version"]).apply(interpolate_data)
    return DataStream(data=data, metadata=Metadata())
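
# Illustrative, self-contained sketch (not part of the original module): the same
# resample -> bfill -> interpolate -> ffill chain the grouped UDF above applies,
# run on a toy pandas frame so the effect is visible without Spark. The column
# names and the 16 Hz frequency are assumptions for the demo.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    "timestamp": pd.to_datetime(["2020-01-01 00:00:00.000",
                                 "2020-01-01 00:00:00.250",
                                 "2020-01-01 00:00:00.500"]),
    "accelerometer_x": [0.1, np.nan, 0.5],
})
toy.set_index("timestamp", inplace=True)
# 16 Hz -> 62.5 ms sample period, matching sample_freq = 1000 / freq above
resampled = toy.resample("62.5ms").bfill(limit=1).interpolate(method="linear")
resampled.ffill(inplace=True)
print(resampled.reset_index())
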
# Example 8
def accelerometer_features(
        accel: DataStream,
        window_length: float = 10.0,
        activity_threshold: float = 0.21,
        percentile_low: int = 1,
        percentile_high: int = 99
) -> Tuple[DataStream, DataStream, DataStream]:
    """

    References:
        Figure 3: http://www.cs.memphis.edu/~santosh/Papers/Timing-JIT-UbiComp-2014.pdf

    :param percentile_high:
    :param percentile_low:
    :param accel:
    :param window_length:
    :param activity_threshold:
    :return:
    """
    accelerometer_magnitude = magnitude(normalize(accel))

    accelerometer_win_mag_deviations_data = []
    for key, data in window(accelerometer_magnitude.data,
                            window_length).items():
        accelerometer_win_mag_deviations_data.append(
            window_std_dev(data, key[0]))

    accelerometer_win_mag_deviations = DataStream.from_datastream([accel])
    accelerometer_win_mag_deviations.data = accelerometer_win_mag_deviations_data

    am_values = np.array([dp.sample for dp in accelerometer_magnitude.data])
    low_limit = np.percentile(am_values, percentile_low)
    high_limit = np.percentile(am_values, percentile_high)
    value_range = high_limit - low_limit

    accel_activity_data = []
    for dp in accelerometer_win_mag_deviations_data:
        comparison = dp.sample > (low_limit + activity_threshold * value_range)
        accel_activity_data.append(
            DataPoint.from_tuple(dp.start_time, comparison))

    accel_activity = DataStream.from_datastream([accel])
    accel_activity.data = accel_activity_data

    return accelerometer_magnitude, accelerometer_win_mag_deviations, accel_activity
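
# Minimal numpy sketch (synthetic values, not from the referenced paper) of how the
# activity threshold above is derived from the magnitude distribution.
# `mag_std_devs` stands in for the per-window standard deviations computed above.
import numpy as np

magnitude_values = np.abs(np.random.randn(1000))        # stand-in magnitude stream
mag_std_devs = np.abs(np.random.randn(50)) * 0.3        # stand-in per-window std-devs

low_limit = np.percentile(magnitude_values, 1)
high_limit = np.percentile(magnitude_values, 99)
value_range = high_limit - low_limit

# a window counts as "active" when its deviation exceeds 21% of the usable range
activity_flags = mag_std_devs > (low_limit + 0.21 * value_range)
print(activity_flags.sum(), "active windows out of", len(mag_std_devs))
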
def process_save_stream(msg: dict, cc_config_path: str):
    """
    Process one Kafka message, add gaussian noise to the data, and store the result as a new stream

    Args:
        msg (dict): kafka message - {'filename': str, 'metadata_hash': str, "stream_name": str, "user_id": str}
        cc_config_path (str): path of cerebralcortex configs

    Notes:
        This method creates the CC object again because this code runs on a worker node and therefore has no access to the CC object created in run().
        The CC object cannot be passed to worker nodes because it contains sockets, and sockets cannot be serialized by Spark to be passed as a parameter.

    """

    # Disable pandas warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)

    CC = Kernel(cc_config_path, enable_spark=False)
    cc_config = CC.config
    stream_name = msg.get("stream_name")
    user_id = msg.get("user_id")

    if cc_config["nosql_storage"] == "filesystem":
        file_name = str(
            cc_config["filesystem"]["filesystem_path"]) + msg.get("filename")
    elif cc_config["nosql_storage"] == "hdfs":
        file_name = str(
            cc_config["hdfs"]["raw_files_dir"]) + msg.get("filename")
    else:
        raise Exception(
            str(cc_config["nosql_storage"]) +
            " is not supported. Please use filesystem or hdfs.")

    if os.path.exists(file_name):
        data = pq.read_table(file_name)
        pdf = data.to_pandas()

        pdf = add_gaussian_noise(pdf)

        new_stream_name = stream_name + "_gaussian_noise"

        metadata = Metadata().set_name(new_stream_name).set_description("Gaussian noise added to the accel sensor stream.") \
            .add_dataDescriptor(
            DataDescriptor().set_attribute("description", "noisy accel x")) \
            .add_dataDescriptor(
            DataDescriptor().set_attribute("description", "noisy accel y")) \
            .add_dataDescriptor(
            DataDescriptor().set_attribute("description", "noisy accel z")) \
            .add_module(
            ModuleMetadata().set_name("cerebralcortex.streaming_operation.main").set_version("0.0.1").set_attribute("description", "Spark streaming example using CerebralCortex. This example adds gaussian noise to a stream data.").set_author(
                "test_user", "test_user@test_email.com"))

        pdf["user"] = user_id
        ds = DataStream(data=pdf, metadata=metadata)
        CC.save_stream(ds)
    else:
        print(file_name, "does not exist.")
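
# `add_gaussian_noise` is referenced above but not defined in this snippet; a
# plausible pandas/numpy implementation might look like the following. The noise
# scale and the "add noise to every numeric column" behaviour are assumptions for
# illustration only.
import numpy as np
import pandas as pd

def add_gaussian_noise(pdf: pd.DataFrame, sigma: float = 0.01) -> pd.DataFrame:
    """Add zero-mean gaussian noise to every numeric column of the frame."""
    noisy = pdf.copy()
    numeric_cols = noisy.select_dtypes(include=[np.number]).columns
    noisy[numeric_cols] = noisy[numeric_cols] + np.random.normal(
        loc=0.0, scale=sigma, size=noisy[numeric_cols].shape)
    return noisy
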
# Example 10
def ecg_data_quality(datastream: DataStream,
                     window_size: float = 2.0,
                     acceptable_outlier_percent: float = .34,
                     outlier_threshold_high: float = .9769,
                     outlier_threshold_low: float = .004884,
                     ecg_threshold_band_loose: float = .01148,
                     ecg_threshold_slope: float = .02443,
                     buffer_length: int = 3) -> DataStream:
    """

    :param datastream: Input ECG datastream
    :param window_size: Window size specifying the number of seconds the datastream is divided to check for data quality
    :param acceptable_outlier_percent: The acceptable outlier percentage in a window (default is 34 percent)
    :param outlier_threshold_high: The percentage of the ADC range above which any value is considered an outlier
    :param outlier_threshold_low: The percentage of the ADC range below which any value is considered an outlier
    :param ecg_threshold_band_loose: The band-loose threshold for the ECG signal, expressed as a percentage of the ADC range
    :param ecg_threshold_slope: The slope threshold of the ECG signal: no two consecutive DataPoints should differ
    by more than this (expressed as a percentage of the ADC range)
    :param buffer_length: The memory of the data quality computation, i.e. the number of past windows
    that also contribute to the quality decision for the current window

    :return: An Annotated Datastream of ECG Data quality specifying the time ranges when data quality was acceptable/non-acceptable
    """

    ecg_quality_stream = DataStream.from_datastream(input_streams=[datastream])
    window_data = window(datastream.data, window_size=window_size)

    ecg_quality = []
    ecg_range = []
    for key, data in window_data.items():
        if len(data) > 0:
            result = compute_data_quality(data, ecg_range, True,
                                          ecg_threshold_band_loose,
                                          ecg_threshold_slope,
                                          acceptable_outlier_percent,
                                          outlier_threshold_high,
                                          outlier_threshold_low, buffer_length)
            if not ecg_quality:
                ecg_quality.append(
                    DataPoint.from_tuple(data[0].start_time, result,
                                         data[-1].start_time))
            else:
                if ecg_quality[-1].sample == result:
                    new_point = DataPoint.from_tuple(
                        ecg_quality[-1].start_time, result,
                        data[-1].start_time)
                    ecg_quality[-1] = new_point
                else:
                    ecg_quality.append(
                        DataPoint.from_tuple(data[0].start_time, result,
                                             data[-1].start_time))

    ecg_quality_stream.data = ecg_quality

    return ecg_quality_stream
# Example 11
def compute_outlier_ecg(ecg_rr: DataStream) -> DataStream:
    """
    Reference - Berntson, Gary G., et al. "An approach to artifact identification: Application to heart period data."
    Psychophysiology 27.5 (1990): 586-598.

    :param ecg_rr: RR interval datastream

    :return: An annotated datastream specifying when the ECG RR interval datastream is acceptable
    """

    ecg_rr_outlier_stream = DataStream.from_datastream(input_streams=[ecg_rr])
    if not ecg_rr.data:
        ecg_rr_outlier_stream.data = []
        return ecg_rr_outlier_stream

    valid_rr_interval_sample = [
        i.sample for i in ecg_rr.data if i.sample > .3 and i.sample < 2
    ]
    valid_rr_interval_time = [
        i.start_time for i in ecg_rr.data if i.sample > .3 and i.sample < 2
    ]
    valid_rr_interval_difference = abs(np.diff(valid_rr_interval_sample))

    # Maximum Expected Difference (MED); Berntson et al. use 3.32 * QD,
    # this implementation uses 4.5 * QD (Quartile Deviation = 0.5 * IQR)
    maximum_expected_difference = 4.5 * 0.5 * iqr(valid_rr_interval_difference)

    # Shortest Expected Beat (SEB) = Median Beat - 2.9 * Quartile Deviation
    # Minimal Artifact Difference (MAD) = SEB / 3
    minimal_artifact_difference = (np.median(valid_rr_interval_sample) - 2.9 *
                                   .5 * iqr(valid_rr_interval_difference)) / 3

    # The criterion is midway between MED and MAD
    criterion_beat_difference = (maximum_expected_difference +
                                 minimal_artifact_difference) / 2
    if criterion_beat_difference < .2:
        criterion_beat_difference = .2

    ecg_rr_quality_array = [
        DataPoint.from_tuple(valid_rr_interval_time[0], Quality.ACCEPTABLE,
                             valid_rr_interval_time[0])
    ]

    for data in outlier_computation(valid_rr_interval_time,
                                    valid_rr_interval_sample,
                                    criterion_beat_difference):
        if ecg_rr_quality_array[-1].sample == data.sample:
            new_point = DataPoint.from_tuple(
                ecg_rr_quality_array[-1].start_time, data.sample,
                data.start_time)
            ecg_rr_quality_array[-1] = new_point
        else:
            ecg_rr_quality_array.append(data)

    ecg_rr_outlier_stream.data = ecg_rr_quality_array
    return ecg_rr_outlier_stream
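
# Stand-alone sketch of the Berntson-style thresholds computed above, on synthetic
# RR intervals, so the arithmetic can be checked without a DataStream (all values
# here are made up for illustration).
import numpy as np
from scipy.stats import iqr

rr = np.random.normal(0.8, 0.05, 500)                   # synthetic RR intervals (s)
rr = rr[(rr > .3) & (rr < 2)]                           # same validity band as above
rr_diff = np.abs(np.diff(rr))

quartile_deviation = 0.5 * iqr(rr_diff)
med = 4.5 * quartile_deviation                          # maximum expected difference
mad = (np.median(rr) - 2.9 * quartile_deviation) / 3    # minimal artifact difference
criterion_beat_difference = max((med + mad) / 2, 0.2)
print(criterion_beat_difference)
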
# Example 12
    def setUpClass(self):
        self.pp = PhoneFeatures()
        self.data = []
        for t in range(10, 1, -1):
            currentTime = datetime.datetime.now()
            self.data.append(
                DataPoint(currentTime - datetime.timedelta(hours=t - .1),
                          currentTime - datetime.timedelta(hours=t - .9), t))

        ownerUUID = uuid.uuid4()

        phonedata = []
        self.phoneDataStream = DataStream(identifier=uuid.uuid4(),
                                          owner=ownerUUID)
        self.phoneDataStream.data = phonedata

        smsdata = []
        self.smsDataStream = DataStream(identifier=uuid.uuid4(),
                                        owner=ownerUUID)
        self.smsDataStream.data = smsdata
def save(identifier, owner, name, data_descriptor, execution_context,
         annotations, stream_type, data):
    ds = DataStream(identifier=identifier, owner=owner, name=name, 
                    data_descriptor=data_descriptor,
                    execution_context=execution_context, 
                    annotations=annotations,
                    stream_type=stream_type, data=data)

    try:
        CC.save_stream(ds)
        print("Saved %d data points" % (len(data)))
    except Exception as e:
        print(traceback.format_exc())
def magnitude(datastream: DataStream) -> DataStream:
    """

    :param datastream:
    :return:
    """
    result = DataStream.from_datastream(input_streams=[datastream])
    if datastream.data is None or len(datastream.data) == 0:
        result.data = []
        return result

    input_data = np.array([i.sample for i in datastream.data])

    data = norm(input_data, axis=1).tolist()

    result.data = [DataPoint.from_tuple(start_time=v.start_time, sample=data[i])
                   for i, v in enumerate(datastream.data)]

    return result
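
# Quick self-contained check of the magnitude computation above: the row-wise
# L2 norm of 3-axis samples (synthetic values, no DataStream required).
import numpy as np
from numpy.linalg import norm

samples = np.array([[0.0, 3.0, 4.0],
                    [1.0, 2.0, 2.0]])
print(norm(samples, axis=1).tolist())   # [5.0, 3.0]
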
# Example 15
def complementary_filter(ds, freq: int = 16, accelerometer_x: str = "accelerometer_x",
                         accelerometer_y: str = "accelerometer_y", accelerometer_z: str = "accelerometer_z",
                         gyroscope_x: str = "gyroscope_x", gyroscope_y: str = "gyroscope_y",
                         gyroscope_z: str = "gyroscope_z"):
    """
    Compute complementary filter on gyro and accel data.

    Args:
        ds (DataStream ): Non-Windowed/grouped dataframe
        freq (int): frequency of accel/gryo. Assumption is that frequency is equal for both gyro and accel.
        accelerometer_x (str): name of the column
        accelerometer_y (str): name of the column
        accelerometer_z (str): name of the column
        gyroscope_x (str): name of the column
        gyroscope_y (str): name of the column
        gyroscope_z (str): name of the column
    """
    dt = 1.0 / freq  # e.g. 1/16.0 for 16 Hz
    M_PI = math.pi
    hpf = 0.90
    lpf = 0.10

    window = Window.partitionBy(ds._data['user']).orderBy(ds._data['timestamp'])

    data = ds._data.withColumn("thetaX_accel",
                               ((F.atan2(-F.col(accelerometer_z), F.col(accelerometer_y)) * 180 / M_PI)) * lpf) \
        .withColumn("roll",
                    (F.lag("thetaX_accel").over(window) + F.col(gyroscope_x) * dt) * hpf + F.col("thetaX_accel")).drop(
        "thetaX_accel") \
        .withColumn("thetaY_accel",
                    ((F.atan2(-F.col(accelerometer_x), F.col(accelerometer_z)) * 180 / M_PI)) * lpf) \
        .withColumn("pitch",
                    (F.lag("thetaY_accel").over(window) + F.col(gyroscope_y) * dt) * hpf + F.col("thetaY_accel")).drop(
        "thetaY_accel") \
        .withColumn("thetaZ_accel",
                    ((F.atan2(-F.col(accelerometer_y), F.col(accelerometer_x)) * 180 / M_PI)) * lpf) \
        .withColumn("yaw",
                    (F.lag("thetaZ_accel").over(window) + F.col(gyroscope_z) * dt) * hpf + F.col("thetaZ_accel")).drop(
        "thetaZ_accel")

    return DataStream(data=data.dropna(), metadata=Metadata())
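
# Scalar illustration (not the exact Spark column expression above) of the
# complementary-filter idea: blend a gyro-integrated angle (high-pass weight)
# with an accelerometer-derived angle (low-pass weight). All values are made up.
import math

dt, hpf, lpf = 1.0 / 16, 0.90, 0.10
prev_angle = 10.0                      # previous fused angle, degrees
gyro_rate = 2.0                        # deg/s from the gyroscope
accel_y, accel_z = 0.2, 0.98           # accelerometer reading

accel_angle = math.atan2(-accel_z, accel_y) * 180 / math.pi
fused = hpf * (prev_angle + gyro_rate * dt) + lpf * accel_angle
print(fused)
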
# Example 16
def magnitude(ds, col_names=[]):
    """
    Compute magnitude of columns

    Args:
        ds (DataStream): Windowed/grouped DataStream object
        col_names (list[str]): column names

    Returns:
        DataStream

    """
    if len(col_names) < 1:
        raise Exception("col_names param cannot be empty list.")

    tmp = ""
    for col_name in col_names:
        tmp += 'F.col("' + col_name + '")*F.col("' + col_name + '")+'
    tmp = tmp.rstrip("+")

    data = ds._data.withColumn("magnitude", F.sqrt(eval(tmp)))
    return DataStream(data=data, metadata=Metadata())
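
# A possible eval-free way to build the same sum-of-squares column with
# functools.reduce; the resulting expression should match the string/eval
# construction above (sketch only, a running pyspark session is required).
from functools import reduce
from operator import add
from pyspark.sql import functions as F

def magnitude_expr(col_names):
    squares = [F.col(c) * F.col(c) for c in col_names]
    return F.sqrt(reduce(add, squares))

# data = ds._data.withColumn("magnitude", magnitude_expr(col_names))
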
# Example 17
def json_to_datastream(json_obj, stream_type):
    data = json_obj["data"]
    metadata = json_obj["metadata"]
    identifier = metadata["identifier"]
    owner = metadata["owner"]
    name = metadata["name"]
    data_descriptor = metadata["data_descriptor"]
    execution_context = metadata["execution_context"]
    annotations = metadata["annotations"]
    start_time = data[0]["starttime"]
    end_time = data[-1]["starttime"]
    datapoints = list(map(json_to_datapoints, data))

    return DataStream(identifier,
                      owner,
                      name,
                      data_descriptor,
                      execution_context,
                      annotations,
                      stream_type,
                      start_time,
                      end_time,
                      datapoints)
# Example 18
def compute_FFT_features(ds, exclude_col_names: list = [],
                         feature_names=["fft_centroid", 'fft_spread', 'spectral_entropy', 'fft_flux',
                                            'spectral_falloff']):
    """
    Transforms data from time domain to frequency domain.

    Args:
        ds (DataStream): Windowed/grouped DataStream object
        exclude_col_names list(str): name of the columns on which features should not be computed
        feature_names list(str): names of the features. Supported features are fft_centroid, fft_spread, spectral_entropy, spectral_entropy_old, fft_flux, spectral_falloff


    Returns:
        DataStream object with all the existing data columns and FFT features
    """
    eps = 0.00000001

    # copy to avoid mutating the shared default argument list across calls
    exclude_col_names = list(exclude_col_names) + ["timestamp", "localtime", "user", "version"]

    data = ds._data.drop(*exclude_col_names)

    df_column_names = data.columns

    basic_schema = StructType([
        StructField("timestamp", TimestampType()),
        StructField("localtime", TimestampType()),
        StructField("user", StringType()),
        StructField("version", IntegerType()),
        StructField("start_time", TimestampType()),
        StructField("end_time", TimestampType())
    ])

    features_list = []
    for cn in df_column_names:
        for sf in feature_names:
            features_list.append(StructField(cn + "_" + sf, FloatType(), True))

    features_schema = StructType(basic_schema.fields + features_list)

    def stSpectralCentroidAndSpread(X, fs):
        """Computes spectral centroid of frame (given abs(FFT))"""
        ind = (np.arange(1, len(X) + 1)) * (fs / (2.0 * len(X)))

        Xt = X.copy()
        Xt = Xt / Xt.max()
        NUM = np.sum(ind * Xt)
        DEN = np.sum(Xt) + eps

        # Centroid:
        C = (NUM / DEN)

        # Spread:
        S = np.sqrt(np.sum(((ind - C) ** 2) * Xt) / DEN)

        # Normalize:
        C = C / (fs / 2.0)
        S = S / (fs / 2.0)

        return (C, S)

    def stSpectralFlux(X, Xprev):
        """
        Computes the spectral flux feature of the current frame
        ARGUMENTS:
            X:        the abs(fft) of the current frame
            Xpre:        the abs(fft) of the previous frame
        """
        # compute the spectral flux as the sum of square distances:

        sumX = np.sum(X + eps)
        sumPrevX = np.sum(Xprev + eps)
        F = np.sum((X / sumX - Xprev / sumPrevX) ** 2)

        return F

    def stSpectralRollOff(X, c, fs):
        """Computes spectral roll-off"""

        totalEnergy = np.sum(X ** 2)
        fftLength = len(X)
        Thres = c * totalEnergy
        # Find the spectral rolloff as the frequency position where the cumulative spectral energy equals c * totalEnergy
        CumSum = np.cumsum(X ** 2) + eps
        [a, ] = np.nonzero(CumSum > Thres)
        if len(a) > 0:
            mC = np.float64(a[0]) / (float(fftLength))
        else:
            mC = 0.0
        return (mC)

    def stSpectralEntropy(X, numOfShortBlocks=10):
        """Computes the spectral entropy"""
        L = len(X)  # number of frame samples
        Eol = np.sum(X ** 2)  # total spectral energy

        subWinLength = int(np.floor(L / numOfShortBlocks))  # length of sub-frame
        if L != subWinLength * numOfShortBlocks:
            X = X[0:subWinLength * numOfShortBlocks]

        subWindows = X.reshape(subWinLength, numOfShortBlocks,
                               order='F').copy()  # define sub-frames (using matrix reshape)
        s = np.sum(subWindows ** 2, axis=0) / (Eol + eps)  # compute spectral sub-energies
        En = -np.sum(s * np.log2(s + eps))  # compute spectral entropy

        return En

    def spectral_entropy(data, sampling_freq, bands=None):

        psd = np.abs(np.fft.rfft(data)) ** 2
        psd /= np.sum(psd)  # psd as a pdf (normalised to one)

        if bands is None:
            power_per_band = psd[psd > 0]
        else:
            freqs = np.fft.rfftfreq(data.size, 1 / float(sampling_freq))
            bands = np.asarray(bands)

            freq_limits_low = np.concatenate([[0.0], bands])
            freq_limits_up = np.concatenate([bands, [np.inf]])

            power_per_band = np.asarray(
                [np.sum(psd[np.bitwise_and(freqs >= low, freqs < up)])
                 for low, up in zip(freq_limits_low, freq_limits_up)])

            power_per_band = power_per_band[power_per_band > 0]

        return -np.sum(power_per_band * np.log2(power_per_band))

    def fourier_features_pandas_udf(data, frequency: float = 16.0):

        Fs = frequency  # the sampling freq (in Hz)
        results = []
        # fourier transforms!
        # data_fft = abs(np.fft.rfft(data))

        X = abs(np.fft.fft(data))
        nFFT = int(len(X) / 2) + 1

        X = X[0:nFFT]  # normalize fft
        X = X / len(X)

        if "fft_centroid" or "fft_spread" in feature_names:
            C, S = stSpectralCentroidAndSpread(X, Fs)  # spectral centroid and spread
            if "fft_centroid" in feature_names:
                results.append(C)
            if "fft_spread" in feature_names:
                results.append(S)
        if "spectral_entropy" in feature_names:
            se = stSpectralEntropy(X)  # spectral entropy
            results.append(se)
        if "spectral_entropy_old" in feature_names:
            se_old = spectral_entropy(X, frequency)  # spectral flux
            results.append(se_old)
        if "fft_flux" in feature_names:
            flx = stSpectralFlux(X, X.copy())  # spectral flux
            results.append(flx)
        if "spectral_folloff" in feature_names:
            roff = stSpectralRollOff(X, 0.90, frequency)  # spectral rolloff
            results.append(roff)
        return pd.Series(results)

    @pandas_udf(features_schema, PandasUDFType.GROUPED_MAP)
    def get_fft_features(df):
        timestamp = df['timestamp'].iloc[0]
        localtime = df['localtime'].iloc[0]
        user = df['user'].iloc[0]
        version = df['version'].iloc[0]
        start_time = timestamp
        end_time = df['timestamp'].iloc[-1]

        df.drop(exclude_col_names, axis=1, inplace=True)

        df_ff = df.apply(fourier_features_pandas_udf)
        df3 = df_ff.T
        pd.set_option('display.max_colwidth', None)

        df3.columns = feature_names

        # multiple rows to one row
        output = df3.unstack().to_frame().sort_index(level=1).T
        output.columns = [f'{j}_{i}' for i, j in output.columns]

        basic_df = pd.DataFrame([[timestamp, localtime, user, int(version), start_time, end_time]],
                                columns=['timestamp', 'localtime', 'user', 'version', 'start_time', 'end_time'])
        # df.insert(loc=0, columns=, value=basic_cols)
        return basic_df.assign(**output)

    # check if datastream object contains grouped type of DataFrame
    if not isinstance(ds._data, GroupedData):
        raise Exception(
            "DataStream object is not grouped data type. Please use 'window' operation on datastream object before running this algorithm")

    data = ds._data.apply(get_fft_features)
    return DataStream(data=data, metadata=Metadata())
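
# Stand-alone numpy sketch of the spectral centroid/spread arithmetic used inside
# the UDF above, run on a synthetic 16 Hz sine so the formulas can be sanity-checked
# without Spark (the signal parameters are made up).
import numpy as np

fs = 16.0
t = np.arange(0, 4, 1 / fs)
signal = np.sin(2 * np.pi * 2.0 * t)             # 2 Hz tone

X = np.abs(np.fft.fft(signal))
nFFT = int(len(X) / 2) + 1
X = X[0:nFFT] / len(X)

ind = np.arange(1, len(X) + 1) * (fs / (2.0 * len(X)))
Xt = X / X.max()
centroid = np.sum(ind * Xt) / (np.sum(Xt) + 1e-8)
spread = np.sqrt(np.sum(((ind - centroid) ** 2) * Xt) / (np.sum(Xt) + 1e-8))
print(centroid / (fs / 2.0), spread / (fs / 2.0))   # normalised, as in the UDF
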
# Example 19
def statistical_features(ds,
                         exclude_col_names: list = [],
                         feature_names=[
                             'mean', 'median', 'stddev', 'variance', 'max',
                             'min', 'skew', 'kurt', 'sqr'
                         ]):
    """
    Compute statistical features.

    Args:
        ds (DataStream): Windowed/grouped DataStream object
        exclude_col_names list(str): name of the columns on which features should not be computed
        feature_names list(str): names of the features. Supported features are ['mean', 'median', 'stddev', 'variance', 'max',
                     'min', 'skew', 'kurt', 'sqr']

    Returns:
        DataStream object with all the existing data columns and the computed statistical features
    """
    # copy to avoid mutating the shared default argument list across calls
    exclude_col_names = list(exclude_col_names) + ["timestamp", "localtime", "user", "version"]

    data = ds._data._df.drop(*exclude_col_names)

    df_column_names = data.columns

    basic_schema = StructType([
        StructField("timestamp", TimestampType()),
        StructField("localtime", TimestampType()),
        StructField("user", StringType()),
        StructField("version", IntegerType()),
        StructField("start_time", TimestampType()),
        StructField("end_time", TimestampType())
    ])

    features_list = []
    for cn in df_column_names:
        for sf in feature_names:
            features_list.append(StructField(cn + "_" + sf, FloatType(), True))

    features_schema = StructType(basic_schema.fields + features_list)

    def calculate_zero_cross_rate(series):
        """
        How often the signal changes sign (+/-)
        """
        series_mean = np.mean(series)
        series = [v - series_mean for v in series]
        zero_cross_count = (np.diff(np.sign(series)) != 0).sum()
        return zero_cross_count / len(series)

    def get_sqr(series):
        sqr = np.mean([v * v for v in series])
        return sqr

    @pandas_udf(features_schema, PandasUDFType.GROUPED_MAP)
    def get_stats_features_udf(df):
        results = []
        timestamp = df['timestamp'].iloc[0]
        localtime = df['localtime'].iloc[0]
        user = df['user'].iloc[0]
        version = df['version'].iloc[0]
        start_time = timestamp
        end_time = df['timestamp'].iloc[-1]

        df.drop(exclude_col_names, axis=1, inplace=True)

        if "mean" in feature_names:
            df_mean = df.mean()
            df_mean.index += '_mean'
            results.append(df_mean)

        if "median" in feature_names:
            df_median = df.median()
            df_median.index += '_median'
            results.append(df_median)

        if "stddev" in feature_names:
            df_stddev = df.std()
            df_stddev.index += '_stddev'
            results.append(df_stddev)

        if "variance" in feature_names:
            df_var = df.var()
            df_var.index += '_variance'
            results.append(df_var)

        if "max" in feature_names:
            df_max = df.max()
            df_max.index += '_max'
            results.append(df_max)

        if "min" in feature_names:
            df_min = df.min()
            df_min.index += '_min'
            results.append(df_min)

        if "skew" in feature_names:
            df_skew = df.skew()
            df_skew.index += '_skew'
            results.append(df_skew)

        if "kurt" in feature_names:
            df_kurt = df.kurt()
            df_kurt.index += '_kurt'
            results.append(df_kurt)

        if "sqr" in feature_names:
            df_sqr = df.apply(get_sqr)
            df_sqr.index += '_sqr'
            results.append(df_sqr)

        output = pd.DataFrame(pd.concat(results)).T

        basic_df = pd.DataFrame(
            [[timestamp, localtime, user,
              int(version), start_time, end_time]],
            columns=[
                'timestamp', 'localtime', 'user', 'version', 'start_time',
                'end_time'
            ])
        return basic_df.assign(**output)

    # check if datastream object contains grouped type of DataFrame
    if not isinstance(ds._data, GroupedData):
        raise Exception(
            "DataStream object is not grouped data type. Please use 'window' operation on datastream object before running this algorithm"
        )

    data = ds._data.apply(get_stats_features_udf)
    return DataStream(data=data, metadata=Metadata())
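
# Self-contained pandas sketch of what the grouped UDF above computes for a single
# window: column-wise statistics collapsed into one wide row (toy data, no Spark;
# column names are made up).
import numpy as np
import pandas as pd

window_df = pd.DataFrame({"accel_x": np.random.randn(32),
                          "accel_y": np.random.randn(32)})

parts = []
for name, series in [("mean", window_df.mean()), ("stddev", window_df.std()),
                     ("sqr", window_df.apply(lambda s: np.mean(s * s)))]:
    series.index += "_" + name
    parts.append(series)

print(pd.DataFrame(pd.concat(parts)).T)
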
# Example 20
def compute_peak_valley(
        rip: DataStream,
        rip_quality: DataStream,
        fs: float = 21.33,
        smoothing_factor: int = 5,
        time_window: int = 8,
        expiration_amplitude_threshold_perc: float = 0.10,
        threshold_expiration_duration: float = 0.312,
        inspiration_amplitude_threshold_perc: float = 0.10,
        max_amplitude_change_peak_correction: float = 30,
        min_neg_slope_count_peak_correction: int = 4,
        minimum_peak_to_valley_time_diff=0.31) -> [DataStream, DataStream]:
    """
    Compute peak and valley from rip data and filter peak and valley.

    :param rip:
    :param rip_quality:
    :param fs:
    :param smoothing_factor:
    :param time_window:
    :param expiration_amplitude_threshold_perc:
    :param threshold_expiration_duration:
    :param inspiration_amplitude_threshold_perc:
    :param max_amplitude_change_peak_correction:
    :param min_neg_slope_count_peak_correction:
    :param minimum_peak_to_valley_time_diff:
    :return peak_datastream, valley_datastream:
    """

    rip_filtered = filter_bad_rip(rip=rip, rip_quality=rip_quality)

    data_smooth = smooth(data=rip_filtered.data, span=smoothing_factor)
    window_length = int(round(time_window * fs))
    data_mac = moving_average_curve(data_smooth, window_length=window_length)

    data_smooth_start_time_to_index = {}
    for index, data in enumerate(data_smooth):
        data_smooth_start_time_to_index[data.start_time] = index

    up_intercepts, down_intercepts = up_down_intercepts(
        data=data_smooth,
        mac=data_mac,
        data_start_time_to_index=data_smooth_start_time_to_index)

    up_intercepts_filtered, down_intercepts_filtered = filter_intercept_outlier(
        up_intercepts=up_intercepts, down_intercepts=down_intercepts)

    peaks, valleys = generate_peak_valley(
        up_intercepts=up_intercepts_filtered,
        down_intercepts=down_intercepts_filtered,
        data=data_smooth)

    valleys_corrected = correct_valley_position(
        peaks=peaks,
        valleys=valleys,
        up_intercepts=up_intercepts_filtered,
        data=data_smooth,
        data_start_time_to_index=data_smooth_start_time_to_index)

    peaks_corrected = correct_peak_position(
        peaks=peaks,
        valleys=valleys_corrected,
        up_intercepts=up_intercepts_filtered,
        data=data_smooth,
        max_amplitude_change_peak_correction=
        max_amplitude_change_peak_correction,
        min_neg_slope_count_peak_correction=min_neg_slope_count_peak_correction,
        data_start_time_to_index=data_smooth_start_time_to_index)

    # remove too close valley peak pair.
    peaks_filtered_close, valleys_filtered_close = remove_close_valley_peak_pair(
        peaks=peaks_corrected,
        valleys=valleys_corrected,
        minimum_peak_to_valley_time_diff=minimum_peak_to_valley_time_diff)

    # Remove small  Expiration duration < 0.31
    peaks_filtered_exp_dur, valleys_filtered_exp_dur = filter_expiration_duration_outlier(
        peaks=peaks_filtered_close,
        valleys=valleys_filtered_close,
        threshold_expiration_duration=threshold_expiration_duration)

    # filter out peak valley pair of inspiration of small amplitude.
    peaks_filtered_insp_amp, valleys_filtered_insp_amp = filter_small_amp_inspiration_peak_valley(
        peaks=peaks_filtered_exp_dur,
        valleys=valleys_filtered_exp_dur,
        inspiration_amplitude_threshold_perc=
        inspiration_amplitude_threshold_perc)

    # filter out peak valley pair of expiration of small amplitude.
    peaks_filtered_exp_amp, valleys_filtered_exp_amp = filter_small_amp_expiration_peak_valley(
        peaks=peaks_filtered_insp_amp,
        valleys=valleys_filtered_insp_amp,
        expiration_amplitude_threshold_perc=expiration_amplitude_threshold_perc
    )

    peak_datastream = DataStream.from_datastream([rip])
    peak_datastream.data = peaks_filtered_exp_amp
    valley_datastream = DataStream.from_datastream([rip])
    valley_datastream.data = valleys_filtered_exp_amp

    return peak_datastream, valley_datastream
def timestamp_correct(datastream: DataStream,
                      sampling_frequency: float,
                      min_available_gaps: int = 3600,  # TODO: Does this matter anymore?
                      min_split_gap: datetime.timedelta = datetime.timedelta(seconds=30),
                      max_data_points_per_segment: int = 100000000) -> DataStream:
    result = DataStream.from_datastream([datastream])
    result.data = []

    if len(datastream.data) == 0:
        return result

    data = datastream.data
    time_deltas = np.diff([dp.start_time for dp in data])

    gap_points = [data[0]]
    for index, value in enumerate(time_deltas):
        if value > min_split_gap:
            gap_points.append(data[index])
    gap_points.append(data[-1])

    segments = []
    segment_data = []
    gap_index = 0
    low_time = gap_points[gap_index].start_time
    high_time = gap_points[gap_index + 1].start_time
    for dp in data:
        if len(segment_data) >= max_data_points_per_segment:
            segments.append(interpolate_gaps(segment_data, sampling_frequency))
            segment_data = []

        if low_time <= dp.start_time <= high_time:
            segment_data.append(dp)
        else:
            segments.append(interpolate_gaps(segment_data, sampling_frequency))
            gap_index += 1
            low_time = gap_points[gap_index].start_time
            high_time = gap_points[gap_index + 1].start_time
            segment_data = []

    segments.append(interpolate_gaps(segment_data, sampling_frequency))

    for s in segments:
        begin_time = s[0].start_time.timestamp()
        end_time = s[-1].start_time.timestamp()

        x = np.array([i for i in frange(begin_time, end_time, 1.0 / sampling_frequency)], dtype='float')
        y = np.array([dp.start_time.timestamp() for dp in s], dtype='float')

        distance, path = fastdtw(x, y, radius=1)

        xx = [0 for i in y]
        for si, ei in path:
            xx[ei] = x[si]

        dtw_corrected_data = []
        for index, dp in enumerate(s):
            ts = datetime.datetime.fromtimestamp(xx[index], tz=dp.start_time.tzinfo)
            dtw_corrected_data.append(DataPoint.from_tuple(ts, dp.sample))

        result.data.extend(dtw_corrected_data)

    return result
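
# Small stand-alone illustration of the DTW step above: snap jittered timestamps
# onto an ideal, evenly spaced grid with fastdtw (requires the fastdtw package;
# the sampling rate and jitter values here are made up).
import numpy as np
from fastdtw import fastdtw

fs = 10.0
ideal = np.arange(0, 2, 1.0 / fs)                          # ideal sample times
jittered = ideal + np.random.uniform(-0.02, 0.02, ideal.size)

distance, path = fastdtw(ideal, jittered, radius=1)

corrected = np.zeros_like(jittered)
for si, ei in path:
    corrected[ei] = ideal[si]                              # same mapping as above
print(np.max(np.abs(corrected - ideal)))
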
# Example 22
def process_feature(file_path, metadata_path):
    f = open_data_file(file_path)
    mf = open(metadata_path)

    if f is None: return

    reader = csv.reader(f)
    count = 0
    feature_data = {}
    start_column_number = 3

    for row in reader:
        if count == 0:
            header_row = row
            count += 1
            continue

        # handling corrupt data, some user id's are NA
        if row[0] not in user_id_mappings: continue

        user_id = user_id_mappings[row[0]]

        qualtrics_start_time = datetime.strptime(row[3], '%m/%d/%Y %H:%M')
        qualtrics_end_time = datetime.strptime(row[4], '%m/%d/%Y %H:%M')

        if len(user_id) == 4 and int(
                user_id[0]) == 5:  # all 5xxx users are in central time
            qualtrics_start_time = centraltz.localize(qualtrics_start_time)
            qualtrics_end_time = centraltz.localize(qualtrics_end_time)
        elif len(user_id) == 4 and int(
                user_id[0]) == 1:  # all 1xxx users are east
            qualtrics_start_time = easterntz.localize(qualtrics_start_time)
            qualtrics_end_time = easterntz.localize(qualtrics_end_time)
        elif len(user_id) == 4 and int(
                user_id[0]) == 9:  # all 9xxx users are west
            qualtrics_start_time = pacifictz.localize(qualtrics_start_time)
            qualtrics_end_time = pacifictz.localize(qualtrics_end_time)
        else:
            qualtrics_start_time = centraltz.localize(qualtrics_start_time)
            qualtrics_end_time = centraltz.localize(qualtrics_end_time)

        utc_offset = qualtrics_start_time.utcoffset().total_seconds() * 1000
        # -1000 - DataPoint expects offset to be in milliseconds and negative is
        # to account for being west of UTC

        sample = row[6:]
        val = sample[0]
        #print('X'*20,val, len(val.strip()))
        if 'yes' in val or 'no' in val:  # Check for Daily.tob.d.mitre.csv
            value = float('Nan')
        elif 'NA' in val:
            value = float('Nan')
        elif not len(val.strip()):
            value = float('Nan')
        else:
            value = float(val)

        q_dp = DataPoint(start_time=qualtrics_start_time,
                         end_time=qualtrics_end_time,
                         offset=utc_offset,
                         sample=value)

        if user_id not in feature_data:
            feature_data[user_id] = []

        feature_data[user_id].append(q_dp)

    metadata = mf.read()
    metadata = json.loads(metadata)
    metadata_name = metadata['name']

    for user in feature_data:
        output_stream_id = str(
            uuid.uuid3(uuid.NAMESPACE_DNS,
                       str(metadata_name + user + file_path)))
        q_dps = feature_data[user]

        q_ds = DataStream(identifier=output_stream_id,
                          owner=user,
                          name=metadata_name,
                          data_descriptor=metadata['data_descriptor'],
                          execution_context=metadata['execution_context'],
                          annotations=metadata['annotations'],
                          stream_type=1,
                          data=q_dps)

        try:
            CC.save_stream(q_ds, localtime=True)
        except Exception as e:
            print(e)

    f.close()
    mf.close()
# Example 23
    def analyze_user(self, userid, alldays, config_path):
        print(userid, alldays)
        self.CC = CerebralCortex(config_path)
        self.window_size = 3600
        metadata = """
        {
          "annotations":[],
          "data_descriptor":[
            {
              "name":"total_datapoints",
              "type":"int",
              "description":"Total number of data points that are present in the input stream followed by an array of the corrupt datapoints",
              "stream_type": "sparse"
            }
          ],
          "execution_context":{
            "processing_module":{
              "name":"core.admission_control_marker.phone_stream_analyzer",
              "input_streams":[
                {
                  "name":"name",
                  "identifier" : "id"
                }
              ]
            },
            "algorithm":{
              "method":"core.admission_control_marker",
              "authors":[
                {
                  "name":"Anand",
                  "email":"*****@*****.**"
                }
              ],
              "version":"0.0.4",
              "description":"Analyzer for the phone input streams"
            }
          },
          "name":"NAME_dynamically_generated"
        }
        """

        date_format = '%Y%m%d'
        for day in alldays:
            for phone_stream in phone_input_streams:
                current_date = datetime.strptime(day, date_format)
                day_data = self.get_day_data(userid, day, phone_stream)
                data_quality_analysis = []

                if len(day_data):
                    corrupt_data = self.get_corrupt_data(
                        day_data, phone_input_streams[phone_stream])

                    utc_offset = day_data[0].start_time.utcoffset().total_seconds() * 1000
                    dp = DataPoint(start_time=current_date,
                                   end_time=current_date + timedelta(days=1),
                                   offset=utc_offset,
                                   sample=[len(day_data), corrupt_data])
                    data_quality_analysis.append(dp)

                else:
                    next_day = current_date + timedelta(days=1)
                    utc_offset = 0
                    dp = DataPoint(start_time=current_date,
                                   end_time=next_day,
                                   offset=utc_offset,
                                   sample=[0, []])
                    data_quality_analysis.append(dp)

                metadata_json = json.loads(metadata)
                metadata_name = phone_stream + '_corrupt_data'
                output_stream_id = str(
                    uuid.uuid3(uuid.NAMESPACE_DNS,
                               str(metadata_name + userid + str(metadata))))
                input_streams = []
                input_stream_ids = self.CC.get_stream_id(userid, phone_stream)
                for inpstrm in input_stream_ids:
                    stream_info = {}
                    stream_info['name'] = phone_stream
                    stream_info['identifier'] = inpstrm['identifier']
                    input_streams.append(stream_info)

                metadata_json["execution_context"]["processing_module"][
                    "input_streams"] = input_streams

                quality_ds = DataStream(
                    identifier=output_stream_id,
                    owner=userid,
                    name=metadata_name,
                    data_descriptor=metadata_json['data_descriptor'],
                    execution_context=metadata_json['execution_context'],
                    annotations=metadata_json['annotations'],
                    stream_type=1,
                    data=data_quality_analysis)
                try:
                    self.CC.save_stream(quality_ds)
                except Exception as e:
                    print(e)
def process_feature(file_path, metadata_path):
    f = open_data_file(file_path)
    mf = open(metadata_path)

    if f is None: return

    reader = csv.reader(f)
    count = 0
    feature_data = {}
    start_column_number = 3

    for row in reader:
        if count == 0:
            header_row = row
            count += 1
            continue

        # handling corrupt data, some user id's are NA
        if row[0] not in user_id_mappings: continue

        user_id = user_id_mappings[row[0]]
        start_time = datetime.strptime(row[1], '%m/%d/%Y %H:%M')
        start_time = centraltz.localize(start_time)

        # handling the different format of the IGTB file
        if 'IGTB' not in file_path:
            end_time = datetime.strptime(row[2], '%m/%d/%Y %H:%M')
        else:
            end_time = datetime(year=start_time.year,
                                month=start_time.month,
                                day=start_time.day,
                                hour=start_time.hour,
                                minute=start_time.minute)
            start_column_number = 2

        if 'IGTB' not in file_path:
            end_time = centraltz.localize(end_time)

        utc_offset = start_time.utcoffset().total_seconds() * 1000
        # -1000 - DataPoint expects offset to be in milliseconds and negative is
        # to account for being west of UTC

        sample = row[5:]
        values = []
        for val in sample:
            if 'yes' in val or 'no' in val:  # Check for Daily.tob.d.mitre.csv
                continue
            if 'NA' in val:
                values.append(float('Nan'))
            else:
                values.append(float(val))

        dp = DataPoint(start_time=start_time,
                       end_time=end_time,
                       offset=utc_offset,
                       sample=values)

        if user_id not in feature_data:
            feature_data[user_id] = []

        feature_data[user_id].append(dp)

    metadata = mf.read()
    metadata = json.loads(metadata)
    metadata_name = metadata['name']

    for user in feature_data:
        output_stream_id = str(
            uuid.uuid3(uuid.NAMESPACE_DNS,
                       str(metadata_name + user + file_path)))
        ds = DataStream(identifier=output_stream_id,
                        owner=user,
                        name=metadata_name,
                        data_descriptor=metadata['data_descriptor'],
                        execution_context=metadata['execution_context'],
                        annotations=metadata['annotations'],
                        stream_type=1,
                        data=feature_data[user])
        #print(str(user),str(output_stream_id),len(feature_data[user]))
        try:
            CC.save_stream(ds, localtime=True)
        except Exception as e:
            print(e)
    f.close()
    mf.close()
# Example 25
def detect_rpeak(ecg: DataStream,
                 fs: float = 64,
                 threshold: float = 0.5,
                 blackman_win_len_range: float = 0.2) -> DataStream:
    """
    This function implements the Pan-Tompkins algorithm on an ECG signal to detect the R peaks.

    Since the ECG array can have discontinuities in its timestamps, the RR interval
    is computed in terms of indices into the sample array.

    The algorithm consists of some major steps

    1. computation of the moving window integration of the signal in terms of blackman window of a prescribed length
    2. compute all the peaks of the moving window integration signal
    3. adaptive thresholding with dynamic signal and noise thresholds applied to filter out the R peak locations
    4. confirm the R peaks through differentiation from the nearby peaks and remove the false peaks

    :param ecg: ecg array of tuples (timestamp,value)
    :param fs: sampling frequency
    :param threshold: initial threshold to detect the R peak in a signal normalized by the 90th percentile. .5 is default.
    :param blackman_win_len_range : the range to calculate blackman window length

    :return: R peak array of tuples (timestamp, Rpeak interval)
    """

    data = ecg.data
    result = DataStream.from_datastream([ecg])
    if len(data) == 0:
        result.data = []
        return result
    sample = np.array([i.sample for i in data])
    timestamp = np.array([i.start_time for i in data])

    # computes the moving window integration of the signal
    blackman_win_len = np.ceil(fs * blackman_win_len_range)
    y = compute_moving_window_int(sample, fs, blackman_win_len)

    peak_location_values = [(i, y[i]) for i in range(2,
                                                     len(y) - 1)
                            if check_peak(y[i - 2:i + 3])]

    # initial RR interval average
    peak_location = [i[0] for i in peak_location_values]
    running_rr_avg = sum(np.diff(peak_location)) / (len(peak_location) - 1)

    rpeak_temp1 = compute_r_peaks(threshold, running_rr_avg, y,
                                  peak_location_values)
    rpeak_temp2 = remove_close_peaks(rpeak_temp1, sample, fs)
    index = confirm_peaks(rpeak_temp2, sample, fs)

    rpeak_timestamp = timestamp[index]
    rpeak_value = np.diff(rpeak_timestamp)
    rpeak_timestamp = rpeak_timestamp[1:]

    result_data = []
    for k in range(len(rpeak_value)):
        result_data.append(
            DataPoint.from_tuple(
                rpeak_timestamp[k],
                rpeak_value[k].seconds + rpeak_value[k].microseconds / 1e6))

    # Create resulting datastream to be returned

    result.data = result_data

    return result
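
# Stand-alone sketch of step 1 above (moving-window integration with a Blackman
# window) on a synthetic signal; `compute_moving_window_int` itself is not shown
# in this snippet, so this is only an assumed illustration of the idea.
import numpy as np

fs = 64.0
blackman_win_len = int(np.ceil(fs * 0.2))
t = np.arange(0, 2, 1 / fs)
sample = np.sin(2 * np.pi * 1.2 * t) + 0.1 * np.random.randn(t.size)

window = np.blackman(blackman_win_len)
window = window / np.sum(window)                 # unit-area window
integrated = np.convolve(np.abs(sample), window, mode="same")
print(integrated[:5])
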
def process_feature(file_path, metadata_path):
    f = open_data_file(file_path)
    mf = open(metadata_path)

    if f is None: return

    reader = csv.reader(f)
    count = 0
    feature_data = {}
    start_column_number = 3

    for row in reader:
        if count == 0:
            header_row = row
            count += 1
            continue

        # handling corrupt data, some user id's are NA
        if row[0] not in user_id_mappings: continue

        user_id = user_id_mappings[row[0]]

        ems_start_time_str = row[1] + ' 12:00:00'
        ems_start_time = datetime.strptime(ems_start_time_str,
                                           '%Y%m%d %H:%M:%S')
        qualtrics_start_time = datetime.strptime(row[3], '%m/%d/%Y %H:%M')

        if len(user_id) == 4 and int(user_id[0]) == 5:
            # all 5xxx users are in the Central time zone
            ems_start_time = centraltz.localize(ems_start_time)
            qualtrics_start_time = centraltz.localize(qualtrics_start_time)
        elif len(user_id) == 4 and int(user_id[0]) == 1:
            # all 1xxx users are in the Eastern time zone
            ems_start_time = easterntz.localize(ems_start_time)
            qualtrics_start_time = easterntz.localize(qualtrics_start_time)
        elif len(user_id) == 4 and int(user_id[0]) == 9:
            # all 9xxx users are in the Pacific time zone
            ems_start_time = pacifictz.localize(ems_start_time)
            qualtrics_start_time = pacifictz.localize(qualtrics_start_time)
        else:
            # default to Central for any other user id pattern
            ems_start_time = centraltz.localize(ems_start_time)
            qualtrics_start_time = centraltz.localize(qualtrics_start_time)

        # handle the different format of the IGTB file
        if 'IGTB' not in file_path:
            end_time = datetime.strptime(row[4], '%m/%d/%Y %H:%M')
        else:
            # IGTB rows carry no separate end-time column; the ems start time
            # is reused as the end time here (assumption)
            end_time = datetime(year=ems_start_time.year,
                                month=ems_start_time.month,
                                day=ems_start_time.day,
                                hour=ems_start_time.hour,
                                minute=ems_start_time.minute)
            start_column_number = 2

        if 'IGTB' not in file_path:
            end_time = centraltz.localize(end_time)

        utc_offset = ems_start_time.utcoffset().total_seconds() * 1000
        # DataPoint expects the offset in milliseconds; utcoffset() is already
        # negative for timezones west of UTC, so no extra sign flip is needed

        sample = row[6:]
        values = []
        for val in sample:
            if 'yes' in val or 'no' in val:  # skip yes/no answers (e.g. Daily.tob.d.mitre.csv)
                continue
            if 'NA' in val:
                values.append(float('nan'))
            else:
                values.append(float(val))

        ems_dp = DataPoint(start_time=ems_start_time,
                           end_time=end_time,
                           offset=utc_offset,
                           sample=values)
        q_dp = DataPoint(start_time=qualtrics_start_time,
                         end_time=end_time,
                         offset=utc_offset,
                         sample=values)

        if user_id not in feature_data:
            feature_data[user_id] = []

        feature_data[user_id].append((q_dp, ems_dp))

    metadata = mf.read()
    metadata = json.loads(metadata)
    metadata_name = metadata['name']

    for user in feature_data:
        output_stream_id = str(
            uuid.uuid3(uuid.NAMESPACE_DNS,
                       str(metadata_name + user + file_path)))
        q_dps = [dp[0] for dp in feature_data[user]]

        q_ds = DataStream(identifier=output_stream_id,
                          owner=user,
                          name=metadata_name,
                          data_descriptor=metadata['data_descriptor'],
                          execution_context=metadata['execution_context'],
                          annotations=metadata['annotations'],
                          stream_type=1,
                          data=q_dps)

        ems_stream_name = metadata_name.replace('data_qualtrics',
                                                'data_qualtrics_ems')
        output_stream_id = str(
            uuid.uuid3(uuid.NAMESPACE_DNS,
                       str(ems_stream_name + user + file_path)))
        ems_dps = [dp[1] for dp in feature_data[user]]
        ems_ds = DataStream(identifier=output_stream_id,
                            owner=user,
                            name=ems_stream_name,
                            data_descriptor=metadata['data_descriptor'],
                            execution_context=metadata['execution_context'],
                            annotations=metadata['annotations'],
                            stream_type=1,
                            data=ems_dps)
        try:
            CC.save_stream(q_ds, localtime=True)
        except Exception as e:
            print(e)
        try:
            CC.save_stream(ems_ds, localtime=True)
        except Exception as e:
            print(e)
    f.close()
    mf.close()
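
# A minimal sketch of the stream-id scheme used above: uuid3 is a name-based
# UUID, so the same (stream name, user, source file) triple always maps to the
# same identifier and re-running the ingestion writes to the same stream.
# The sample values below are placeholders, not real streams or users.
import uuid


def make_stream_id(stream_name, user, source):
    return str(uuid.uuid3(uuid.NAMESPACE_DNS, stream_name + user + source))


assert make_stream_id('example_stream', '1001', 'example.csv') == \
       make_stream_id('example_stream', '1001', 'example.csv')
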
def store(data: OrderedDict, input_streams: dict, output_streams: dict,
          CC_obj: CerebralCortex):
    """
    Store diagnostic results with its metadata in the data-store
    :param input_streams:
    :param data:
    :param CC_obj:
    :param config:
    :param algo_type:
    """
    if data:
        #basic output stream info
        owner = input_streams[0]["owner_id"]
        dd_stream_id = output_streams["id"]
        dd_stream_name = output_streams["name"]
        stream_type = "ds"

        data_descriptor = [{
            "NAME": "Data Quality (LED)",
            "DATA_TYPE": "int",
            "FREQUENCY": "0.33",
            "MAX_VALUE": "4",
            "MIN_VALUE": "0",
            "DESCRIPTION": "measures the Data Quality of LED. Values= GOOD(0), BAND_OFF(1), NOT_WORN(2), BAND_LOOSE(3), NOISE(4)"
        }]
        execution_context = {
            "platform_metadata": {
                "NAME": "MotionSense HRV",
                "DEVICE_ID": ""
            },
            "processing_module": {
                "name": "",
                "environment": "cerebralcortex",
                "algorithm": [{
                    "method": "",
                    "authors": ["Nasir Ali", " Md Azim Ullah"],
                    "version": "0.0.1",
                    "reference": {
                        "url": "http://md2k.org/"
                    },
                    "description": ""
                }],
                "description": "",
                "input_streams": input_streams,
                "output_streams": output_streams,
                "input_parameters": {}
            },
            "datasource_metadata": {
                "NAME": "Data Quality (LED)",
                "DATA_TYPE": "org.md2k.datakitapi.datatype.DataTypeInt",
                "FREQUENCY": "0.33",
                "DESCRIPTION": "measures the Data Quality of LED. Values= GOOD(0), BAND_OFF(1), NOT_WORN(2), BAND_LOOSE(3), NOISE(4)"
            },
            "application_metadata": {
                "NAME": "MotionSense",
                "DESCRIPTION": "Collects data from the motion sense. Sensors supported: [Accelerometer, Gyroscope, Battery, LED, DataQuality]",
                "VERSION_NAME": "0.0.1",
                "VERSION_NUMBER": "2000500"
            }
        }
        annotations = []

        ds = DataStream(identifier=dd_stream_id,
                        owner=owner,
                        name=dd_stream_name,
                        data_descriptor=data_descriptor,
                        execution_context=execution_context,
                        annotations=annotations,
                        stream_type=stream_type,
                        data=data)

        CC_obj.save_datastream(ds, "datastream")
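
# A hedged usage sketch for store(); the argument shapes are inferred from how
# the function indexes them (input_streams is a list of dicts carrying
# "owner_id", output_streams is a dict carrying "id" and "name"). All ids and
# names below are placeholders, and the actual call is left commented out
# because it needs a live CerebralCortex instance.
from collections import OrderedDict

example_data = OrderedDict()  # diagnostic DataPoints produced by the analysis
example_input_streams = [{"owner_id": "00000000-0000-0000-0000-000000000000",
                          "id": "11111111-1111-1111-1111-111111111111",
                          "name": "EXAMPLE--INPUT--STREAM"}]
example_output_streams = {"id": "22222222-2222-2222-2222-222222222222",
                          "name": "EXAMPLE--OUTPUT--STREAM"}
# store(example_data, example_input_streams, example_output_streams, CC_obj)
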
Example #28
def compute_zero_cross_rate(ds, exclude_col_names: list = None,
                            feature_names=['zero_cross_rate']):
    """
    Compute the zero-crossing rate of each remaining column of a windowed datastream.

    Apply the 'window' operation on the datastream before calling this function;
    it expects grouped (windowed) data.

    Args:
        ds (DataStream): windowed/grouped datastream
        exclude_col_names list(str): names of the columns on which the feature should not be computed
        feature_names list(str): names of the features to compute; only 'zero_cross_rate' is supported by this function

    Returns:
        DataStream object with one '<column>_zero_cross_rate' column per remaining input column
    """
    # copy the argument so a caller-supplied (or default) list is never mutated
    exclude_col_names = list(exclude_col_names) if exclude_col_names else []
    exclude_col_names.extend(["timestamp", "localtime", "user", "version"])

    data = ds._data.drop(*exclude_col_names)

    df_column_names = data.columns

    basic_schema = StructType([
        StructField("timestamp", TimestampType()),
        StructField("localtime", TimestampType()),
        StructField("user", StringType()),
        StructField("version", IntegerType()),
        StructField("start_time", TimestampType()),
        StructField("end_time", TimestampType())
    ])

    features_list = []
    for cn in df_column_names:
        for sf in feature_names:
            features_list.append(StructField(cn + "_" + sf, FloatType(), True))

    features_schema = StructType(basic_schema.fields + features_list)

    def calculate_zero_cross_rate(series):
        """
        How often the signal changes sign (+/-)
        """
        series_mean = np.mean(series)
        series = [v - series_mean for v in series]
        zero_cross_count = (np.diff(np.sign(series)) != 0).sum()
        return zero_cross_count / len(series)

    @pandas_udf(features_schema, PandasUDFType.GROUPED_MAP)
    def get_features_udf(df):
        results = []
        timestamp = df['timestamp'].iloc[0]
        localtime = df['localtime'].iloc[0]
        user = df['user'].iloc[0]
        version = df['version'].iloc[0]
        start_time = timestamp
        end_time = df['timestamp'].iloc[-1]

        df.drop(exclude_col_names, axis=1, inplace=True)
        if "zero_cross_rate" in feature_names:
            df_zero_cross_rate = df.apply(calculate_zero_cross_rate)
            df_zero_cross_rate.index += '_zero_cross_rate'
            results.append(df_zero_cross_rate)

        output = pd.DataFrame(pd.concat(results)).T

        basic_df = pd.DataFrame([[timestamp, localtime, user, int(version), start_time, end_time]],
                                columns=['timestamp', 'localtime', 'user', 'version', 'start_time', 'end_time'])
        return basic_df.assign(**output)

    # check if datastream object contains grouped type of DataFrame
    if not isinstance(ds._data, GroupedData):
        raise Exception(
            "DataStream object is not grouped data type. Please use 'window' operation on datastream object before running this algorithm")

    data = ds._data.apply(get_features_udf)
    return DataStream(data=data, metadata=Metadata())
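
# A self-contained check of the zero-crossing-rate definition used in
# calculate_zero_cross_rate above: the series is mean-centred first, so a
# constant offset does not hide sign changes around the mean.
import numpy as np


def zero_cross_rate_sketch(series):
    series = np.asarray(series, dtype=float)
    centred = series - series.mean()
    return (np.diff(np.sign(centred)) != 0).sum() / len(series)


print(zero_cross_rate_sketch([5.0, 6.0, 5.0, 6.0, 5.0]))  # 0.8: four crossings over five samples
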
    def process(self):
        user_ids = self.filter_user_ids()
        # get all locations lats/longs
        all_locations = self.sqlData.get_latitude_llongitude()
        with open("weather_data.json", "r") as wd:
            metadata = wd.read()
        metadata = json.loads(metadata)
        input_stream_name = 'LOCATION--org.md2k.phonesensor--PHONE'
        for uid in user_ids:
            stream_ids = self.CC.get_stream_id(uid, input_stream_name)

            # START TEST CODE
            # location_id = self.get_location_id((37.439168,-122.086283), all_locations)
            # day = datetime.strptime("20171221", "%Y%m%d").strftime("%Y-%m-%d")
            # weather_data = self.sqlData.get_weather_data_by_city_id(location_id, day)
            # dps = []
            #
            # for wd in weather_data:
            #     dp_sample = []
            #     wd["temperature"] = json.loads(wd["temperature"])
            #     wd["wind"] = json.loads(wd["wind"])
            #
            #     dp_sample["sunrise"] = wd["sunrise"]
            #     dp_sample["sunset"] = wd["sunset"]
            #     dp_sample["wind_deg"] = wd.get("wind").get("deg","")
            #     dp_sample["wind_speed"] = wd.get("wind").get("speed","")
            #     dp_sample["current_temp"] = wd["temperature"]["temp"]
            #     dp_sample["max_temp"] = wd["temperature"]["temp_max"]
            #     dp_sample["min_temp"] = wd["temperature"]["temp_min"]
            #     dp_sample["humidity"] = int(wd["humidity"])
            #     dp_sample["clouds"] = int(wd["clouds"])
            #     dp_sample["other"] = wd["other"]
            #     dp_sample = [wd["sunrise"],wd["sunset"],wd.get("wind").get("deg",""),wd.get("wind").get("speed",""),wd["temperature"]["temp"],wd["temperature"]["temp_max"],wd["temperature"]["temp_min"],int(wd["humidity"]),int(wd["clouds"]),wd["other"]]
            #     dps.append(DataPoint(wd["start_time"], None, None, dp_sample))
            # END TEST CODE
            if len(stream_ids) > 0:
                print("Processing:", uid)
                for sid in stream_ids:
                    sid = sid["identifier"]
                    days = self.CC.get_stream_days(sid)
                    for day in days:
                        print("User ID, Stream ID, Day", uid, sid, day)
                        output_stream_id = ""
                        # get gps data from stream-name 'LOCATION--org.md2k.phonesensor--PHONE'
                        location_stream = self.CC.get_stream(stream_id=sid,
                                                             day=day)

                        if len(location_stream.data) > 0:
                            # compute median on lat. and long. vals
                            user_loc = self.compute_lat_long_median(
                                location_stream.data)
                            if user_loc != (0, 0):
                                offset = location_stream.data[0].offset
                                # get weather data for match lat/long values
                                location_id = self.get_location_id(
                                    user_loc, all_locations)

                                if location_id is not None:
                                    formated_day = datetime.strptime(
                                        day, "%Y%m%d").strftime("%Y-%m-%d")
                                    weather_data = self.sqlData.get_weather_data_by_city_id(
                                        location_id, formated_day)

                                    # convert data into datastream
                                    execution_context = metadata[
                                        "execution_context"]
                                    input_streams_metadata = [{
                                        "id":
                                        sid,
                                        "name":
                                        input_stream_name
                                    }]
                                    metadata["execution_context"]["processing_module"]["input_streams"] \
                                        = input_streams_metadata
                                    dps = []
                                    for wd in weather_data:
                                        dp_sample = []
                                        wd["temperature"] = json.loads(
                                            wd["temperature"])
                                        wd["wind"] = json.loads(wd["wind"])
                                        day_light_duration = (
                                            (wd["sunset"] -
                                             wd["sunrise"]).seconds
                                        ) / 3600  # difference in hours
                                        dp_sample = [
                                            wd["sunrise"], wd["sunset"],
                                            day_light_duration,
                                            wd.get("wind", float('nan')).get(
                                                "deg", float('nan')),
                                            wd.get("wind", float('nan')).get(
                                                "speed", float('nan')),
                                            wd["temperature"]["temp"],
                                            wd["temperature"]["temp_max"],
                                            wd["temperature"]["temp_min"],
                                            int(wd["humidity"]),
                                            int(wd["clouds"]), wd["other"]
                                        ]

                                        dps.append(
                                            DataPoint(wd["start_time"], None,
                                                      offset, dp_sample))
                                    if len(dps) > 0:
                                        # generate UUID for stream
                                        output_stream_id = str(
                                            metadata["data_descriptor"]) + str(
                                                execution_context) + str(
                                                    metadata["annotations"])
                                        output_stream_id += "weather-data-stream"
                                        output_stream_id += "weather-data-stream"
                                        output_stream_id += str(uid)
                                        output_stream_id += str(sid)
                                        # output_stream_id += str(day)
                                        output_stream_id = str(
                                            uuid.uuid3(uuid.NAMESPACE_DNS,
                                                       output_stream_id))
                                        ds = DataStream(
                                            identifier=output_stream_id,
                                            owner=uid,
                                            name=metadata["name"],
                                            data_descriptor=metadata[
                                                "data_descriptor"],
                                            execution_context=execution_context,
                                            annotations=metadata[
                                                "annotations"],
                                            stream_type=metadata["type"],
                                            data=dps)

                                        # store data stream
                                        self.CC.save_stream(ds)
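
# A small illustration of the day-light-duration computation above: the
# timestamps are made up, and .seconds (rather than .total_seconds()) is safe
# here only because sunset and sunrise fall on the same day, so the timedelta
# is always positive and under 24 hours.
from datetime import datetime

example_sunrise = datetime(2017, 12, 21, 7, 14)
example_sunset = datetime(2017, 12, 21, 16, 38)
print((example_sunset - example_sunrise).seconds / 3600)  # ~9.4 hours of daylight
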