def segment_data():
    """
    Split every truncated sensor record into study segments and save them.

    For each study, record and sensor the 'truncate' data is loaded once and
    cut on its 'timeMinutes' column using the reference values
    'timePreStudy' and 'timePostStudy':

    * segment_list[0] keeps the rows up to timePreStudy,
    * segment_list[1] keeps the rows between timePreStudy and timePostStudy,
    * segment_list[2] keeps the rows from timePostStudy onward.

    Entries of segment_list beyond the third are written untrimmed.
    Each result is saved to <study>/segment/<segment>/<record>/<sensor>.csv.
    """
    print("segment data")
    study_list = retrieve_ref('study_list')
    sensor_list = retrieve_ref('sensor_list')
    segment_list = retrieve_ref('segment_list')
    timePreStudy = retrieve_ref('timePreStudy')
    timePostStudy = retrieve_ref('timePostStudy')

    analysis_type = 'truncate'
    # (begin, end) limits for the first three segments; None means "open".
    # zip() stops early, so a short segment_list no longer raises IndexError.
    windows = list(zip(segment_list, [
        (None, timePreStudy),
        (timePreStudy, timePostStudy),
        (timePostStudy, None),
    ]))

    for study in study_list:
        df_meta = retrieve_meta(study)
        for record in list(df_meta['source_path']):
            for sensor in sensor_list:
                # Load once per sensor; df.drop returns a new frame, so every
                # segment below is cut from this untouched original.
                df_full = retrieve_analyzed(study, analysis_type, record, sensor)

                for segment in segment_list:
                    df = df_full
                    for name, (timeBegin, timeEnd) in windows:
                        if segment != name:
                            continue
                        if timeBegin is not None:
                            df = df.drop(df[df['timeMinutes'] < timeBegin].index)
                        if timeEnd is not None:
                            df = df.drop(df[df['timeMinutes'] > timeEnd].index)

                    # makedirs builds the whole chain in one call and does not
                    # fail when the folder already exists.
                    folder = os.path.join(study, 'segment', str(segment), record)
                    os.makedirs(folder, exist_ok=True)

                    path = os.path.join(folder, sensor + ".csv")
                    df.to_csv(path)
                    print('segments file saved: ' + str(path))
def segment_records():
    """
    Split every formatted record into study segments and save them.

    For each study, record, format type and sensor the 'All' segment is
    loaded and cut on its 'timeMinutes' column using the reference values
    'timePreStudy' and 'timePostStudy':

    * segment_list[0] keeps the rows up to timePreStudy,
    * segment_list[1] keeps the rows between timePreStudy and timePostStudy,
    * segment_list[2] keeps the rows from timePostStudy onward.

    Entries of segment_list beyond the third are written untrimmed.
    Each result is saved to
    <study>/formatted/<format>/<record>/<segment>/<sensor>.csv.
    """
    print("begin segmenting records")
    study_list = retrieve_ref('study_list')
    format_types = retrieve_ref('format_types')
    segment_list = retrieve_ref('segment_list')
    sensor_list = retrieve_ref('sensor_list')
    timePreStudy = retrieve_ref('timePreStudy')
    timePostStudy = retrieve_ref('timePostStudy')

    # (begin, end) limits for the first three segments; None means "open".
    windows = list(zip(segment_list, [
        (None, timePreStudy),
        (timePreStudy, timePostStudy),
        (timePostStudy, None),
    ]))

    for study in study_list:
        df_meta = retrieve_meta(study)
        for record in list(df_meta['source_path']):
            for format_type in format_types:
                for sensor in sensor_list:
                    df_full = retrieve_analyzed(study, format_type, record,
                                                'All', sensor)

                    for segment in segment_list:
                        # Always start from the untrimmed frame. Reusing the
                        # frame trimmed for the previous segment would leave
                        # the later segments (almost) empty.
                        df = df_full
                        for name, (timeBegin, timeEnd) in windows:
                            if segment != name:
                                continue
                            if timeBegin is not None:
                                df = df.drop(
                                    df[df['timeMinutes'] < timeBegin].index)
                            if timeEnd is not None:
                                df = df.drop(
                                    df[df['timeMinutes'] > timeEnd].index)

                        # makedirs also creates missing parent folders, which
                        # a single os.mkdir cannot do.
                        path = os.path.join(study, 'formatted', format_type,
                                            record, segment)
                        os.makedirs(path, exist_ok=True)

                        file_path = os.path.join(path, sensor + ".csv")
                        df.to_csv(file_path)
                        print('dataframe saved for segments: ' + str(file_path))
def multiple_record_check():
    """
    Scan each record for a second recording and add it to the study metadata.

    For every study in the 'study_list' reference the metadata is read and
    each record whose 'fullLength' exceeds its 'truncatedLength' by more than
    30 is inspected: the 'source' / 'All' TEMP data after
    truncatedLength + 5 is searched for the first sample that is more than 3
    below the sample 28 positions later. When such a rise is found, one extra
    entry is appended to the metadata lists (same source_path, recordBegin =
    first remaining 'timeUnix' value, recordEnd = min('timeUnix') + 60).

    A new metadata frame holding only 'source_path', 'recordBegin' and
    'recordEnd' is then handed to save_meta for the study.
    """
    print("begin multiple record check")
    study_list = retrieve_ref('study_list')
    # NOTE(review): the next five reference values are read but never used here.
    format_types = retrieve_ref('format_types')
    segment_list = retrieve_ref('segment_list')
    sensor_list = retrieve_ref('sensor_list')
    timePreStudy = retrieve_ref('timePreStudy')
    timePostStudy = retrieve_ref('timePostStudy')
    for study in study_list:
        df_meta = retrieve_meta(study)
        source_path = list(df_meta['source_path'])
        # Copies of the metadata columns; entries for newly found records are
        # appended to these while the original list is being iterated.
        source_path_new = list(df_meta['source_path'])
        timeBegin_list = list(df_meta['recordBegin'])
        timeEnd_list = list(df_meta['recordEnd'])
        for record in source_path:
            # first metadata row that belongs to this record
            i = df_meta[df_meta['source_path'] == record].index.values[0]
            fullLength = float(df_meta.loc[i, 'fullLength'])
            truncatedLength = float(df_meta.loc[i, 'truncatedLength'])
            format_type = 'source'
            segment = 'All'
            sensor = 'TEMP'
            df = retrieve_analyzed(study, format_type, record, segment, sensor)
            # NOTE(review): filled below but never read or saved.
            new_record_list = []
            # Only look further when data continues well past the truncation
            # point (both lengths are compared with 'timeMinutes', so they are
            # presumably minutes — confirm).
            if fullLength > truncatedLength + 30:
                # discard everything up to shortly after the truncation point
                df = df.drop(df[df['timeMinutes'] < truncatedLength + 5].index)
                # Snapshots taken BEFORE the drops further down, so index 0
                # refers to the first sample after truncatedLength + 5.
                timeUnix = list(df['timeUnix'])
                timeMinutes = list(df['timeMinutes'])
                measurements = list(df['measurement'])
                # NOTE: this loop variable reuses the name of the metadata row
                # index `i` above; the row index is not needed after this point.
                for i in range(len(measurements)):
                    # stay clear of the end so that i + 28 is a valid position
                    if i < len(measurements) - 30:
                        # a rise of more than 3 within 28 samples is treated
                        # as the start of a new record
                        if measurements[i] + 3 < measurements[i + 28]:
                            print('new record found')
                            df = df.drop(
                                df[df['timeMinutes'] < timeMinutes[i + 28]].index)
                            time_end = find_record_end_from_temp(df)
                            print('time_end = ' + str(time_end))
                            # NOTE(review): df is trimmed here but not used
                            # again afterwards in this function.
                            df = df.drop(
                                df[df['timeMinutes'] > time_end].index)
                            # second '_'-separated token of the record name
                            wearable_name = record.split('_')
                            wearable_name = wearable_name[1]
                            recordName = str(
                                str(int(timeUnix[0])) + '_' + str(wearable_name))
                            print('recordName = ' + str(recordName))
                            new_record_list.append(recordName)
                            # NOTE(review): the original record name is
                            # appended, not recordName — confirm intended.
                            source_path_new.append(record)
                            timeBegin_list.append(int(timeUnix[0]))
                            print('timeUnix[0:20] = ')
                            print(timeUnix[0:20])
                            timeEnd = min(timeUnix)
                            print('timeEnd = ' + str(timeEnd))
                            # NOTE(review): the end is set to the earliest
                            # timestamp + 60 and time_end is ignored — looks
                            # like a placeholder, confirm.
                            timeEnd = min(timeUnix) + 60
                            print('timeEnd = ' + str(timeEnd))
                            timeEnd_list.append(int(timeEnd))
                            # only the first detected rise per record is used
                            break
        # NOTE(review): the rebuilt metadata holds only these three columns;
        # columns such as 'fullLength' / 'truncatedLength' are not carried
        # over — verify what save_meta does with the rest.
        df_meta_new = pd.DataFrame()
        df_meta_new['source_path'] = source_path_new
        df_meta_new['recordBegin'] = timeBegin_list
        df_meta_new['recordEnd'] = timeEnd_list
        save_meta(study, df_meta_new)
def format_truncate():
    """
    Truncate every source record at its 'truncatedLength' and save the result.

    For each study, record and sensor the 'source' / 'All' data is loaded,
    the rows whose 'timeMinutes' value is greater than the record's
    'truncatedLength' (taken from the study metadata) are dropped, and the
    remaining rows are written to
    <study>/formatted/truncate/<record>/All/<sensor>.csv.
    """
    print("begin format source")
    study_list = retrieve_ref('study_list')
    sensor_list = retrieve_ref('sensor_list')
    format_type = 'source'
    segment = 'All'

    for study in study_list:
        df_meta = retrieve_meta(study)
        for record in list(df_meta['source_path']):
            # first metadata row that belongs to this record
            i = df_meta[df_meta['source_path'] == record].index.values[0]
            truncatedLength = df_meta.loc[i, 'truncatedLength']

            # The output folder only depends on study and record, so it is
            # created once here; makedirs builds all missing levels at once.
            timestamped_path = os.path.join(study, 'formatted', 'truncate',
                                            str(record), str(segment))
            os.makedirs(timestamped_path, exist_ok=True)

            for sensor in sensor_list:
                df = retrieve_analyzed(study, format_type, record, segment,
                                       sensor)
                df = df.drop(df[df['timeMinutes'] > truncatedLength].index)

                timestamped_file = os.path.join(timestamped_path,
                                                sensor + ".csv")
                df.to_csv(timestamped_file)
                print('format truncate file saved = ' + str(timestamped_file))
def statisticSegments():
    """
    Calculate statistics per record, sensor and segment and save them as CSV.

    For every study one table is written to
    <study>/analyzed/statistics/statisticsSegments.csv. It has one row per
    entry of the metadata column 'source_path' and, for every sensor/segment
    pair, the columns

        <sensor>_<segment>_mean
        <sensor>_<segment>_median
        <sensor>_<segment>_pVariance
        <sensor>_<segment>_stdev
        <sensor>_<segment>_quan_10 ... <sensor>_<segment>_quan_90

    A cell stays empty when the statistic cannot be computed (no
    measurements, or fewer than two measurements for the sample standard
    deviation).
    """
    print("begin statistical calculation")
    study_list = retrieve_ref('study_list')
    sensor_list = retrieve_ref('sensor_list')
    segment_list = retrieve_ref('segment_list')
    analysis_type = 'truncate'

    # Column suffix -> function applied to the list of measurements.
    # A dispatch table keeps the column name and the computation together, so
    # a spelling mismatch between the two can no longer skip a statistic.
    statistics_types = {
        'mean': statistics.mean,
        'median': statistics.median,
        'pVariance': statistics.pvariance,
        'stdev': statistics.stdev,
    }
    # percentiles; np.quantile expects fractions, hence the / 100 below
    quan_types = [10, 20, 30, 40, 50, 60, 70, 80, 90]

    for study in study_list:
        df_meta = retrieve_meta(study)
        source_path = list(df_meta['source_path'])

        # Fix the column layout up front so the file has the same schema even
        # when some statistics cannot be computed.
        columns = ['source_path']
        for sensor in sensor_list:
            for segment in segment_list:
                prefix = str(sensor) + '_' + str(segment) + '_'
                columns.extend(prefix + statis for statis in statistics_types)
                columns.extend(prefix + 'quan_' + str(quanNum)
                               for quanNum in quan_types)

        rows = []
        for record in source_path:
            row = dict.fromkeys(columns)  # every statistic defaults to None
            row['source_path'] = record

            for sensor in sensor_list:
                for segment in segment_list:
                    prefix = str(sensor) + '_' + str(segment) + '_'

                    # NOTE(review): `segment` is not passed to
                    # retrieve_analyzed, so every segment of a sensor receives
                    # the statistics of the same 'truncate' data. Confirm how
                    # the per-segment data should be loaded.
                    df = retrieve_analyzed(study, analysis_type, record, sensor)
                    measurement = list(df['measurement'])
                    if not measurement:
                        continue  # nothing to summarise, keep the cells empty

                    for statis, calculate in statistics_types.items():
                        if statis == 'stdev' and len(measurement) < 2:
                            continue  # sample stdev needs two data points
                        row[prefix + statis] = calculate(measurement)

                    for quanNum in quan_types:
                        row[prefix + 'quan_' + str(quanNum)] = np.quantile(
                            measurement, quanNum / 100)

            rows.append(row)

        dfStatistics = pd.DataFrame(rows, columns=columns)

        analyzed_path = os.path.join(study, 'analyzed', 'statistics')
        os.makedirs(analyzed_path, exist_ok=True)
        analyzed_file = os.path.join(analyzed_path, 'statisticsSegments.csv')
        print('analyzed_file = ' + str(analyzed_file))
        dfStatistics.to_csv(analyzed_file)

    print("end statistical calculation")
def statisticsCalculation():
    """
    Calculate whole-record statistics per sensor and save them as CSV.

    For every study one table is written to
    <study>/analyzed/statistics/statistics.csv with one row per entry of the
    metadata column 'source_path' and, for every sensor, the columns
    <sensor>_mean, <sensor>_median, <sensor>_pvariance, <sensor>_stdev and
    <sensor>quan_10 ... <sensor>quan_90, all computed from the 'truncate'
    data. A value is NaN when it cannot be computed (no measurements, or
    fewer than two measurements for the sample standard deviation).

    When both 'EDA_mean' and 'HR_mean' are available, a scatter plot of the
    two is saved to <study>/plot/analyzed/summary.png.
    """
    print("begin statistical calculation")
    study_list = retrieve_ref('study_list')
    sensor_list = retrieve_ref('sensor_list')
    analysis_type = 'truncate'
    quan_types = [10, 20, 30, 40, 50, 60, 70, 80, 90]
    nan = float('nan')

    for study in study_list:
        df_meta = retrieve_meta(study)
        source_path = list(df_meta['source_path'])
        dfStatistics = pd.DataFrame()

        analyzed_path = os.path.join(study, 'analyzed', 'statistics')
        os.makedirs(analyzed_path, exist_ok=True)
        analyzed_file = os.path.join(analyzed_path, 'statistics.csv')

        for sensor in sensor_list:
            name = str(sensor)
            # one list per output column, filled record by record
            columns = {
                name + '_mean': [],
                # the median was computed before but never stored
                name + '_median': [],
                name + '_pvariance': [],
                name + '_stdev': [],
            }
            for quanNum in quan_types:
                # No underscore between sensor and 'quan' — kept as is so that
                # existing readers of the file still find these columns.
                columns[name + 'quan_' + str(quanNum)] = []

            for record in source_path:
                df = retrieve_analyzed(study, analysis_type, record, sensor)
                measurement = list(df['measurement'])
                has_data = len(measurement) > 0

                columns[name + '_mean'].append(
                    statistics.mean(measurement) if has_data else nan)
                columns[name + '_median'].append(
                    statistics.median(measurement) if has_data else nan)
                columns[name + '_pvariance'].append(
                    statistics.pvariance(measurement) if has_data else nan)
                # the sample standard deviation needs two data points
                columns[name + '_stdev'].append(
                    statistics.stdev(measurement)
                    if len(measurement) > 1 else nan)
                for quanNum in quan_types:
                    columns[name + 'quan_' + str(quanNum)].append(
                        np.quantile(measurement, quanNum / 100)
                        if has_data else nan)

            for colName, values in columns.items():
                dfStatistics[colName] = values

            # saved after every sensor so partial results survive a failure
            print('analyzed_file = ' + str(analyzed_file))
            dfStatistics.to_csv(analyzed_file)
            print('statistical analysis for study / sensor complete: '
                  + str(study) + ' / ' + str(sensor))

        # The summary plot needs both sensors; skip it instead of raising a
        # KeyError when one of them is not part of sensor_list.
        if 'EDA_mean' in dfStatistics.columns and 'HR_mean' in dfStatistics.columns:
            # own figure, closed after saving, so plots of different studies
            # do not pile up on one canvas
            fig = plt.figure()
            plt.scatter(dfStatistics['EDA_mean'], dfStatistics['HR_mean'])
            plt.xlabel('EDA mean')
            plt.ylabel('HR mean')

            plot_path = os.path.join(study, 'plot', 'analyzed')
            os.makedirs(plot_path, exist_ok=True)
            plot_file = os.path.join(plot_path, 'summary' + '.png')
            plt.savefig(plot_file, bbox_inches='tight')
            plt.close(fig)
            print('saved statistics - ' + str(plot_file))

    print("end statistical calculation")
def plot_timestamp():
    """
    Plot the TEMP data of every record at each processing stage.

    One figure with four stacked panels is saved per record to
    <study>/plot/timestamp/<record>/TEMP.png so the detected end of the
    record can be checked by eye:

    1. 'source' measurements against the 'count' column,
    2. 'timestamp' measurements against 'timeMinutes',
    3. every analysis type of the 'analysis_list' reference overlaid,
    4. 'truncate' measurements against 'timeMinutes'.
    """
    print("begin plotting timestamped data")
    study_list = retrieve_ref('study_list')
    analysis_list = retrieve_ref('analysis_list')

    sensor = 'TEMP'
    sensor_unit = retrieve_sensor_unit(sensor)

    # (analysis types drawn in the panel, x column, x axis label)
    panels = [
        (['source'], 'count', 'Measurement Count - Before Timestamp'),
        # the x values are minutes, so the axis is labelled accordingly
        (['timestamp'], 'timeMinutes', 'Time (minutes)'),
        (list(analysis_list), 'timeMinutes', 'Time (minutes)'),
        (['truncate'], 'timeMinutes', 'Time (minutes)'),
    ]
    row_num, col_num = len(panels), 1
    row_width_mulp, col_width_mulp = 14, 5
    plot_width, plot_height = col_num * row_width_mulp, row_num * col_width_mulp

    for study in study_list:
        df_meta = retrieve_meta(study)
        for record in list(df_meta['source_path']):
            fig = plt.figure(figsize=(plot_width, plot_height))

            for plot_num, (analysis_types, x_column, x_label) in enumerate(
                    panels, start=1):
                plt.subplot(row_num, col_num, plot_num)
                for analysis_type in analysis_types:
                    df = retrieve_analyzed(study, analysis_type, record, sensor)
                    valueColor = retrieve_ref_color(
                        str('color_' + str(analysis_type)))
                    plt.scatter(df[x_column], df['measurement'],
                                color=valueColor, label=str(analysis_type))
                    # with several types in one panel the last one names it
                    plt.title(analysis_type + ' ' + record + ' ' + sensor)
                plt.xlabel(x_label)
                plt.ylabel(str(sensor) + ' ( ' + str(sensor_unit) + ' )')
                plt.legend(bbox_to_anchor=(1, 0.5, 0.3, 0.2), loc='upper left')

            # save the plot
            plot_path = os.path.join(study, 'plot', 'timestamp', record)
            os.makedirs(plot_path, exist_ok=True)
            plot_file = os.path.join(plot_path, sensor + '.png')
            plt.savefig(plot_file, bbox_inches='tight')
            # pyplot keeps every figure alive until it is closed; without
            # this, memory grows with each record
            plt.close(fig)

    print("completed plotting timestamped data")
def find_paired_end():
    """
    Store the end time of every record in the metadata column 'recordEnd'.

    The end of a record is the largest 'timeUnix' value of its 'truncate' /
    'All' TEMP data. When the metadata column 'recordCoregistered' names
    more than one record (space separated), the wearables were not switched
    off together, so the earliest of their end times is stored instead.
    The updated metadata is passed to save_meta for each study.
    """
    print("begin find_paired_end")
    study_list = retrieve_ref('study_list')
    format_type = 'truncate'
    sensor = 'TEMP'
    segment = 'All'

    for study in study_list:
        df_meta = retrieve_meta(study)
        source_path = list(df_meta['source_path'])
        df_meta['recordEnd'] = [None] * len(source_path)

        for record in source_path:
            # the end of this record on its own
            df = retrieve_analyzed(study, format_type, record, segment, sensor)
            timeEndRecord = max(list(df['timeUnix']))

            i = df_meta[df_meta['source_path'] == record].index.values[0]
            df_meta.loc[i, 'recordEnd'] = int(timeEndRecord)

            recordCoregistered = df_meta.loc[i, 'recordCoregistered']
            if pd.isnull(recordCoregistered):
                print('no pair found')
                continue

            recordCoregisteredStr = str(recordCoregistered)
            # an entry clearly longer than the record's own name is taken to
            # list more than one record
            if len(recordCoregisteredStr) > 3 + len(record):
                paired_ends = []
                # split() without argument ignores repeated blanks, so no
                # empty record name is looked up
                for paired_record in recordCoregisteredStr.split():
                    # Same lookup as for the record itself. The previous call
                    # used the undefined name `analysis_type` and raised a
                    # NameError as soon as a pair was found.
                    df = retrieve_analyzed(study, format_type, paired_record,
                                           segment, sensor)
                    paired_ends.append(max(list(df['timeUnix'])))
                # the pair ends when the first wearable stops
                df_meta.loc[i, 'recordEnd'] = int(min(paired_ends))

        save_meta(study, df_meta)
        print('df_meta = ')
        print(df_meta)
def plot_segment():
    """
    Plot the segmented data of every record, one panel per sensor.

    For each record a figure is created in which every sensor of
    'sensor_list' gets its own row. Inside a row, every segment of
    'segment_list' except the last one is loaded (the segment name is passed
    to retrieve_analyzed as the analysis type) and drawn against
    'timeMinutes' in the colour stored under 'color_<segment>'.

    The figure is saved once per record to
    <study>/plot/segment/<record>/<sensor>.png, where <sensor> is the last
    entry of sensor_list.
    """
    print("plot segment data")
    study_list = retrieve_ref('study_list')
    sensor_list = retrieve_ref('sensor_list')
    segment_list = retrieve_ref('segment_list')

    # at least six rows as before; more when there are more sensors, because
    # plt.subplot rejects a panel number beyond the grid
    row_num, col_num = max(6, len(sensor_list)), 1
    row_width_mulp, col_width_mulp = 14, 5
    plot_width, plot_height = col_num * row_width_mulp, row_num * col_width_mulp

    for study in study_list:
        df_meta = retrieve_meta(study)
        for record in list(df_meta['source_path']):
            fig = plt.figure(figsize=(plot_width, plot_height))

            sensor = None
            for plot_num, sensor in enumerate(sensor_list, start=1):
                plt.subplot(row_num, col_num, plot_num)
                sensor_unit = retrieve_sensor_unit(sensor)

                for segment in segment_list[0:-1]:
                    analysis_type = segment
                    df = retrieve_analyzed(study, analysis_type, record, sensor)
                    valueColor = retrieve_ref_color(
                        str('color_' + str(segment)))
                    plt.scatter(df['timeMinutes'], df['measurement'],
                                color=valueColor, label=str(segment))
                    # the last plotted segment names the panel
                    plt.title(analysis_type + ' ' + record + ' ' + sensor)

                # the x values are minutes, so the axis is labelled accordingly
                plt.xlabel('Time (minutes)')
                plt.ylabel(str(sensor) + ' ( ' + str(sensor_unit) + ' )')
                plt.legend(bbox_to_anchor=(1, 0.5, 0.3, 0.2), loc='upper left')

            # save the plot (nothing to save without any sensor)
            if sensor is not None:
                plot_path = os.path.join(study, 'plot', 'segment', record)
                os.makedirs(plot_path, exist_ok=True)
                plot_file = os.path.join(plot_path, sensor + '.png')
                plt.savefig(plot_file, bbox_inches='tight')

            # pyplot keeps every figure alive until it is closed; without
            # this, memory grows with each record
            plt.close(fig)
def plot_acc():
    """
    Plot the truncated ACC data of every record.

    For each record a figure is created with one panel per column of the
    'truncate' ACC data whose name does not contain 'time', followed by one
    panel in which all of these columns are overlaid. Every column is drawn
    against 'timeMinutes' in the colour stored under 'color_ACC_<column>'.
    The figure is saved to <study>/plot/timestamp/<record>/ACC.png.
    """
    print("begin plotting acc data")
    study_list = retrieve_ref('study_list')

    sensor = 'ACC'
    analysis_type = 'truncate'
    sensor_unit = retrieve_sensor_unit(sensor)
    row_width_mulp, col_width_mulp = 20, 5

    for study in study_list:
        df_meta = retrieve_meta(study)
        for record in list(df_meta['source_path']):
            df = retrieve_analyzed(study, analysis_type, record, sensor)
            names = [name for name in list(df.columns) if 'time' not in name]

            # at least five rows as before; more when the data has more
            # columns, because plt.subplot rejects a panel beyond the grid
            row_num, col_num = max(5, len(names) + 1), 1
            plot_width = col_num * row_width_mulp
            plot_height = row_num * col_width_mulp
            fig = plt.figure(figsize=(plot_width, plot_height))

            # shared x range with a little room on the right
            x_limits = [0, 1.02 * max(list(df['timeMinutes']))]

            # one panel per column
            for plot_num, name in enumerate(names, start=1):
                plt.subplot(row_num, col_num, plot_num)
                colorScatter = retrieve_ref_color(
                    str('color_' + str(sensor) + '_' + str(name)))
                plt.scatter(df['timeMinutes'], df[name],
                            color=colorScatter, label=str(name))
                plt.title(analysis_type + ' ' + record + ' ' + sensor)
                plt.xlabel('Time (Minutes)')
                plt.ylabel(str(sensor + ' ' + name))
                plt.xlim(x_limits)
                plt.legend(bbox_to_anchor=(1, 0.5, 0.3, 0.2), loc='upper left')

            # all columns overlaid in one extra panel
            plt.subplot(row_num, col_num, len(names) + 1)
            for name in names:
                colorScatter = retrieve_ref_color(
                    str('color_' + str(sensor) + '_' + str(name)))
                plt.scatter(df['timeMinutes'], df[name],
                            color=colorScatter, label=str(name))
            # title and y label carry the name of the last plotted column
            last_name = names[-1] if names else ''
            plt.title(analysis_type + ' ' + record + ' ' + sensor + ' '
                      + last_name)
            plt.xlabel('Time (Minutes)')
            plt.ylabel(
                str(sensor + ' ' + last_name + ' ( ' + str(sensor_unit) + ' )'))
            plt.xlim(x_limits)
            plt.legend(bbox_to_anchor=(1, 0.5, 0.3, 0.2), loc='upper left')

            # save the plot
            plot_path = os.path.join(study, 'plot', 'timestamp', record)
            os.makedirs(plot_path, exist_ok=True)
            plot_file = os.path.join(plot_path, sensor + '.png')
            plt.savefig(plot_file, bbox_inches='tight')
            # pyplot keeps every figure alive until it is closed; without
            # this, memory grows with each record
            plt.close(fig)
            print('saved plotted acc figure - ' + str(plot_file))

    print("completed plotting acc data")