def get_status1_vs_status2_statistic(dataset_name, args):
    """
    Get status2 statistic grouped by the status1 group.

    @param dataset_name: the list of dataset name for statistic
    @param args: contain the statistic field status1 and status2 and the group information
    @return: the json result of the query
    """
    status1, status2, group = args
    # [sum, count] accumulator per status1 group; defaultdict keeps this
    # consistent with the accumulator style used in get_gl_plot
    statistic_result = defaultdict(lambda: [0, 0])
    for name in dataset_name:
        dataset = db[name]
        for query_result in query_field(dataset, field={"_id": 0, status1: 1, status2: 1}):
            # make sure the query result has field status1 and status2
            if status1 not in query_result or status2 not in query_result:
                continue
            # get the query value's group result
            key = get_group(group, query_result[status1])
            # append the status2 query result into the status1 group result
            for status in query_result[status2]:
                statistic_result[key][0] += float(status)
                statistic_result[key][1] += 1
    # return the histogram of the mean status2 value per status1 group
    return get_histogram_json({key: total / count for key, (total, count) in statistic_result.items()})
def get_gl_plot(dataset_name, args):
    """
    Get mean glucose value.

    The glucose value is accumulated per time interval and wraps around every
    time cycle. This function can also serve other numeric list-style fields
    with timestamps after a simple modification.

    @param dataset_name: the list of dataset name for statistic
    @param args: contain the time interval and time cycle
    @return: the json result of the query
    """
    interval, cycle = args
    # resolve the interval / cycle strings into numeric time values
    time_interval = get_time_from_str(interval)
    time_cycle = get_time_from_str(cycle)
    # number of hour digits needed so every label has a consistent width
    hour_digit = int(math.log(time_cycle / time_hour, 10)) + 1
    # precompute the label of every instant inside one cycle, bucketed by interval
    bucket_labels = [
        int_to_timestamp(t // time_interval * time_interval, hour_digit=hour_digit)
        for t in range(time_cycle)
    ]
    # [sum, count] accumulator per bucket label
    totals = defaultdict(lambda: [0, 0])
    for name in dataset_name:
        for record in query_field(db[name], field={"_id": 0, "gl": 1}):
            if "gl" not in record:
                continue
            for gl, datetime in record["gl"]:
                label = bucket_labels[datetime % time_cycle]
                totals[label][0] += float(gl)
                totals[label][1] += 1
    # average every bucket and hand the curve to the plot helper
    return get_plot_json({label: total / count for label, (total, count) in totals.items()})
def get_list_status_statistic(dataset_name, args):
    """
    Statistic the proportion of different status. The status is stored in list style.

    @param dataset_name: the list of dataset name for statistic
    @param args: contain the statistic field status and the group information
    @return: the json result of the query
    """
    status, group = args
    counts = defaultdict(int)
    for name in dataset_name:
        for record in query_field(db[name], field={"_id": 0, status: 1}):
            # skip documents that do not carry the requested field
            if status not in record:
                continue
            for entry in record[status]:
                # entries stored as [content, timestamp]: the timestamp is not
                # needed here, keep only the content
                value = entry[0] if isinstance(entry, list) else entry
                counts[get_group(group, value)] += 1
    # return the pie plot result
    return get_pie_plot_json(dict(counts))
def get_mult_field_status_statistic(dataset_name, args):
    """
    Statistic the proportion of different status.

    This function will get the sum of different fields in a period of time and
    group the sum. Adhoc for get_daily_insulin_time_vs_gl.

    @param dataset_name: the list of dataset name for statistic
    @param args: contain the statistic fields, the group information and the time period
    @return: the json result of the query
    """
    fields, group, period = args
    # get the time period and build the query projection
    time_period = get_time_from_str(period)
    statistic_result = defaultdict(lambda: 0)
    field = {f: 1 for f in fields}
    # exclude "_id" from the projection, consistent with every other query in this module
    field["_id"] = 0
    field["patient_id"] = 1
    for name in dataset_name:
        # statistic based on the patient id, to get the patient mean status
        result = defaultdict(lambda: defaultdict(lambda: 0))
        for query_result in query_field(db[name], field=field):
            if "patient_id" not in query_result:
                continue
            patient_id = query_result["patient_id"]
            for f in fields:
                if f not in query_result:
                    continue
                # sum every record of every field that falls into the same time period
                for r, datetime in query_result[f]:
                    result[patient_id][datetime // time_period] += float(r)
        # get the statistic result based on the patient mean status
        for patient_id in result:
            mean_status = sum(result[patient_id][t] for t in result[patient_id]) / len(result[patient_id])
            key = get_group(group, mean_status)
            statistic_result[key] += 1
    # return the pie plot result
    return get_pie_plot_json({key: statistic_result[key] for key in statistic_result})
def get_status_statistic(dataset_name: list, args: tuple):
    """
    Statistic the proportion of different status.

    @param dataset_name: the list of dataset name for statistic
    @param args: contain the statistic field status and the group information
    @return: the json result of the query
    """
    status, group = args
    group_counts = defaultdict(int)
    for name in dataset_name:
        for record in query_field(db[name], field={"_id": 0, status: 1}):
            # skip documents without the requested field
            if status not in record:
                continue
            # count the record under its group key
            group_counts[get_group(group, record[status])] += 1
    # return the pie plot result
    return get_pie_plot_json(dict(group_counts))
def get_status_vs_gl(dataset_name, args):
    """
    Get mean glucose value based on the status group result.

    The glucose value is accumulated per time interval and wraps around every
    time cycle, independently for each status group.

    @param dataset_name: the list of dataset name for statistic
    @param args: contain the statistic field status, time interval, time cycle
                 and the group information
    @return: the json result of the query
    """
    status, group, interval, cycle = args
    # resolve the interval / cycle strings into numeric time values
    time_interval = get_time_from_str(interval)
    time_cycle = get_time_from_str(cycle)
    # number of hour digits needed so every label has a consistent width
    hour_digit = int(math.log(time_cycle / time_hour, 10)) + 1
    # precompute the label of every instant inside one cycle, bucketed by interval
    bucket_labels = [
        int_to_timestamp(t // time_interval * time_interval, hour_digit=hour_digit)
        for t in range(time_cycle)
    ]
    # per status-group dict of per-bucket [sum, count] accumulators
    accumulator = dict()
    for name in dataset_name:
        # for every dataset
        for record in query_field(db[name], field={"_id": 0, status: 1, "gl": 1}):
            if status not in record or "gl" not in record:
                continue
            # get the query value's group result
            key = get_group(group, record[status])
            if key not in accumulator:
                accumulator[key] = defaultdict(lambda: [0, 0])
            # append the glucose values into the status group's buckets
            for gl, datetime in record["gl"]:
                label = bucket_labels[datetime % time_cycle]
                accumulator[key][label][0] += float(gl)
                accumulator[key][label][1] += 1
    # average every bucket inside every group
    result = {
        key: {label: total / count for label, (total, count) in buckets.items()}
        for key, buckets in accumulator.items()
    }
    # return the mult plot result
    return get_mult_plot_json(result)
def get_exercise_gl_plot(dataset_name, args):
    """
    Get mean exercise_gl value.

    The exercise_gl value is accumulated per time interval and wraps around
    every time cycle, restricted to records matching the given exercise status.

    @param dataset_name: the list of dataset name for statistic
    @param args: contain the certain exercise status, time interval and time cycle
    @return: the json result of the query
    """
    exercise_status, interval, cycle = args
    # resolve the interval / cycle strings into numeric time values
    time_interval = get_time_from_str(interval)
    time_cycle = get_time_from_str(cycle)
    # number of hour digits needed so every label has a consistent width
    hour_digit = int(math.log(time_cycle / time_hour, 10)) + 1
    # precompute the label of every instant inside one cycle, bucketed by interval
    bucket_labels = [
        int_to_timestamp(t // time_interval * time_interval, hour_digit=hour_digit)
        for t in range(time_cycle)
    ]
    # [sum, count] accumulator per bucket label
    totals = defaultdict(lambda: [0, 0])
    for name in dataset_name:
        for record in query_field(db[name], field={"_id": 0, "exercise_gl": 1, "exercise_status": 1}):
            if "exercise_gl" not in record or "exercise_status" not in record:
                continue
            statuses = record["exercise_status"]
            # ASSUMPTION: the exercise glucose and exercise status lists are one-one related.
            for idx, (gl, datetime) in enumerate(record["exercise_gl"]):
                if statuses[idx] != exercise_status:
                    continue
                label = bucket_labels[datetime % time_cycle]
                totals[label][0] += float(gl)
                totals[label][1] += 1
    # average every bucket and return the plot result
    return get_plot_json({label: total / count for label, (total, count) in totals.items()})
def preprocess_dataset(dataset_info):
    """
    Preprocess the file information and insert it into the database.

    The file type could be csv, txt and xls. The file information should be
    hardcoded in the config file. The function allows an incremental way of
    adding information to the database.

    @param dataset_info: The preprocess dataset information list. Each item in
           the list is a dictionary which contains the dataset name and all the
           insert task names. The insert task names should be defined in the config.
    @return: None
    """
    for info in dataset_info:
        dataset_name, insert_tasks = info["dataset"], info["insert_tasks"]
        # get dataset preprocess config and basic information
        config = get_preprocess_config(dataset_name, insert_tasks)
        print("dataset: ", dataset_name)
        dataset = db[dataset_name]
        # delete all the data in the current dataset, may uncomment when developing
        # delete_all_date(dataset)
        # get all the patient id in the current dataset
        all_patient_id = {
            patient_id["patient_id"]
            for patient_id in query_field(dataset, field={
                "_id": 0,
                "patient_id": 1
            })
        }
        # get the raw data for incremental adding
        raw_data = {
            result["patient_id"]: {
                field: result[field]
                for field in result if field != "patient_id"
            }
            for result in query_field(dataset)
        }
        data = defaultdict(lambda: dict())
        # for each sub dataset task
        for insert_task in insert_tasks:
            # get sub dataset basic information
            filenames = config[insert_task]["filename"]
            fields = config[insert_task]["select_column"]
            # ASSUMPTION: all the insert tasks have field patient_id and the meaning is the same.
            # D1NAMO breaks the assumption and will adhoc get the patient id from the file name.
            # NOTE(review): sum() of the matching indices only yields the column index when
            # "patient_id" appears exactly once (and 0 when it is absent) — confirm the configs.
            patient_idx = sum(
                [i for i in range(len(fields)) if fields[i] == "patient_id"])
            for filename in filenames:
                # get the file real path
                file = os.path.join(config["root_dir"], config["dataset"],
                                    filename)
                print("processing file", file)
                # ASSUMPTION: all the file types in the insert task are the same.
                # get the file reader and line count; keep the raw handle (if any)
                # so it can be closed once the file has been consumed
                file_handle = None
                if config[insert_task]["file_type"] == "xls":
                    cnt = line_count_xls(file)
                    readable = Reader(
                        xlrd.open_workbook(file).sheets()[0],
                        config[insert_task]["file_type"])
                # file type is txt or csv
                else:
                    file_handle = open(file)
                    cnt, readable = line_count(file), Reader(
                        file_handle, config[insert_task]["file_type"])
                try:
                    # use tqdm to show the process progress
                    with tqdm(total=cnt) as bar:
                        for line_cnt in range(cnt):
                            # get file content
                            line = readable.readline()
                            # if the line is not the header
                            if line_cnt != 0:
                                # get patient_id
                                if dataset_name == "D1NAMO":
                                    # the patient id is the parent directory name
                                    # (portable replacement for file.split("/")[-2])
                                    patient_id = int(
                                        os.path.basename(os.path.dirname(file)))
                                else:
                                    patient_id = str(int(float(line[patient_idx])))
                                # if the patient id is not in the dataset, add this patient to the database.
                                if patient_id not in all_patient_id:
                                    insert_one_data(dataset,
                                                    {"patient_id": patient_id})
                                    all_patient_id.add(patient_id)
                                # get line timestamp. if there is no timestamp, it will be 0
                                timestamp = 0
                                if "datetime" in fields:
                                    timestamp += sum(
                                        datetime_to_int(
                                            line[i],
                                            config[insert_task]["basedate"],
                                            config[insert_task]["pattern"])
                                        for i in range(len(fields))
                                        if fields[i] == "datetime")
                                else:
                                    if "date" in fields:
                                        timestamp += sum(
                                            date_to_int(
                                                line[i],
                                                config[insert_task]["basedate"],
                                                config[insert_task]["pattern"])
                                            for i in range(len(fields))
                                            if fields[i] == "date")
                                    if "timestamp" in fields:
                                        timestamp += sum(
                                            timestamp_to_int(
                                                line[i],
                                                config[insert_task]["pattern"])
                                            for i in range(len(fields))
                                            if fields[i] == "timestamp")
                                row_combine_field = dict()
                                for idx, content in enumerate(line):
                                    field = config[insert_task]["select_column"][idx]
                                    # if the field should not append or there is no content in the line, continue
                                    if field == '' or len(content) == 0:
                                        continue
                                    # if the field is patient_id or timestamp related, continue
                                    if field in {
                                            "patient_id", "datetime", "date",
                                            "timestamp"
                                    }:
                                        continue
                                    # if the field is a status, the field content will not be stored in list style.
                                    if field in status_field_set:
                                        # adhoc for field trouble_sleep_inverse
                                        if field == "trouble_sleep_inverse":
                                            data[patient_id][
                                                "trouble_sleep"] = str(
                                                    5 - int(content))
                                        # adhoc for field low_gl
                                        elif field == "low_gl":
                                            data[patient_id][
                                                "low_gl"] = content.split(" ")[0]
                                        else:
                                            data[patient_id][field] = content
                                    # adhoc for field weight_units (weight should be in data before)
                                    elif field == "weight_units":
                                        if content == "lbs":
                                            data[patient_id]["weight"] = str(
                                                LBS_TO_KG *
                                                float(data[patient_id]["weight"]))
                                    # if the field needs storing with timestamp
                                    elif field in timestamp_field_set:
                                        # adhoc for field raw_gl
                                        if field == "raw_gl":
                                            content = str(float(content) * 18)
                                            field = "gl"
                                        # if field not in patient's data, initialize from raw data in database
                                        if field not in data[patient_id]:
                                            data[patient_id][field] = \
                                                list() if patient_id not in raw_data or field not in raw_data[patient_id] \
                                                else raw_data[patient_id][field]
                                        # append the content with timestamp
                                        data[patient_id][field].append(
                                            [content, timestamp])
                                    # if the field needs to combine to another field
                                    elif field in combine_field_set:
                                        combine_field = combine_field_set[field]
                                        if combine_field not in row_combine_field:
                                            row_combine_field[combine_field] = 0
                                        row_combine_field[combine_field] += float(
                                            content)
                                    # for the common field, store in list style
                                    else:
                                        # if field not in patient's data, initialize from raw data in database
                                        if field not in data[patient_id]:
                                            data[patient_id][field] = \
                                                list() if patient_id not in raw_data or field not in raw_data[patient_id] \
                                                else raw_data[patient_id][field]
                                        data[patient_id][field].append(content)
                                # ASSUMPTION: the combine field is the common field (not status or stored with timestamp)
                                for field in row_combine_field:
                                    if field not in data[patient_id]:
                                        data[patient_id][field] = list()
                                    data[patient_id][field].append(
                                        str(row_combine_field[field]))
                            # update the progress bar
                            bar.update()
                finally:
                    # close the csv/txt handle opened above (the xls reader owns its own resources)
                    if file_handle is not None:
                        file_handle.close()
        # insert the preprocessed data to the database
        print("start to insert data to:", dataset_name)
        # time.clock() was removed in Python 3.8; perf_counter is the replacement
        start = time.perf_counter()
        for patient_id in data:
            for field in data[patient_id]:
                # update the field in the database
                update_data(dataset, {"patient_id": patient_id},
                            {'$set': {
                                field: data[patient_id][field]
                            }})
        print("use time to insert:", time.perf_counter() - start)
def get_time_diff_list_status_vs_gl(dataset_name, args):
    """
    Get mean glucose value based on the status result. The status should also store with timestamp.
    The function will statistic the glucose value when a status comes and before the next status comes.
    The glucose value will statistic every time interval and statistic at most time max duration
    @param dataset_name: the list of dataset name for statistic
    @param args: contain the statistic field status, time interval and time max duration.
        Currently does not has group information.
    @return: the json result of the query
    """
    status, interval, max_duration = args
    # get time interval and time cycle value
    time_interval, time_max_duration = get_time_from_str(interval), get_time_from_str(max_duration)
    # get hour digit for consistent format
    hour_digit = int(math.log(time_max_duration / time_hour, 10)) + 1
    # prepare the processed timestamp based on the time interval and time cycle
    timestamp_trans_list = [int_to_timestamp(i // time_interval * time_interval, hour_digit = hour_digit) for i in range(time_max_duration)]
    # statistic the query result
    # status_dict: patient_id -> list of status timestamps
    # gl_dict:     patient_id -> list of (timestamp, glucose value) pairs
    gl_dict, status_dict = dict(), dict()
    for name in dataset_name:
        for query_result in query_field(db[name], field = {"_id": 0, "patient_id": 1, status: 1, "gl": 1}):
            if "patient_id" in query_result and "gl" in query_result and status in query_result:
                patient_id = query_result["patient_id"]
                # add patient id to status_dict and gl_dict
                if patient_id not in status_dict:
                    status_dict[patient_id] = list()
                if patient_id not in gl_dict:
                    gl_dict[patient_id] = list()
                # append query glucose value and datetime into status_dict and gl_dict
                # (only the status timestamp matters here; the status content itself is unused)
                for status_result, datetime in query_result[status]:
                    status_dict[patient_id].append(int(datetime))
                for gl_result, datetime in query_result["gl"]:
                    gl_dict[patient_id].append((int(datetime), float(gl_result)))
    # for each patient sort the result for faster time matching
    for patient_id in status_dict:
        status_dict[patient_id].sort()
    for patient_id in gl_dict:
        gl_dict[patient_id].sort(key = lambda x: x[0])
    # matching result: bucket label -> [sum of glucose, sample count]
    statistic_result = dict()
    for patient_id in status_dict:
        # patient should also has glucose record
        if patient_id not in gl_dict:
            continue
        # get the patient's status and glucose data
        status_list, gl_list = status_dict[patient_id], gl_dict[patient_id]
        # gl_idx is shared across all windows of this patient and never rewinds,
        # so each glucose sample is counted at most once
        status_idx, gl_idx = 0, 0
        while status_idx < len(status_list):
            # set a sliding window to find match datetime, the valid glucose datetime should in the sliding window.
            # the window is clamped by the next status so windows never overlap
            left = status_list[status_idx]
            right = min(status_list[status_idx] + time_max_duration, float("inf") if status_idx >= len(status_list) - 1 else status_list[status_idx + 1])
            # ensure the glucose datetime bigger than the window's left side
            while gl_idx < len(gl_list) and left > gl_list[gl_idx][0]:
                gl_idx += 1
            while gl_idx < len(gl_list):
                # if the glucose datetime smaller than the windows's right side
                if right > gl_list[gl_idx][0]:
                    # get the datetime diff (offset since the status event, bucketed by time_interval)
                    diff = timestamp_trans_list[gl_list[gl_idx][0] - left]
                    if diff not in statistic_result:
                        statistic_result[diff] = [0, 0]
                    # append the glucose result into the datetime diff
                    statistic_result[diff][0] += float(gl_list[gl_idx][1])
                    statistic_result[diff][1] += 1
                    # check for next glucose result
                    gl_idx += 1
                # if the glucose datetime bigger than the windows's right side, we need to update the window
                else:
                    break
            # get next status
            status_idx += 1
    # get the query result from statistic result (mean glucose per offset bucket)
    result = {timestamp: statistic_result[timestamp][0] / statistic_result[timestamp][1] for timestamp in statistic_result}
    # return the plot result
    return get_plot_json(result)