def get_status1_vs_status2_statistic(dataset_name, args):
	"""
	Get status2 statistic group by the status1 group.
	@param dataset_name: the list of dataset name for statistic
	@param args: contain the statistic field status1 and status2 and the group information
	@return: the json result of the query
	"""
	status1, status2, group = args

	# accumulate [sum, count] of status2 values per status1 group
	statistic_result = defaultdict(lambda: [0, 0])
	for name in dataset_name:
		dataset = db[name]
		for query_result in query_field(dataset, field = {"_id": 0, status1: 1, status2: 1}):
			# make sure the query result has field status1 and status2
			if status1 not in query_result or status2 not in query_result: continue

			# get the query value's group result
			key = get_group(group, query_result[status1])

			# append the status2 query result into status1 group result.
			# the group entry is only created when there is at least one status2 value,
			# so an empty status2 list cannot produce a zero-count group
			for status in query_result[status2]:
				statistic_result[key][0] += float(status)
				statistic_result[key][1] += 1

	# return the histogram result of the mean status2 value per group.
	# the `if count` guard protects against division by zero (the previous version crashed
	# with ZeroDivisionError when a record carried an empty status2 list)
	return get_histogram_json({key: total / count for key, (total, count) in statistic_result.items() if count})
def get_gl_plot(dataset_name, args):
	"""
	Get mean glucose value. The glucose value will statistic every time interval and cycle every time cycle.
	This function also can be used for numeric list style field with timestamp with simple modify.
	@param dataset_name: the list of dataset name for statistic
	@param args: contain the time interval and time cycle
	@return: the json result of the query
	"""
	interval, cycle = args

	# convert the textual interval and cycle into numeric time values
	time_interval = get_time_from_str(interval)
	time_cycle = get_time_from_str(cycle)

	# number of hour digits needed so every formatted timestamp has the same width
	hour_digit = int(math.log(time_cycle / time_hour, 10)) + 1

	# precompute the formatted timestamp for every offset in the cycle, rounded down to its interval
	timestamp_trans_list = [int_to_timestamp(second // time_interval * time_interval, hour_digit = hour_digit)
	                        for second in range(time_cycle)]

	# accumulate [sum, count] per formatted timestamp over all requested datasets
	accumulator = defaultdict(lambda: [0, 0])
	for name in dataset_name:
		for record in query_field(db[name], field = {"_id": 0, "gl": 1}):
			if "gl" not in record: continue
			for gl, datetime in record["gl"]:
				bucket = accumulator[timestamp_trans_list[datetime % time_cycle]]
				bucket[0] += float(gl)
				bucket[1] += 1

	# turn every [sum, count] pair into a mean value and return the plot result
	return get_plot_json({stamp: total / count for stamp, (total, count) in accumulator.items()})
def get_list_status_statistic(dataset_name, args):
	"""
	Statistic the proportion of different status.
	The status is store in list style.
	@param dataset_name: the list of dataset name for statistic
	@param args: contain the statistic field status and the group information
	@return: the json result of the query
	"""
	status, group = args

	# count the occurrences of every group over all requested datasets
	counter = defaultdict(lambda: 0)
	for name in dataset_name:
		for record in query_field(db[name], field = {"_id": 0, status: 1}):
			# skip records that do not carry the status field at all
			if status not in record: continue
			for item in record[status]:
				# a list item means the content is stored together with a timestamp;
				# only the content is needed here
				content = item[0] if isinstance(item, list) else item

				# count the occurrence inside the value's group
				counter[get_group(group, content)] += 1

	# return the pie plot result
	return get_pie_plot_json(dict(counter))
def get_mult_field_status_statistic(dataset_name, args):
	"""
	Statistic the proportion of different status.
	This function will get the sum of different field in a period of time, group the sum.
	Adhoc for get_daily_insulin_time_vs_gl.
	@param dataset_name: the list of dataset name for statistic
	@param args: contain the statistic fields status, the group information and the time period
	@return: the json result of the query
	"""
	fields, group, period = args

	# get the time period and build the query projection.
	# "_id": 0 keeps the projection consistent with every other query in this module
	time_period = get_time_from_str(period)
	statistic_result, field = defaultdict(lambda: 0), {f: 1 for f in fields}
	field["patient_id"] = 1
	field["_id"] = 0

	for name in dataset_name:
		# statistic based on the patient id. For get the patient mean status
		result = defaultdict(lambda: defaultdict(lambda: 0))
		for query_result in query_field(db[name], field = field):
			if "patient_id" not in query_result: continue
			patient_id = query_result["patient_id"]
			for f in fields:
				if f not in query_result: continue
				# bucket every record by its time period and sum the field values inside the bucket
				for r, datetime in query_result[f]: result[patient_id][datetime // time_period] += float(r)

		# get the statistic result based on the patient's mean per-period sum
		for patient_id in result:
			mean_status = sum(result[patient_id][t] for t in result[patient_id]) / len(result[patient_id])
			key = get_group(group, mean_status)
			statistic_result[key] += 1

	# return the pie plot result
	return get_pie_plot_json({key: statistic_result[key] for key in statistic_result})
def get_status_statistic(dataset_name: list, args: tuple):
	"""
	Statistic the proportion of different status
	@param dataset_name: the list of dataset name for statistic
	@param args: contain the statistic field status and the group information
	@return: the json result of the query
	"""
	status, group = args

	# count the records that fall into every group over all requested datasets
	counter = defaultdict(lambda: 0)
	for name in dataset_name:
		for record in query_field(db[name], field = {"_id": 0, status: 1}):
			# only records carrying the status field participate in the statistic
			if status not in record: continue

			# count the record inside its value's group
			counter[get_group(group, record[status])] += 1

	# return the pie plot result
	return get_pie_plot_json(dict(counter))
def get_status_vs_gl(dataset_name, args):
	"""
	Get mean glucose value based on the status group result.
	The glucose value will statistic every time interval and cycle every time cycle.
	@param dataset_name: the list of dataset name for statistic
	@param args: contain the statistic field status, time interval, time cycle and the group information
	@return: the json result of the query
	"""
	status, group, interval, cycle = args

	# convert the textual interval and cycle into numeric time values
	time_interval = get_time_from_str(interval)
	time_cycle = get_time_from_str(cycle)

	# number of hour digits needed so every formatted timestamp has the same width
	hour_digit = int(math.log(time_cycle / time_hour, 10)) + 1

	# precompute the formatted timestamp for every offset in the cycle, rounded down to its interval
	timestamp_trans_list = [int_to_timestamp(second // time_interval * time_interval, hour_digit = hour_digit)
	                        for second in range(time_cycle)]

	# per status group, accumulate [sum, count] of glucose values per formatted timestamp
	statistic_result = dict()
	for name in dataset_name:
		for record in query_field(db[name], field = {"_id": 0, status: 1, "gl": 1}):
			# both the status field and the glucose list must be present
			if status not in record or "gl" not in record: continue

			# get the query value's group result
			key = get_group(group, record[status])
			if key not in statistic_result: statistic_result[key] = defaultdict(lambda: [0, 0])

			# append query glucose value into status group result
			for gl, datetime in record["gl"]:
				bucket = statistic_result[key][timestamp_trans_list[datetime % time_cycle]]
				bucket[0] += float(gl)
				bucket[1] += 1

	# turn every [sum, count] pair into a mean value, keeping the per-group structure
	result = {key: {stamp: total / count for stamp, (total, count) in group_result.items()}
	          for key, group_result in statistic_result.items()}

	# return the mult plot result
	return get_mult_plot_json(result)
def get_exercise_gl_plot(dataset_name, args):
	"""
	Get mean exercise_gl value. The exercise_gl value will statistic every time interval and cycle every time cycle.
	@param dataset_name: the list of dataset name for statistic
	@param args: contain the certain exercise status, time interval and time cycle
	@return: the json result of the query
	"""
	exercise_status, interval, cycle = args

	# get time interval and time cycle value
	time_interval, time_cycle = get_time_from_str(interval), get_time_from_str(cycle)

	# get hour digit for consistent format
	hour_digit = int(math.log(time_cycle / time_hour, 10)) + 1

	# prepare the processed timestamp based on the time interval and time cycle
	timestamp_trans_list = [int_to_timestamp(i // time_interval * time_interval, hour_digit = hour_digit)
	                        for i in range(time_cycle)]

	statistic_result = defaultdict(lambda: [0, 0])
	for name in dataset_name:
		for query_result in query_field(db[name], field = {"_id": 0, "exercise_gl": 1, "exercise_status": 1}):
			if "exercise_gl" not in query_result or "exercise_status" not in query_result: continue

			# ASSUMPTION: the exercise glucose and exercise status is one-one related.
			# zip pairs the two lists safely: when the lists unexpectedly differ in length,
			# the extra unpaired entries are ignored instead of raising IndexError as before
			for (exercise_gl, datetime), status in zip(query_result["exercise_gl"], query_result["exercise_status"]):
				if status != exercise_status: continue
				datetime_trans = timestamp_trans_list[datetime % time_cycle]
				statistic_result[datetime_trans][0] += float(exercise_gl)
				statistic_result[datetime_trans][1] += 1

	# get the query result from statistic result
	result = {timestamp: statistic_result[timestamp][0] / statistic_result[timestamp][1]
	          for timestamp in statistic_result}

	# return the plot result
	return get_plot_json(result)
def preprocess_dataset(dataset_info):
    """
    Preprocess the file information and insert into the database.
    The file type could be csv, txt and xls.
    The file information should be hardcoded in the config file.
    The function allows an incremental way of adding information to the database.
    @param dataset_info: The preprocess dataset information list. Each item in the list is a dictionary which
                         contains the dataset name and all the insert task names. The insert task name should
                         be defined in the config.
    @return: None
    """
    for info in dataset_info:
        dataset_name, insert_tasks = info["dataset"], info["insert_tasks"]

        # get dataset preprocess config and basic information
        config = get_preprocess_config(dataset_name, insert_tasks)
        print("dataset: ", dataset_name)
        dataset = db[dataset_name]

        # delete all the data in the current dataset, may uncomment when developing
        # delete_all_date(dataset)

        # get all the patient id in the current dataset
        all_patient_id = {
            patient_id["patient_id"]
            for patient_id in query_field(dataset,
                                          field={
                                              "_id": 0,
                                              "patient_id": 1
                                          })
        }

        # get the raw data for incremental adding
        raw_data = {
            result["patient_id"]: {
                field: result[field]
                for field in result if field != "patient_id"
            }
            for result in query_field(dataset)
        }
        data = defaultdict(lambda: dict())

        # for each sub dataset task
        for insert_task in insert_tasks:
            # get sub dataset basic information
            filenames = config[insert_task]["filename"]
            fields = config[insert_task]["select_column"]

            # ASSUMPTION: all the insert task has field patient_id and the meaning is the same.
            #             D1NAMO break the assumption and will adhoc get the patient id from file name.
            # NOTE: sum() yields the column index when patient_id occurs exactly once, and 0 when absent
            patient_idx = sum(i for i in range(len(fields))
                              if fields[i] == "patient_id")

            for filename in filenames:
                # get the file real path
                file = os.path.join(config["root_dir"], config["dataset"],
                                    filename)
                print("processing file", file)

                # ASSUMPTION: all the file type in the insert task is the same.
                # get the file reader and line count
                if config[insert_task]["file_type"] == "xls":
                    cnt = line_count_xls(file)
                    readable = Reader(
                        xlrd.open_workbook(file).sheets()[0],
                        config[insert_task]["file_type"])
                # file type is txt or csv
                else:
                    # NOTE(review): the opened file handle is never closed explicitly;
                    # presumably Reader owns it — confirm and close it there if not
                    cnt, readable = line_count(file), Reader(
                        open(file), config[insert_task]["file_type"])

                # use tqdm to show the process progress
                with tqdm(total=cnt) as bar:
                    for line_cnt in range(cnt):
                        # get file content
                        line = readable.readline()

                        # if the line is not the header
                        if line_cnt != 0:
                            # get patient_id (D1NAMO stores it in the directory name instead of a column)
                            if dataset_name == "D1NAMO":
                                patient_id = int(file.split("/")[-2])
                            else:
                                patient_id = str(int(float(line[patient_idx])))

                            # if the patient id is not in the dataset, add this patient to the database.
                            if patient_id not in all_patient_id:
                                insert_one_data(dataset,
                                                {"patient_id": patient_id})
                                all_patient_id.add(patient_id)

                            # get line timestamp. if there is no timestamp, it will be 0
                            timestamp = 0
                            if "datetime" in fields:
                                timestamp += sum(
                                    datetime_to_int(
                                        line[i], config[insert_task]
                                        ["basedate"], config[insert_task]
                                        ["pattern"])
                                    for i in range(len(fields))
                                    if fields[i] == "datetime")
                            else:
                                # a separate date column and/or a separate time-of-day column may exist
                                if "date" in fields:
                                    timestamp += sum(
                                        date_to_int(
                                            line[i], config[insert_task]
                                            ["basedate"], config[insert_task]
                                            ["pattern"])
                                        for i in range(len(fields))
                                        if fields[i] == "date")
                                if "timestamp" in fields:
                                    timestamp += sum(
                                        timestamp_to_int(
                                            line[i], config[insert_task]
                                            ["pattern"])
                                        for i in range(len(fields))
                                        if fields[i] == "timestamp")

                            row_combine_field = dict()
                            # the dead `if idx >= len(line)` guard was removed: idx is always < len(line)
                            for idx in range(len(line)):
                                content, field = line[idx], config[
                                    insert_task]["select_column"][idx]

                                # if the field should not append or there is no content in the line, continue
                                if field == '' or len(content) == 0: continue

                                # if the field is patient_id or timestamp related, continue
                                if field in {
                                        "patient_id", "datetime", "date",
                                        "timestamp"
                                }:
                                    continue

                                # if the field is a status, the field content will not store in list style.
                                if field in status_field_set:
                                    # adhoc for field trouble_sleep_inverse
                                    if field == "trouble_sleep_inverse":
                                        data[patient_id][
                                            "trouble_sleep"] = str(
                                                5 - int(content))
                                    # adhoc for field low_gl
                                    elif field == "low_gl":
                                        data[patient_id][
                                            "low_gl"] = content.split(" ")[0]
                                    else:
                                        data[patient_id][field] = content
                                # adhoc for field weight_units (weight should in data before)
                                elif field == "weight_units":
                                    if content == "lbs":
                                        data[patient_id]["weight"] = str(
                                            LBS_TO_KG *
                                            float(data[patient_id]["weight"]))
                                # if the field is need store with timestamp
                                elif field in timestamp_field_set:
                                    # adhoc for field raw_gl: scale by 18 and store as gl
                                    # (presumably mmol/L -> mg/dL conversion — TODO confirm)
                                    if field == "raw_gl":
                                        content = str(float(content) * 18)
                                        field = "gl"

                                    # if field not in patient's data, initial from raw data in database
                                    if field not in data[patient_id]:
                                        data[patient_id][field] = \
                                         list() if patient_id not in raw_data or field not in raw_data[patient_id] \
                                             else raw_data[patient_id][field]

                                    # append the content with timestamp
                                    data[patient_id][field].append(
                                        [content, timestamp])
                                # if the field needs to combine to another field
                                elif field in combine_field_set:
                                    combine_field = combine_field_set[field]
                                    if combine_field not in row_combine_field:
                                        row_combine_field[combine_field] = 0
                                    row_combine_field[combine_field] += float(
                                        content)
                                # for the common field, store in list style
                                else:
                                    # if field not in patient's data, initial from raw data in database
                                    if field not in data[patient_id]:
                                        data[patient_id][field] = \
                                         list() if patient_id not in raw_data or field not in raw_data[patient_id] \
                                             else raw_data[patient_id][field]
                                    data[patient_id][field].append(content)

                            # ASSUMPTION: the combine field is the common field (not status or store with timestamp)
                            for field in row_combine_field:
                                if field not in data[patient_id]:
                                    data[patient_id][field] = list()
                                data[patient_id][field].append(
                                    str(row_combine_field[field]))

                        # update the progress bar
                        bar.update()

        # insert the preprocessed data to the database
        print("start to insert data to:", dataset_name)
        # time.clock() was removed in Python 3.8; perf_counter() is the direct replacement
        start = time.perf_counter()
        for patient_id in data:
            for field in data[patient_id]:
                # update the field in the database
                update_data(dataset, {"patient_id": patient_id},
                            {'$set': {
                                field: data[patient_id][field]
                            }})
        print("use time to insert:", time.perf_counter() - start)
def get_time_diff_list_status_vs_gl(dataset_name, args):
	"""
	Get mean glucose value based on the status result. The status should also store with timestamp.
	The function will statistic the glucose value when a status comes and before the next status comes.
	The glucose value will statistic every time interval and statistic at most time max duration.
	@param dataset_name: the list of dataset name for statistic
	@param args: contain the statistic field status, time interval and time max duration.
	             Currently does not has group information.
	@return: the json result of the query
	"""
	status, interval, max_duration = args
	
	# get time interval and time max duration value from their string forms
	time_interval, time_max_duration = get_time_from_str(interval), get_time_from_str(max_duration)

	# get hour digit so every formatted timestamp has a consistent width
	hour_digit = int(math.log(time_max_duration / time_hour, 10)) + 1
	
	# prepare the formatted timestamp for every possible time diff in [0, time_max_duration),
	# each one rounded down to its time interval bucket
	timestamp_trans_list = [int_to_timestamp(i // time_interval * time_interval, hour_digit = hour_digit)
	                        for i in range(time_max_duration)]

	# collect per patient the status timestamps and the (timestamp, glucose) pairs
	gl_dict, status_dict = dict(), dict()
	for name in dataset_name:
		for query_result in query_field(db[name], field = {"_id": 0, "patient_id": 1, status: 1, "gl": 1}):
			if "patient_id" in query_result and "gl" in query_result and status in query_result:
				patient_id = query_result["patient_id"]
				
				# add patient id to status_dict and gl_dict
				if patient_id not in status_dict: status_dict[patient_id] = list()
				if patient_id not in gl_dict: gl_dict[patient_id] = list()
				
				# append status datetime into status_dict, and (datetime, glucose value) into gl_dict;
				# the status content itself is not needed, only when it happened
				for status_result, datetime in query_result[status]:
					status_dict[patient_id].append(int(datetime))
				for gl_result, datetime in query_result["gl"]:
					gl_dict[patient_id].append((int(datetime), float(gl_result)))

	# for each patient sort the result so both lists can be walked with forward-only indices
	for patient_id in status_dict: status_dict[patient_id].sort()
	for patient_id in gl_dict: gl_dict[patient_id].sort(key = lambda x: x[0])

	# match every glucose reading to the status event that precedes it,
	# accumulating [sum, count] per (formatted) time diff
	statistic_result = dict()
	for patient_id in status_dict:
		# patient should also has glucose record
		if patient_id not in gl_dict: continue

		# get the patient's status and glucose data
		status_list, gl_list = status_dict[patient_id], gl_dict[patient_id]
		
		# two-pointer sweep: gl_idx never rewinds, which is valid because both lists are sorted
		status_idx, gl_idx = 0, 0
		while status_idx < len(status_list):
			# set a sliding window to find match datetime, the valid glucose datetime should in the sliding window.
			# the window closes either after time_max_duration or when the next status event arrives,
			# whichever comes first (the last status has no successor, hence the inf sentinel)
			left = status_list[status_idx]
			right = min(status_list[status_idx] + time_max_duration,
						float("inf") if status_idx >= len(status_list) - 1 else status_list[status_idx + 1])

			# ensure the glucose datetime bigger than the window's left side
			while gl_idx < len(gl_list) and left > gl_list[gl_idx][0]: gl_idx += 1
			
			while gl_idx < len(gl_list):
				# if the glucose datetime smaller than the windows's right side
				if right > gl_list[gl_idx][0]:
					# get the datetime diff; it is always < time_max_duration here, so the lookup is in range
					diff = timestamp_trans_list[gl_list[gl_idx][0] - left]
					if diff not in statistic_result: statistic_result[diff] = [0, 0]
					
					# append the glucose result into the datetime diff as a [sum, count] pair
					statistic_result[diff][0] += float(gl_list[gl_idx][1])
					statistic_result[diff][1] += 1

					# check for next glucose result
					gl_idx += 1
				# if the glucose datetime bigger than the windows's right side, we need to update the window
				else: break

			# get next status
			status_idx += 1

	# get the mean glucose value for every time diff from the [sum, count] pairs
	result = {timestamp: statistic_result[timestamp][0] / statistic_result[timestamp][1] for timestamp in statistic_result}

	# return the plot result
	return get_plot_json(result)