def remove_processed_subjects(gt3x_files):
    """
	Remove files that are already processed.
	It extracts the subject ID from the file name location and checks whether that ID is already part of the HDF5 file.

	Parameters
	---------
	gt3x_files : list
		list of file locations of the raw gt3x file

	Returns
	---------
	gt3x_files : list
		filtered list of gt3x files (removed file locations that are already processed)
	"""

    # read already processed subject IDs
    processed_subjects = get_all_subjects_hdf5(hdf5_file=HDF5_SAVE)

    # here we extract the 8 digit subject ID from the file name and check whether it has already been processed; if so, we don't want to include it
    gt3x_files = [
        file for file in gt3x_files
        if re.search(r'[0-9]{8}', file).group(0) not in processed_subjects
    ]

    # return the filtered files
    return gt3x_files
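
# Minimal usage sketch (not part of the original module): collect raw .gt3x files
# and keep only those whose 8-digit subject ID is not yet in the HDF5 file.
# The folder 'data/raw-gt3x' is a hypothetical example location.
def example_collect_unprocessed_gt3x_files(gt3x_folder=os.path.join('data', 'raw-gt3x')):

    import glob

    # find all raw .gt3x files in the folder
    gt3x_files = glob.glob(os.path.join(gt3x_folder, '*.gt3x'))

    # drop files whose subject ID is already a group in the HDF5 file
    return remove_processed_subjects(gt3x_files)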
def batch_create_epoch_datasets(use_parallel = True, num_jobs = cpu_count(), limit = None, dataset_prefix = 'epoch'):
	"""
	Create epoch n-seconds datasets, where n could be 10s, 20s, 30s etc. These datasets are pre-created so the grid-search analysis can run faster.
	"""

	# get all the subjects from the hdf5 file and remove subjects with invalid data (optionally limited to the first 'limit' subjects)
	subjects = [s for s in get_all_subjects_hdf5(hdf5_file = ACTIWAVE_ACTIGRAPH_MAPPING_HDF5_FILE) if s not in get_subjects_with_invalid_data()][:limit]

	# seconds of epoch data, e.g. 10s epoch, 20s epoch
	S = range(10,61,10)

	if use_parallel:

		# verbose
		logging.info('Processing in parallel (parallelization on)')

		# use parallel processing to speed up processing time
		executor = Parallel(n_jobs = num_jobs, backend = 'multiprocessing')
		# create tasks so we can execute them in parallel
		tasks = (delayed(create_epoch_datasets)(subject = subject, S = S, dataset_prefix = dataset_prefix,  idx = i, total = len(subjects)) for i, subject in enumerate(subjects))
		# execute task
		executor(tasks)

	else:

		# loop over all the subjects and create the epoch datasets one-by-one
		for idx, subject in enumerate(subjects):

			create_epoch_datasets(subject, S, dataset_prefix, idx, len(subjects))
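
# Minimal usage sketch (argument values are illustrative only): create the pre-computed
# 10s-60s epoch datasets either for all valid subjects in parallel, or for a single
# subject by calling create_epoch_datasets directly.
def example_create_epoch_datasets(subject = None, num_jobs = 4):

	if subject is None:
		# batch mode: all valid subjects, processed in parallel
		batch_create_epoch_datasets(use_parallel = True, num_jobs = num_jobs, dataset_prefix = 'epoch')
	else:
		# single subject, using the same 10s-60s epoch lengths as the batch function above
		create_epoch_datasets(subject = subject, S = range(10, 61, 10), dataset_prefix = 'epoch', idx = 0, total = 1)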
def batch_process_non_wear_algorithm(algorithm, limit = None, skip_n = 0, use_parallel = True, num_jobs = cpu_count(), save_hdf5 = ACTIWAVE_ACTIGRAPH_MAPPING_HDF5_FILE):
	"""
	Batch process finding non-wear time based on the following algorithms:
		- hecht_2009_triaxial_calculate_non_wear_time
		- troiano_2007_calculate_non_wear_time
		- choi_2011_calculate_non_wear_time
		- hees_2013_calculate_non_wear_time

	Parameters
	-----------
	algorithm : function
		the function that implements the non-wear time detection method (one of the functions listed above)
	limit : int (optional)
		limit the number of subjects to be processed
	skip_n : int (optional)
		skip first N subjects
	use_parallel : Boolean (optional)
		set to True if subjects need to be processed in parallel; this will execute much faster
	num_jobs : int (optional)
		if parallel is set to True, then this indicates how many jobs need to be executed at the same time. Defaults to the number of CPU cores
	save_hdf5 : os.path
		location of HDF5 file to save data to
	"""

	# get all the subjects from the hdf5 file and remove subjects with invalid data
	subjects = [s for s in get_all_subjects_hdf5(hdf5_file = ACTIWAVE_ACTIGRAPH_MAPPING_HDF5_FILE) if s not in get_subjects_with_invalid_data()][0 + skip_n:limit]

	logging.info('Start batch processing estimating non-wear time based on {}'.format(algorithm.__name__))

	# loop over the subjects
	if use_parallel:

		# verbose
		logging.info('Processing in parallel (parallelization on)')

		# use parallel processing to speed up processing time
		executor = Parallel(n_jobs = num_jobs, backend = 'multiprocessing')
		# create tasks so we can execute them in parallel
		tasks = (delayed(algorithm)(subject = subject, idx = i, total = len(subjects), save_hdf5 = save_hdf5) for i, subject in enumerate(subjects))
		# execute task
		executor(tasks)

	else:

		# verbose
		logging.info('Processing one-by-one (parallelization off)')

		# loop over the subjects
		for i, subject in enumerate(subjects):

			algorithm(subject = subject, idx = i, total = len(subjects), save_hdf5 = save_hdf5)
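
# Minimal usage sketch (assumption: the four non-wear algorithm functions listed in the
# docstring above are imported into this module). Each call processes all valid subjects
# with one algorithm and stores the estimated non-wear time in the HDF5 file.
def example_run_all_non_wear_algorithms(num_jobs = cpu_count()):

	# the four algorithms supported by batch_process_non_wear_algorithm
	algorithms = [	hecht_2009_triaxial_calculate_non_wear_time,
					troiano_2007_calculate_non_wear_time,
					choi_2011_calculate_non_wear_time,
					hees_2013_calculate_non_wear_time]

	# run each algorithm in turn over all subjects
	for algorithm in algorithms:
		batch_process_non_wear_algorithm(algorithm = algorithm, use_parallel = True, num_jobs = num_jobs)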
def batch_process_plot_non_wear_algorithms(limit = None, skip_n = 0, plot_folder = os.path.join('plots', 'non-wear-time', 'algorithms')):
	"""
	Batch process plotting of non-wear time as detected by the various non-wear algorithms

	Parameters
	-----------
	limit : int (optional)
		limit the number of subjects to be processed
	skip_n : int (optional)
		skip first N subjects
	"""

	# get all the subjects from the hdf5 file (subjects are individuals who participated in the Tromso Study #7)
	subjects = [s for s in get_all_subjects_hdf5(hdf5_file = ACTIWAVE_ACTIGRAPH_MAPPING_HDF5_FILE) if s not in get_subjects_with_invalid_data()][0 + skip_n:limit]

	# loop over the subjects
	for i, subject in enumerate(subjects):

		# verbose
		logging.info('Processing subject: {} {}/{}'.format(subject, i, len(subjects)))

		# call plot function
		process_plot_non_wear_algorithms(subject, plot_folder)
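
# Minimal usage sketch (argument values are illustrative only): plot the non-wear
# algorithm output for the first 10 valid subjects into the default plot folder.
def example_plot_non_wear_algorithms_for_first_subjects():

	batch_process_plot_non_wear_algorithms(limit = 10, skip_n = 0)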
def process_gt3x_file(f,
                      i=1,
                      total=1,
                      hdf5_save_location=HDF5_SAVE,
                      delete_zip_folder=True):
    """
	Process .gt3x file
	- unzip into log.bin and info.txt
	- extract information from info.txt
	- extract information from log.bin
	- save data to hdf5 file

	Parameters
	----------
	f : string
		file location of the .gt3x file
	i : int (optional)
		index of the file to be processed; used to display a progress counter, e.g. processing 12/20. Default = 1.
	total : int (optional)
		total number of files to be processed; used to display a progress counter, e.g. processing 12/20. Default = 1.
	hdf5_save_location : os.path
		folder location where the extracted acceleration data is saved.
	delete_zip_folder : Boolean (optional)
		if True (default), delete the unzipped folder (containing log.bin and info.txt) after processing.
	"""

    logging.debug('Processing GT3X binary file: {} {}/{}'.format(
        f, i + 1, total))

    # unzip the raw .gt3x file: this will provide a log.bin and info.txt file
    # the save_location is a new folder with the same name as the .gt3x file
    log_bin, info_txt = unzip_gt3x_file(f, save_location=f.split('.')[0])

    # check if unzipping went ok
    if log_bin is not None:

        # print verbose
        logging.debug('log.bin location: {}'.format(log_bin))
        logging.debug('info.txt location: {}'.format(info_txt))

        # get info data from info file
        info_data = extract_info(info_txt)

        # check if subject name could be read from the binary file
        if info_data['Subject_Name'] != "":

            # check if subject ID already processed
            if info_data['Subject_Name'] not in get_all_subjects_hdf5(
                    hdf5_file=HDF5_SAVE):

                # retrieve log_data, i.e. accelerometer data, and log_time, i.e. timestamps of the acceleration data
                log_data, log_time = extract_log(
                    log_bin,
                    acceleration_scale=float(info_data['Acceleration_Scale']),
                    sample_rate=int(info_data['Sample_Rate']))

                # check if log data is not None (with None something went wrong during reading of the binary file)
                if log_data is not None:

                    # save log_data to HDF5 file
                    save_data_to_group_hdf5(group=info_data['Subject_Name'],
                                            data=log_data,
                                            data_name='log',
                                            meta_data=info_data,
                                            overwrite=True,
                                            hdf5_file=hdf5_save_location)

                    # save log_time data to HDF file
                    save_data_to_group_hdf5(group=info_data['Subject_Name'],
                                            data=log_time,
                                            data_name='time',
                                            meta_data=info_data,
                                            overwrite=True,
                                            hdf5_file=hdf5_save_location)

                else:
                    logging.error(
                        'Unable to convert .gt3x file: {} (subject {})'.format(
                            f, info_data['Subject_Name']))
            else:
                logging.info(
                    'Subject name already defined as group in HDF5 file: {}, skipping..'
                    .format(info_data['Subject_Name']))
        else:
            logging.error(
                "Unable to read subject from info.txt file, skipping file: {}".
                format(f))
    else:
        logging.error("Error unzipping file: {}".format(f))

    # delete the created zip folder
    if delete_zip_folder:
        delete_directory(f.split('.')[0])

    # print time and memory
    set_end(tic, process)
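
# Minimal batch-processing sketch (not part of the original module), combining
# remove_processed_subjects and process_gt3x_file with the same Parallel/delayed
# pattern used by the batch_* functions above. gt3x_files is a list of raw .gt3x
# file locations.
def example_batch_process_gt3x_files(gt3x_files, num_jobs=cpu_count()):

    # skip files whose subject ID is already a group in the HDF5 file
    gt3x_files = remove_processed_subjects(gt3x_files)

    # set up the parallel executor
    executor = Parallel(n_jobs=num_jobs, backend='multiprocessing')

    # one task per file; i and total are only used for the progress counter
    tasks = (delayed(process_gt3x_file)(f=f, i=i, total=len(gt3x_files))
             for i, f in enumerate(gt3x_files))

    # execute the tasks
    executor(tasks)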
def perform_cv_grid_search(method, nw_method, num_jobs = cpu_count(), save_folder = os.path.join('files', 'grid-search-cv-hecht')):
	"""
	Perform cross validated grid search

	Parameters
	-----------
	method : string
		what type of data to process. Options are 'epoch' and 'raw'
	nw_method : string
		which non wear method to use. Options are 'hecht', 'troiano', 'choi', and 'hees'
	num_jobs : int
		number of parallel processes to use to speed up calculation
	save_folder : os.path
		folder location to save classification results to
	"""

	# create list of all possible grid search parameter values combinations
	combinations, *_ = _get_grid_search_parameter_combinations(nw_method)
	
	# number of cross validations
	cv = 10
	# train / test split
	split = .3
	# cross validation metric
	cv_metric = 'f1'

	# get all the subjects from the hdf5 file and remove subjects with invalid data
	subjects = [s for s in get_all_subjects_hdf5(hdf5_file = ACTIWAVE_ACTIGRAPH_MAPPING_HDF5_FILE) if s not in get_subjects_with_invalid_data()]
 
	# dictionary mapping subject to a binary non-wear class label (used later to create a stratified split)
	subject_to_nw_sequence = {}

	"""
		LOAD DATA FROM ALL SUBJECTS
	"""

	# empty dictionary to populate with acceleration data
	subjects_data = {x : {'data' : None, 'true_nw_time' : None} for x in subjects}

	# parallel processing
	executor = Parallel(n_jobs = num_jobs, backend = 'multiprocessing')

	# create tasks so we can execute them in parallel
	tasks = (delayed(_read_epoch_and_true_nw_data)(subject = subject, i = i, total = len(subjects), return_epoch = False if method == 'raw' else True) for i, subject in enumerate(subjects))

	# execute tasks and process the return values
	for subject, subject_data, subject_true_nw in executor(tasks):
		
		# add to dictionary. Note that data will be None if method = 'raw'. See function _read_epoch_and_true_nw_data with return_epoch = False
		subjects_data[subject]['data'] = subject_data
		subjects_data[subject]['true_nw_time'] = subject_true_nw

		# get all indexes with non wear time (nw is encoded as 1, 0 = wear time)
		non_wear_indexes = np.where(subject_true_nw == 1)[0]

		# set class label
		subject_to_nw_sequence[subject] = 0 if len(find_consecutive_index_ranges(non_wear_indexes)) == 1 else 1

	"""
		GET TRAINING AND TEST SET
	"""

	# get list with 0 if no non wear time and 1 if non wear time exists somewhere in the true nw sequence (this can be used to create a stratified split)
	subjects_label = [subject_to_nw_sequence[x] for x in subjects]

	# split subjects into training and testing subjects
	# train_subjects, test_subjects, *_ = create_train_test_split(subjects, subjects_label, test_size = split, shuffle = False, random_state = 42, stratify = subjects_label)
	train_subjects, test_subjects, *_ = create_train_test_split(subjects, subjects_label, test_size = split, shuffle = False)

	"""
		PERFORM CROSS VALIDATION
	"""
	# dictionary to store fold results
	all_fold_results = {x : {'combination' : None, 'training_results' : None, 'test_results' : None} for x in range(cv)}

	# start a manager to share the tracker dictionary across parallel processes
	manager = Manager()
	# create the tracker as a manager dictionary
	subject_combination_tracker = manager.dict()
	
	# loop over each fold
	fold_cnt = 0
	for train_idx, test_idx in return_stratified_k_folds(n_splits = cv).split(train_subjects, [subject_to_nw_sequence[s] for s in train_subjects]):

		# keep track of time it takes to complete one fold
		epoch_tic = time.time()

		logging.info('{style} Processing Fold : {} {style}'.format(fold_cnt + 1, style = '='*10))

		# get the training subjects part of the fold
		train_fold_subjects = [train_subjects[x] for x in train_idx]
		# get the test subjects of the fold
		test_fold_subjects = [train_subjects[x] for x in test_idx]

		# keep track of confusion matrix results per combination
		combination_to_confusion_matrix = {x : None for x in combinations}

	# parallel processing of the parameter combinations
		executor = Parallel(n_jobs = num_jobs, backend = 'multiprocessing')
		
		# create tasks so we can execute them in parallel
		tasks = (delayed(_calculate_subject_combination_confusion_matrix)(method = method, combination = combination, subjects = train_fold_subjects, nw_method = nw_method, \
				subject_combination_tracker = subject_combination_tracker, subjects_data = subjects_data, idx = idx, total = len(combinations)) for idx, combination in enumerate(combinations))
		
		# execute tasks and process the return values
		for combination, cf_train in executor(tasks):

			combination_to_confusion_matrix[combination] = calculate_classification_performance(*cf_train)
			
			
		logging.debug('-\tItems in combination tracker: {}'.format(len(subject_combination_tracker)))

	# find the combination with the highest score on the cross validation metric (here f1)
		top_combination = sorted(combination_to_confusion_matrix.items(), key = lambda item: item[1][cv_metric], reverse = True)[0]

		# apply top combination on test subjects
		_, cv_confusion_test = _calculate_subject_combination_confusion_matrix(method = method, combination = top_combination[0], subjects = test_fold_subjects, nw_method = nw_method, \
						subject_combination_tracker = subject_combination_tracker, subjects_data = subjects_data)

		# save fold results
		all_fold_results[fold_cnt]['combination'] = top_combination[0]
		all_fold_results[fold_cnt]['training_results'] = top_combination[1]
		all_fold_results[fold_cnt]['test_results'] = calculate_classification_performance(*cv_confusion_test)
		
		# verbose
		logging.info('-\ttop combination: {}'.format(all_fold_results[fold_cnt]['combination']))
		logging.info('-\ttraining results: {}'.format(all_fold_results[fold_cnt]['training_results']))
		logging.info('-\ttest results: {}'.format(all_fold_results[fold_cnt]['test_results']))
		logging.info('-\texecuted fold in {} seconds'.format(time.time() - epoch_tic))

		# increase fold counter
		fold_cnt += 1

	# get the combination, across folds, with the best score on the cross validation metric
	top_cv_combination = None 
	top_cv_metric = 0

	for value in all_fold_results.values():
		if value['test_results'][cv_metric] > top_cv_metric:
			top_cv_metric = value['test_results'][cv_metric]
			top_cv_combination = value['combination']
		
	logging.info('Top combination training: {}, {}: {}'.format(top_cv_combination, cv_metric, top_cv_metric))

	# combine the per-fold test (i.e. validation) classification results of the cross validation
	combined_training_results = {'accuracy': [], 'precision': [], 'specificity': [], 'recall': [], 'f1': [], 'ppv': [], 'npv': []}
	for fold_results in all_fold_results.values():
		
		for result_key in combined_training_results.keys():

			combined_training_results[result_key].append(fold_results['test_results'][result_key])
	
	# calculate average of classification scores
	for key, value in combined_training_results.items():
		combined_training_results[key] = np.nanmean(value)

	logging.info('='*60)	
	logging.info('{}-Fold cross validation Training results: {}'.format(cv, combined_training_results))	

	# try best combination obtained from cross validation on test subjects
	_, confusion_test = _calculate_subject_combination_confusion_matrix(method = method, combination = top_cv_combination, subjects = test_subjects, nw_method = nw_method, subjects_data = subjects_data)

	# get test classification performance
	test_results = calculate_classification_performance(*confusion_test)

	logging.info('{}-Fold cross validation Test results: {}'.format(cv, test_results))	

	# classification results
	classification_data = {	'combination' : top_cv_combination, 
							'training' : all_fold_results,
							'combined_training' : combined_training_results,
							'test' : test_results}
	
	# save classification results to disk
	save_pickle(classification_data, 'cv-grid-search-results-{}'.format(nw_method), save_folder)

	# save tracker
	save_pickle(dict(subject_combination_tracker), 'cv-grid-search-tracker-{}'.format(nw_method), save_folder)
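
# Minimal usage sketch (argument values are illustrative only): run the cross validated
# grid search for the Hecht non-wear method on the pre-computed epoch data; results and
# the combination tracker are pickled into the default save_folder.
def example_run_cv_grid_search():

	perform_cv_grid_search(method = 'epoch', nw_method = 'hecht', num_jobs = cpu_count())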
def perform_grid_search(method, nw_method, num_jobs = cpu_count(), save_folder = os.path.join('files', 'grid-search-hecht')):
	"""
	Perform grid search analysis on epoch or raw data. For epoch data, set method = 'epoch', for raw set method = 'raw'

	Parameters
	-----------
	method : string
		what type of data to process. Options are 'epoch' and 'raw'
	nw_method : string
		which non wear method to use. Options are 'hecht', 'troiano', 'choi', and 'hees'
	num_jobs : int
		number of parallel processes to use to speed up calculation
	save_folder : os.path
		folder location to save classification results to
	"""

	# create list of all possible grid search parameter values combinations
	combinations, *_ = _get_grid_search_parameter_combinations(nw_method)
	
	# get all the subjects from the hdf5 file and remove subjects with invalid data
	subjects = [s for s in get_all_subjects_hdf5(hdf5_file = ACTIWAVE_ACTIGRAPH_MAPPING_HDF5_FILE) if s not in get_subjects_with_invalid_data()]

	"""
		LOAD DATA FROM ALL SUBJECTS
	"""

	# empty dictionary to populate with acceleration data
	subjects_data = {x : {'data' : None, 'true_nw_time' : None} for x in subjects}

	# parallel processing
	executor = Parallel(n_jobs = num_jobs, backend = 'multiprocessing')

	# create tasks so we can execute them in parallel
	tasks = (delayed(_read_epoch_and_true_nw_data)(subject = subject, i = i, total = len(subjects), return_epoch = False if method == 'raw' else True) for i, subject in enumerate(subjects))

	# execute tasks and process the return values
	for subject, subject_data, subject_true_nw in executor(tasks):
		
		# add to dictionary. Note that data will be None if method = 'raw'. See function _read_epoch_and_true_nw_data with return_epoch = False
		subjects_data[subject]['data'] = subject_data
		subjects_data[subject]['true_nw_time'] = subject_true_nw

	"""
		PERFORM GRID SEARCH
	"""

	# keep track of confusion matrix results per combination
	combination_to_confusion_matrix = {x : None for x in combinations}

	# parallel processing of the parameter combinations
	executor = Parallel(n_jobs = num_jobs, backend = 'multiprocessing')
	
	# create tasks so we can execute them in parallel
	tasks = (delayed(_calculate_subject_combination_confusion_matrix)(method = method, combination = combination, subjects = subjects, nw_method = nw_method, subjects_data = subjects_data, idx = idx, total = len(combinations)) for idx, combination in enumerate(combinations))
	
	# execute tasks and process the return values
	for combination, cf_matrix in executor(tasks):

		# save classification performance
		combination_to_confusion_matrix[combination] = calculate_classification_performance(*cf_matrix)

	# save classification results to disk
	save_pickle(combination_to_confusion_matrix, 'grid-search-results-{}'.format(nw_method), save_folder)
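
# Minimal usage sketch (argument values are illustrative only; whether a given non-wear
# method should be evaluated on 'epoch' or 'raw' data is an assumption to adjust): run
# the plain grid search for the Hecht method on epoch data and pickle the results.
def example_run_grid_search():

	perform_grid_search(method = 'epoch', nw_method = 'hecht', num_jobs = cpu_count())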