def test_path_resolution():
    """Check that ``validate`` expands ``~`` and collapses ``..`` segments."""
    from config import Path

    item = Path()
    home_dir = pathlib.Path(os.environ['HOME'])
    resolved = item.validate('~/foo/../bar')
    assert resolved == home_dir / 'bar'
def create_dataset_folder_structure():
    """
    Creates the folder structure for the new dataset.

    The dataset root (``DATASETS/FEATURES_DATASET``) is created first when it
    is missing, followed by every directory listed in ``new_sensor_paths``.

    Returns
    -------
    bool
        True when every sensor directory already existed or was created,
        False when creating a sensor directory failed with an ``OSError``.

    Raises
    ------
    OSError
        If the dataset root itself cannot be created (that call is made
        outside the guarded section).
    """
    dataset_root = Path(f'{DATASETS}/{FEATURES_DATASET}')
    if not os.path.exists(dataset_root):
        print(
            f'\nWARNING: The path does not exist. Creating new directory...\n{dataset_root}\n'
        )
        os.mkdir(dataset_root)

    try:
        for sensor_path in new_sensor_paths:
            if not os.path.exists(sensor_path):
                print(
                    f'\nWARNING: The path does not exist. Creating new directory...\n{sensor_path}\n'
                )
                os.mkdir(sensor_path)
            else:
                print("\nPath already exists!")
    except OSError:
        # Only filesystem failures are reported through the return value;
        # programming errors and KeyboardInterrupt are allowed to propagate.
        return False
    return True
def test_path_validate(tmp_path):
    """Exercise ``Path.validate`` with a missing path, a directory and a file.

    Each section builds a ``Path`` item with one option changed and checks
    which inputs come back unchanged and which raise ``ConfigError``.
    """
    from config import Path

    missing = tmp_path / 'nope'
    directory = tmp_path
    regular_file = tmp_path / 'yes'
    # Create an empty file so that ``regular_file`` exists on disk.
    regular_file.open('w').close()

    # Default item: every kind of path is accepted.
    default_item = Path()
    assert default_item.validate(None) is None
    for candidate in (missing, directory, regular_file):
        assert default_item.validate(candidate) == candidate

    # Directories rejected.
    no_dirs = Path(dir_okay=False)
    assert no_dirs.validate(None) is None
    for candidate in (missing, regular_file):
        assert no_dirs.validate(candidate) == candidate
    with pytest.raises(ConfigError):
        no_dirs.validate(directory)

    # Files rejected.
    no_files = Path(file_okay=False)
    assert no_files.validate(None) is None
    for candidate in (missing, directory):
        assert no_files.validate(candidate) == candidate
    with pytest.raises(ConfigError):
        no_files.validate(regular_file)

    # Path must not exist yet.
    must_be_new = Path(exists=False)
    assert must_be_new.validate(None) is None
    assert must_be_new.validate(missing) == missing
    for candidate in (regular_file, directory):
        with pytest.raises(ConfigError):
            must_be_new.validate(candidate)

    # None rejected.
    required = Path(allow_none=False)
    with pytest.raises(ConfigError):
        required.validate(None)
def process_repository(session, status, repository, query_iter):
    """Reload the source of every queried cell from its notebook file.

    Notebooks are read from ``repository.path`` when that directory exists,
    otherwise from the tar archive at ``repository.zip_path``. For Python
    code cells the source is passed through IPython's input transformer
    before being stored on the cell.

    Parameters
    ----------
    session
        Database session; receives ``add`` calls and one ``commit`` per
        notebook.
    status
        Object with a ``count`` attribute; it is increased by the number of
        queried items when the repository files are unavailable.
    repository
        Repository record providing ``path``, ``zip_path``, ``hash_dir2``
        and ``processed``.
    query_iter
        Iterable of 3-item rows whose first item is the cell and whose
        second item is the notebook it belongs to.
        NOTE(review): rows are grouped with ``itertools.groupby``, which only
        merges consecutive rows - presumably the query is ordered by
        notebook; confirm against the caller.

    Returns
    -------
    str
        ``"ok"`` on success, or a failure message when neither the
        repository directory nor its archive exists.
    """
    query_iter = list(query_iter)
    zip_path = None
    tarzip = None
    if not repository.path.exists():
        if not repository.zip_path.exists():
            repository.processed |= consts.R_UNAVAILABLE_FILES
            session.add(repository)
            status.count += len(query_iter)
            return "Failed. Repository not found: {}".format(repository)
        tarzip = tarfile.open(str(repository.zip_path))
        zip_path = Path(repository.hash_dir2)

    try:
        shell = InteractiveShell.instance()
        for notebook, new_iter in groupby(query_iter, lambda x: x[1]):
            # Materialise the rows of *this* notebook only, so the reported
            # count matches the cells that are actually processed below.
            cells = list(new_iter)
            vprint(1, "Processing notebook: {}. Found {} cells".format(notebook, len(cells)))
            name = notebook.name
            vprint(2, "Loading notebook file")
            if tarzip is not None:
                notebook = nbf.read(
                    tarzip.extractfile(tarzip.getmember(str(zip_path / name))),
                    nbf.NO_CONVERT
                )
            else:
                with open(str(repository.path / name)) as ofile:
                    notebook = nbf.read(ofile, nbf.NO_CONVERT)
            notebook = nbf.convert(notebook, 4)
            metadata = notebook["metadata"]
            language_info = metadata.get("language_info", {})
            language_name = language_info.get("name", "unknown")

            for cell, _, _ in cells:
                vprint(2, "Loading cell {}".format(cell.index))
                index = int(cell.index)
                notebook_cell = notebook["cells"][index]
                source = notebook_cell.get("source", "")
                if language_name == "python" and notebook_cell.get("cell_type") == "code":
                    try:
                        source = shell.input_transformer_manager.transform_cell(source)
                    except (IndentationError, SyntaxError):
                        # Keep the raw source when it cannot be transformed.
                        pass
                cell.source = source
                if cell.processed & consts.C_MARKED_FOR_EXTRACTION:
                    cell.processed -= consts.C_MARKED_FOR_EXTRACTION
                session.add(cell)
            session.commit()
    finally:
        # Release the archive handle even when a notebook fails to load.
        if tarzip is not None:
            tarzip.close()
    return "ok"
def sort_dataset_by_age():
    """
    Sorts the Dataset created by create_dataset() into a new Age sorted Dataset.

    For every age bin returned by ``get_limits(ageGroups)``, the sensor files
    of each subject whose 'Age' lies inside the bin (both bounds inclusive)
    are copied from the directories in ``new_sensor_paths`` to the matching
    directories in ``sensor_dirs[target_folder]``. A subject is counted once
    if at least one of its sensor files was copied. The per-bin and overall
    counts are printed.
    """
    data = read_csv(Path(f'{data_files_path}/subject_data'))
    limits = get_limits(ageGroups)
    sortedCount = 0

    # For every age bin
    for target_folder, limit in limits.items():
        in_bin = (data['Age'] >= limit[0]) & (data['Age'] <= limit[1])
        subjectCount = 0

        # Label-based selection: the mask keeps the original index labels,
        # so the rows must not be looked up by position.
        for filename in data.loc[in_bin, 'Filename']:
            # Swap the last four characters (the extension) for ".csv"
            csv_name = f'{filename[:-4]}.csv'
            copied_any = False

            for src, dest in zip(new_sensor_paths, sensor_dirs[target_folder]):
                src_file = Path(f'{src}/{csv_name}')
                # Copy only the sensor files that exist for this subject
                if os.path.exists(src_file):
                    copyfile(src_file, Path(f'{dest}/{csv_name}'))
                    copied_any = True

            if copied_any:
                sortedCount += 1
                subjectCount += 1

        print(f'\n# of Subjects in "{target_folder}" = {subjectCount}')

    total = len(data)
    # Avoid a ZeroDivisionError when the subject table is empty
    percentage = round((sortedCount / total) * 100, 2) if total else 0.0
    print(
        f'\nTotal subjects sorted = {sortedCount} ({percentage}% of total data)\n'
    )
def create_dataset(subs_list, indexing=True):
    """
    Creates the New Dataset using features calculated from the base data.

    For each subject, one tab-separated .csv file per sensor is written into
    the first three directories of ``new_sensor_paths``. Files that already
    exist are left untouched, so an interrupted run can be resumed.

    Parameters
    ----------
    subs_list : list
        list of subjects to create the new dataset for
    indexing : bool, optional
        whether to write the index column to the files (default = True)

    Raises
    ------
    Exception
        Any error raised while processing is re-raised unchanged after the
        failing process and subject have been printed.
    """
    current_sub = None
    print(
        f'\nProcess - {current_process().name} has {len(subs_list)} files to work on.\n'
    )
    try:
        start = time()
        # Lazily build the subjects so only one is held at a time
        repo = (Subject(sub) for sub in subs_list)

        for sub in repo:
            current_sub = sub
            file_name = f'{sub.subject_id[:-4]}.csv'
            for i in range(3):
                filePath = Path(f'{new_sensor_paths[i]}/{file_name}')
                if not os.path.exists(filePath):
                    # Most expensive line of code in the module (Takes hours)
                    col_names, df, _, _, _ = feature_extractor(
                        sub, sensors[i].lower(), output_type='df')
                    df.to_csv(filePath, sep="\t", index=indexing)
                    print(
                        f"File generated - '{file_name}' by process : {current_process().name}"
                    )
                else:
                    print(f'File "{file_name}" already exists!')

        print(
            f'\nTime taken by - {current_process().name} : {time() - start:.2f} secs'
        )
    except Exception:
        print(f"Exception occurred in {current_process().name}\n")
        print(f'While working on this portion of the subs_list:\n'
              f'{subs_list}')
        # No subject is available when the failure happens before the first
        # one is loaded; reporting must not mask the original error.
        if current_sub is not None:
            print(f'Error occurred in FILE # {current_sub.subject_id}\n')
        else:
            print('Error occurred before the first subject was loaded\n')
        raise
def create_age_folder_structure():
    """
    Creates the folder structure for the Age Sorted Dataset.

    Three levels are created in order, each directory only when it is
    missing: the dataset root ``DATASETS/<FEATURES_DATASET>_Age_Sorted``,
    every directory in ``age_dirs`` and every directory listed in
    ``sensor_dirs``.

    Returns
    -------
    bool
        True when all three levels were processed, False as soon as creating
        a directory fails with an ``OSError`` (an error message is printed).
    """
    # Level 1: root of the age sorted dataset
    try:
        new_dataset_path = Path(f'{DATASETS}/{FEATURES_DATASET}_Age_Sorted')
        if not os.path.exists(new_dataset_path):
            print(
                f'\nWARNING: The path does not exist. Creating new directory...\n{new_dataset_path}\n'
            )
            os.mkdir(new_dataset_path)
    except OSError:
        print(
            "ERROR in creating the sorted dataset_operations directory within folder /Data Sets"
        )
        return False

    # Level 2: one directory per age group
    try:
        for folder, age_dir in age_dirs.items():
            if not os.path.exists(age_dir):
                os.mkdir(age_dir)
            else:
                print(f"The directory {folder} already exists.")
    except OSError:
        print(
            "ERROR in creating age based directories in /Data Sets/Dataset_Age_Sorted"
        )
        return False

    # Level 3: sensor directories inside every age group
    try:
        for sensor_dir in sensor_dirs.values():
            for sub_path in sensor_dir:
                if not os.path.exists(sub_path):
                    os.mkdir(sub_path)
                else:
                    print(f"The directory {sub_path} already exists.")
    except OSError:
        print(
            "ERROR in creating sensor directories in /Data Sets/Dataset_Age_Sorted/[age_Groups]"
        )
        return False

    return True
def main():
    """Report on the process ids listed in the ``.pid`` file.

    The file is read from the current working directory, one pid per line;
    nothing happens when it does not exist. By default each pid is printed
    with its command line, or ``<not found>`` when no such process exists.

    Options
    -------
    -c, --count
        Print only the number of pids that belong to existing processes.
    -e, --clear
        Rewrite ``.pid`` so that it only lists existing processes.
    -s, --simplify
        Cut command lines longer than 20 arguments and append ``...``.
    """
    parser = argparse.ArgumentParser(
        description="Check pid")
    parser.add_argument("-c", "--count", action='store_true',
                        help="count active processes")
    parser.add_argument("-e", "--clear", action='store_true',
                        help="clear not running processes")
    parser.add_argument("-s", "--simplify", action='store_true',
                        help="simplify output")
    args = parser.parse_args()

    if not Path(".pid").exists():
        return
    with open(".pid", "r") as fil:
        pids = fil.readlines()

    new_pids = []
    for pid in pids:
        pid = pid.strip()
        if not pid:
            continue
        try:
            process = psutil.Process(int(pid))
            if not args.count:
                try:
                    cmd = process.cmdline()
                except psutil.AccessDenied:
                    # The process exists but its command line is not
                    # readable; keep the pid instead of aborting the run.
                    cmd = ["<access denied>"]
                if args.simplify and len(cmd) > 20:
                    cmd = cmd[:20] + ["..."]
                print("{}: {}".format(pid, " ".join(cmd)))
            new_pids.append(pid)
        except (psutil.NoSuchProcess, ValueError):
            # ValueError: the line is not an integer, so it cannot name a
            # running process; treat it like a pid that no longer exists.
            if not args.count and not args.clear:
                print("{}: <not found>".format(pid))

    if args.count:
        print(len(new_pids))
    if args.clear:
        with open(".pid", "w") as fil:
            fil.write("\n".join(new_pids) + "\n")
def file_exists(subs_list):
    """
    Checks to see if any previous files with feature extracted data exist in
    the Dataset and returns the updated list of files which don't exist in the
    Dataset. This is done because generating the files is expensive and this
    avoids having to start over from scratch.

    A subject is considered done only when its .csv file is present in every
    directory of ``new_sensor_paths``; subjects with at least one missing
    file are kept.

    Parameters
    ----------
    subs_list : list
        Complete subjects list

    Returns
    -------
    updated_subs : list
        sorted list, without duplicates, of subject files which are not
        already complete in the new Dataset
    """
    print(f'Checking for existing files in directories:\n')
    for sensor_path in new_sensor_paths:
        print(f'{sensor_path}')
    print()

    def _is_complete(sub):
        # The last four characters (the extension) are replaced by ".csv"
        return all(
            os.path.exists(Path(f'{sensor_path}/{sub[:-4]}.csv'))
            for sensor_path in new_sensor_paths
        )

    updated_subs = sorted({sub for sub in subs_list if not _is_complete(sub)})

    print(f'There were {len(subs_list) - len(updated_subs)} existing files!\n')
    print(
        f'The updated subjects list now contains {len(updated_subs)} entries.\n'
    )
    return updated_subs
class Foo(Configurable):
    """Configurable that declares a single ``path`` item."""

    # Declared with ``allow_none=False``.
    # NOTE(review): presumably this makes ``None`` an invalid value for the
    # item - confirm against the ``Path`` implementation.
    path = Path(allow_none=False)
# Configuration Variables # ------------------------ GENERATE_DATASET = True SORT_BY_AGE = False TESTING = True TEST_COUNT = 8 # Should be >= 4 # ------------------------ if not TESTING: FEATURES_DATASET = FEATURES_DATASET else: FEATURES_DATASET = FEATURES_DATASET + "_TEST" new_sensor_paths = [ Path(f"{DATASETS}/{FEATURES_DATASET}/{sensor}") for sensor in sensors ] if not os.path.exists(DATASETS): print( f'\nWARNING: The path does not exist. Creating new directory...\n{DATASETS}\n' ) os.mkdir(DATASETS) def create_dataset_folder_structure(): """ Creates the folder structure for the new dataset_operations. """
# Performance metric to optimize the model for SCORING = 'f1_weighted' # Set to True if TESTING with the Python CONSOLE TESTING = False # If True, the dataset_operations is normalized before training & testing DATA_NORMALIZATION = True # If True, a selected portion of the entire dataset_operations is used for training+testing (# of rows = row_count) DATA_REDUCE = False # If True, generate a .csv file for the feature ranking GEN_RANKING_FILE = False # If True, a plot will be generated for the # of features used vs performance metric PLOT = False # If True, trained model is exported to TRAINED_MODEL_PATH EXPORT_MODEL = False # Paths # Directory name for new data set which contains the training/testing data for the classifier PROCESSED_DATASET = "Processed_Dataset" # Directory path for new data set which contains the training/testing data for the classifier PROCESSED_DATASET_PATH = Path(f'{DATASETS}/{PROCESSED_DATASET}') # loading in the actual dataset for the ML classifier DATA_PATH = Path(f"{PROCESSED_DATASET_PATH}/ds_all.csv") # Trained Model directory name TRAINED_MODEL_DIR = 'Trained Models' # Trained Model directory path TRAINED_MODEL_PATH = Path(f'{ROOT}/{TRAINED_MODEL_DIR}') # Trained Model name TRAINED_MODEL_NAME = 'step_detection_model_test.pkl' # Trained Normalizer name TRAINED_NORMALIZER_NAME = 'step_detection_min_max_norm_test.pkl'