def _get_csv_mrns(args):
    """
    Collect the MRNs listed in the CSV file(s) under ``args.path_to_csv``.

    Nothing is read unless ``args.path_to_csv_deidentified`` is set; in that
    case an empty set is returned. ``args.path_to_csv`` may be a directory
    (walked recursively for files whose extension equals ``CSV_EXT``) or the
    path of a single CSV file.

    A file for which ``patient_csv_to_set`` raises ``ValueError`` is reported
    on stdout, recorded in the module-level ``path_of_csv_to_skip`` set, and
    skipped.

    :param args: Namespace-like object exposing ``path_to_csv`` and
                 ``path_to_csv_deidentified``.
    :return: Set of MRNs converted to ``int``.
    """
    global path_of_csv_to_skip

    mrns = set()
    if args.path_to_csv_deidentified is None:
        return mrns

    source = args.path_to_csv
    if os.path.isdir(source):
        # Directory: gather every CSV found anywhere below it.
        fpaths = [
            os.path.join(root, fname)
            for root, _dirs, fnames in os.walk(source)
            for fname in fnames
            if os.path.splitext(fname)[-1] == CSV_EXT
        ]
    else:
        # Single path given instead of a directory: use it as-is.
        fpaths = [source]

    for fpath in fpaths:
        try:
            raw_mrns = patient_csv_to_set(patient_csv=fpath)
        except ValueError:
            print(
                f"Could not get MRNs from {fpath}, skipping de-identification"
            )
            path_of_csv_to_skip.add(fpath)
            continue
        mrns |= {int(mrn) for mrn in raw_mrns}

    return mrns
def test_patient_csv(self, patient_csv: DATA_SPLIT):
    """
    Check that ``patient_csv_to_set`` returns exactly the fixture's IDs.

    The ``patient_csv`` fixture unpacks to ``(csv_path, patient_ids)``. The
    test requires that the first line of the CSV is not the bare
    ``patient_id`` header, that every expected ID is in the parsed set, and
    that the parsed set has the same number of entries as ``patient_ids``.
    """
    csv_path, patient_ids = patient_csv
    sample_set = patient_csv_to_set(csv_path)

    # Read through a context manager so the handle is always closed,
    # including when one of the assertions below fails.
    with open(csv_path) as csv_file:
        first_line = csv_file.readline()

    assert first_line != "patient_id\n"
    assert all(patient_id in sample_set for patient_id in patient_ids)
    assert len(patient_ids) == len(sample_set)
def test_patient_csv_duplicates(self, patient_csv: DATA_SPLIT):
    """
    Check ``patient_csv_to_set`` on a CSV that contains repeated lines.

    The ``patient_csv`` fixture unpacks to ``(csv_path, patient_ids)``. Besides
    the same checks as the plain CSV test (first line is not the bare
    ``patient_id`` header, every expected ID is parsed, sizes match), the
    test asserts that the file really does contain at least one duplicated
    raw line.
    """
    csv_path, patient_ids = patient_csv
    sample_set = patient_csv_to_set(csv_path)

    # Read the file once, through a context manager, so no handle is leaked
    # and the header check and duplicate check share the same data.
    with open(csv_path) as csv_file:
        lines = csv_file.readlines()
    # readline() on an empty file yields ""; mirror that here.
    first_line = lines[0] if lines else ""

    assert first_line != "patient_id\n"
    assert all(patient_id in sample_set for patient_id in patient_ids)
    assert len(patient_ids) == len(sample_set)

    # Lines are compared verbatim (including the trailing newline); a repeat
    # makes the set of lines smaller than the list of lines.
    has_dupe = len(set(lines)) < len(lines)
    assert has_dupe
def check_structure(self, patient_csv: str = None, remove_flag: bool = False):
    """
    Checks if edw_dir is structured properly.

    Walks ``edw_dir/<mrn folder>/<csn folder>/<files>`` and reports, through
    ``logging.error`` / ``logging.warning``, MRN folders without any
    sub-folder, stray files inside MRN folders, and missing or unexpected
    files inside CSN folders. Every expected file that is present is passed
    to ``_check_file_columns`` together with its expected column names.

    :param patient_csv: <str> Path to CSV with MRNs to parse; no other MRNs
                        will be parsed.
    :param remove_flag: <bool> Flag to remove files with wrong or empty
                        format. Forwarded to ``_check_adt`` and
                        ``_check_file_columns``.
    """
    self._check_adt(remove_flag)

    # Map every expected file name to the set of column names it must have.
    # Each entry of "columns" is either one name or a list of names.
    expected_columns = {}
    for element in EDW_FILES:
        file_spec = EDW_FILES[element]
        columns: Set[str] = set()
        for col in file_spec["columns"]:
            # Union, not intersection: intersecting with the initially empty
            # set would leave it empty and disable the column check.
            columns |= set(col if isinstance(col, list) else [col])
        expected_columns[file_spec["name"]] = columns

    # The ADT file is not expected inside CSN folders
    # (presumably it is the one validated by _check_adt above).
    expected_files = set(expected_columns.keys())
    expected_files.remove(EDW_FILES["adt_file"]["name"])

    mrns_folders = [
        os.path.join(self.edw_dir, folder)
        for folder in os.listdir(self.edw_dir)
        if os.path.isdir(os.path.join(self.edw_dir, folder))
    ]

    # Optional MRN filter. Folder names are strings, so the MRNs are
    # normalised to strings before comparing.
    # NOTE(review): assumes MRN folders are named with the plain MRN (no
    # zero padding) -- confirm against patient_csv_to_set's output.
    mrns = None
    if patient_csv:
        mrns = {str(mrn) for mrn in patient_csv_to_set(patient_csv)}

    for mrn_folder in mrns_folders:
        # Compare the folder *name*; the full path can never match an MRN.
        if mrns is not None and os.path.basename(mrn_folder) not in mrns:
            continue

        # Split the MRN folder's entries into CSN folders and stray files.
        csns_folders = []
        unexpected_files = []
        for entry in os.listdir(mrn_folder):
            entry_path = os.path.join(mrn_folder, entry)
            if os.path.isdir(entry_path):
                csns_folders.append(entry_path)
            else:
                unexpected_files.append(entry_path)

        # Check that there is at least one folder inside each mrn folder.
        if not csns_folders:
            logging.error(
                f"Wrong folder format: {mrn_folder} doesn't contain any folder.",
            )

        # Check if there are any unexpected files in mrns folders.
        if unexpected_files:
            logging.warning(
                f"Unexpected files: {sorted(unexpected_files)}. Just "
                "folders should be stored inside mrns folders.",
            )

        for csn_folder in csns_folders:
            files = set(os.listdir(csn_folder))
            missing_files = expected_files.difference(files)
            unexpected = files.difference(expected_files)

            # Check that inside each csn folder are found all the
            # expected .csv.
            if missing_files:
                logging.error(
                    "Wrong folder format: the files "
                    f"{sorted(missing_files)} were not found in the "
                    f"input folder {csn_folder}.",
                )

            # Check that all the expected_files have the expected format.
            for file_name in expected_files.intersection(files):
                full_file_path = os.path.join(csn_folder, file_name)
                file_expected_columns = expected_columns[file_name]
                self._check_file_columns(
                    full_file_path,
                    file_expected_columns,
                    remove_flag,
                )

            # Check if there are any unexpected files in csns folders.
            if unexpected:
                unexpected_list = [
                    os.path.join(csn_folder, unexpected_file)
                    for unexpected_file in unexpected
                ]
                logging.warning(
                    f"Unexpected files: {sorted(unexpected_list)}. Just "
                    "the specific .csv files should be saved in csns folders.",
                )