def get_identifiers(
    dicom_files, force=True, config=None, strip_sequences=False, remove_private=False
):
    """Extract the fields of one or more dicom files into a lookup.

    The returned dictionary has one entry per input: the key is the
    ``dicom_file`` attribute of the DicomParser created for that input and
    the value is whatever ``parser.get_fields()`` returns.

    Parameters
    ==========
    dicom_files: the dicom file(s) to extract from. A value that is not a
                 list is wrapped into a one-item list.
    force: forwarded to DicomParser as ``force`` (default True)
    config: path to a json config. If None, "config.json" in the module
            folder (``here``) is used. Its "get" section is loaded, but the
            loaded content is not consulted by this function.
    strip_sequences: accepted, but not used by this function
    remove_private: accepted, but not used by this function
    """
    config_path = "%s/config.json" % here if config is None else config

    # Only a message is logged here; the read below is still attempted
    if not os.path.exists(config_path):
        bot.error("Cannot find config %s, exiting" % (config_path))
    config = read_json(config_path, ordered_dict=True)["get"]

    files = dicom_files if isinstance(dicom_files, list) else [dicom_files]
    bot.debug("Extracting identifiers for %s dicom" % len(files))

    # One parser per file; the parser decides the key used in the lookup
    lookup = dict()
    for dicom_file in files:
        parser = DicomParser(dicom_file, force=force)
        lookup[parser.dicom_file] = parser.get_fields()
    return lookup
def _clean_item(item, deid, default="KEEP"):
    '''Apply the header actions of a loaded deid to a single item.

    Every action listed under deid['header'] is run through perform_action.
    Fields that none of those actions reported as seen then receive the
    `default` action, unless the default is "KEEP".

    Parameters
    ==========
    item: the item dictionary to clean
    deid: the already loaded deid, with a 'header' section listing actions
    default: action applied to fields not touched by the header actions
             (default "KEEP", meaning nothing further is done)

    Returns
    =======
    the item as returned by the last perform_action call (or the input
    item if no action ran)
    '''
    # Fields handled by an explicit action, in first-seen order
    touched = []
    for header_action in deid['header']:
        item, fields = perform_action(item=item,
                                      action=header_action,
                                      return_seen=True)
        touched += [name for name in fields if name not in touched]

    untouched = [name for name in item.keys() if name not in touched]

    # Nothing left, or the default keeps everything as it is
    if default == "KEEP" or len(untouched) == 0:
        return item

    bot.debug("%s fields set for default action %s" %(len(untouched),default))
    for name in untouched:
        item = perform_action(item=item,
                              action={'action': default, "field": name})
    return item
def validate_dicoms(dcm_files, force=False):
    """Test opening one or more dicom files and return those that can be read.

    Each file is opened in binary mode and handed to read_file. A file that
    raises any ordinary exception while being opened or read is reported
    with a warning and left out of the result.

    Parameters
    ==========
    dcm_files: one or more dicom files to test. A value that is not a list
               is wrapped into a one-item list.
    force: forwarded to read_file as ``force`` (default False)

    Returns
    =======
    valids: list of the input files that were read without error, in input
            order
    """
    if not isinstance(dcm_files, list):
        dcm_files = [dcm_files]

    valids = []
    bot.debug("Checking %s dicom files for validation." % (len(dcm_files)))

    for dcm_file in dcm_files:
        try:
            with open(dcm_file, "rb") as filey:
                read_file(filey, force=force)
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate instead of being treated as a bad file
        except Exception:
            bot.warning("Cannot read input file {0!s}, skipping.".format(dcm_file))
        else:
            valids.append(dcm_file)

    bot.debug("Found %s valid dicom files" % (len(valids)))
    return valids
def get_files(contenders, check=True, pattern=None, force=False):
    '''Yield file paths found from a list of files and/or directories.

    Directories are expanded with recursive_find; anything else is taken
    as a single file. This is a generator: paths are produced lazily.

    :param contenders: a file or directory, or a list of them
    :param check: if True (default) each candidate is passed through
                  validate_dicoms and only what it returns is yielded
    :param pattern: forwarded to recursive_find as ``pattern``
    :param force: forwarded to validate_dicoms as ``force``
    '''
    if not isinstance(contenders, list):
        contenders = [contenders]

    for contender in contenders:
        candidates = (
            recursive_find(contender, pattern=pattern)
            if os.path.isdir(contender)
            else [contender]
        )

        for candidate in candidates:
            if candidate is None:
                continue

            # validate_dicoms returns a list (empty when the file is rejected)
            accepted = (
                validate_dicoms(candidate, force=force) if check else [candidate]
            )
            for path in accepted:
                bot.debug("Found contender file %s" % (path))
                yield path
def remove_private_identifiers(dicom_files, save=True, overwrite=False,
                               output_folder=None, force=True):
    """Read each dicom file, strip its private tags and optionally save it.

    Parameters
    ==========
    dicom_files: one or more dicom files; a non-list value is wrapped
    save: if True (default) the stripped dataset is passed to save_dicom
    overwrite: forwarded to save_dicom
    output_folder: forwarded to save_dicom
    force: forwarded to read_file as ``force`` (default True)

    Returns
    =======
    updated_files: one entry per input file. With save=True the entry is
                   the return value of save_dicom, otherwise it is the
                   in-memory dataset with private tags removed.
    """
    if not isinstance(dicom_files, list):
        dicom_files = [dicom_files]

    updated_files = []
    for dicom_file in dicom_files:
        dataset = read_file(dicom_file, force=force)
        dataset.remove_private_tags()
        bot.debug(
            "Removed private identifiers for %s" % os.path.basename(dicom_file)
        )

        result = dataset
        if save:
            result = save_dicom(
                dicom=dataset,
                dicom_file=dicom_file,
                output_folder=output_folder,
                overwrite=overwrite,
            )
        updated_files.append(result)

    return updated_files
def _perform_action(field, item, action, value=None):
    '''Perform one action on one field of an item dictionary.

    This works on a plain dictionary of field/value pairs (presumably the
    counterpart of the dicom header version of this function; confirm
    against that module).

    Parameters
    ==========
    field: the key in item to act on
    item: the dictionary to update (modified in place)
    action: one of valid_actions; anything else is replaced by "BLANK"
            after a warning
    value: for REPLACE, JITTER and ADD, the value expression handed to
           parse_value

    Returns
    =======
    item: the (updated) item is always returned, also when nothing was
          done. For an action other than ADD on a field that is not in
          the item, nothing happens and no message is emitted.
    '''
    done = False

    if action not in valid_actions:
        bot.warning('%s is not a valid choice [%s]. Defaulting to blanked.'
                    % (action, ", ".join(valid_actions)))
        action = "BLANK"

    if field in item and action != "ADD":

        # Blank the value
        if action == "BLANK":
            item[field] = ""
            done = True

        # Replace the value with the parsed one
        elif action == "REPLACE":
            value = parse_value(item, value)
            if value is not None:
                done = True
                item[field] = value
            else:
                bot.warning("REPLACE failed for %s" % field)

        # Shift the timestamp by the parsed number of days
        elif action == "JITTER":
            value = parse_value(item, value)
            if value is not None:
                done = True
                item = jitter_timestamp(field=field, value=value, item=item)
            else:
                bot.warning('JITTER failed for %s' % field)

        # Do nothing. Keep the original
        elif action == "KEEP":
            done = True
            bot.debug('KEEP %s' % field)

        # Remove the field entirely
        elif action == "REMOVE":
            del item[field]
            done = True

        if not done:
            bot.warning("%s not done for %s" % (action, field))

    # ADD sets the field whether or not it already exists
    elif action == "ADD":
        value = parse_value(item, value)
        if value is not None:
            item[field] = value
        else:
            bot.warning('ADD failed for %s' % field)

    return item
def parse_group_action(section, line, config, section_name):
    """Parse one group action line and append it to the config.

    A group action line is "<ACTION> <field> [value...]" and belongs to
    either a "fields" or a "values" section. For "fields" only the action
    and field are stored; for "values" any remaining tokens are stored as
    "value" after passing through _remove_comments.

    Parameters
    =========
    section: the section the line belongs to ("fields" or "values"); for
             any other section nothing is appended
    line: the line content to parse for the section/action
    config: the growing/current config dictionary
    section_name: the name under config[section] to append to

    Returns
    =======
    config: the same config object, possibly with one more entry
    """
    upper_line = line.upper()

    if not upper_line.startswith(group_actions):
        bot.exit("%s is not a valid group action." % line)

    if section == "fields" and not upper_line.startswith("FIELD"):
        bot.exit("%fields only supports FIELD actions.")

    bot.debug("%s: adding %s" % (section, line))

    # Tokens are separated by single spaces: action, field, then the rest
    tokens = line.split(" ")
    action = tokens.pop(0).replace(" ", "")

    if not tokens:
        bot.exit("%s action %s requires additional arguments" % (section, action))

    # The second token is always a field or field expander
    field = tokens.pop(0)
    entry = {"action": action, "field": field}

    if section == "fields":
        # No third argument is considered for fields
        config[section][section_name].append(entry)

    elif section == "values":
        # A third set of arguments, when present, becomes the value
        if tokens:
            entry["value"] = _remove_comments(tokens)
        config[section][section_name].append(entry)

    return config
def get_identifiers(dicom_files, force=True, config=None,
                    expand_sequences=True, skip_fields=None):
    """Extract the fields of one or more dicoms into a lookup by file name.

    Parameters
    ==========
    dicom_files: the dicom file(s) to extract from. Entries may be paths
                 (read with read_file) or already loaded Dataset objects,
                 in which case the dataset's ``filename`` is used as key.
    force: forwarded to read_file as ``force`` (default True)
    config: path to a json config; if None, "config.json" in the module
            folder (``here``) is used. Its "get" section supplies "skip".
    expand_sequences: forwarded to get_fields
    skip_fields: a field or list of fields added to the config's skip list

    Returns
    =======
    ids: dictionary mapping each file name to the result of get_fields
    """
    if config is None:
        config = "%s/config.json" % here

    # Only a message is logged here; the read below is still attempted
    if not os.path.exists(config):
        bot.error("Cannot find config %s, exiting" % (config))
    config = read_json(config, ordered_dict=True)["get"]

    if not isinstance(dicom_files, list):
        dicom_files = [dicom_files]
    bot.debug("Extracting identifiers for %s dicom" % len(dicom_files))

    # Fields to leave out: the configured ones plus any requested by the caller
    skip = config["skip"]
    if skip_fields is not None:
        extra = skip_fields if isinstance(skip_fields, list) else [skip_fields]
        skip = skip + extra

    ids = dict()  # identifiers
    for entry in dicom_files:
        if isinstance(entry, Dataset):
            dicom, name = entry, entry.filename
        else:
            dicom, name = read_file(entry, force=force), entry
        ids[name] = get_fields(dicom, skip=skip, expand_sequences=expand_sequences)

    return ids
def parse_format(line):
    """Parse the format named on a FORMAT line and check it is supported.

    The keyword FORMAT (in any letter case) and all whitespace are removed
    from the line; what remains, lowercased, is the format. If it is not
    in ``formats``, bot.exit is called with an error message.

    Parameters
    ==========
    line: the line that starts with format, e.g. "FORMAT dicom"

    Returns
    =======
    fmt: the lowercased format name
    """
    # Raw string avoids the invalid "\s" escape; IGNORECASE so a line that
    # was written as "format dicom" does not come out as "formatdicom"
    fmt = re.sub(r"FORMAT|\s+", "", line, flags=re.IGNORECASE).lower()
    if fmt not in formats:
        bot.exit("%s is not a valid format." % fmt)
    bot.debug("FORMAT set to %s" % fmt)
    return fmt
def add_tag(dicom, field, value, quiet=False):
    '''Add a tag to a dicom by delegating to change_tag.

    Whether the field is actually set is decided by change_tag
    (presumably only for names known to the active DicomDictionary;
    confirm against change_tag).

    :param dicom: the pydicom.dataset Dataset (pydicom.read_file)
    :param field: the name of the field to add
    :param value: the value to set
    :param quiet: only when exactly False (default) a debug message is logged
    :returns: whatever change_tag returns for (dicom, field, value)
    '''
    if quiet is False:
        bot.debug("Attempting ADDITION of %s." % (field))

    # Example of a resulting element, kept as a note:
    #   dicom.data_element("PatientIdentityRemoved")
    #   (0012, 0062) Patient Identity Removed CS: 'Yes'
    return change_tag(dicom, field, value)
def jitter_timestamp(field, value, item):
    '''Shift the timestamp stored under `field` in item by `value` days.

    The value is converted with to_int first (also when the field turns
    out to be absent). If the field is missing or None the item is
    returned untouched; otherwise the field is overwritten with the result
    of get_timestamp, formatted as "%Y%m%d".

    Parameters
    ==========
    field: key of the timestamp in item
    value: number of days to shift by; can be positive or negative
    item: the dictionary to update (modified in place)

    Returns
    =======
    item: the same dictionary
    '''
    days = to_int(value)
    current = item.get(field, None)

    if current is None:
        return item

    shifted = get_timestamp(item_date=current,
                            jitter_days=days,
                            format="%Y%m%d")
    bot.debug("JITTER %s + (%s): %s" % (current, days, shifted))
    item[field] = shifted
    return item
def parse_action(section, line, config, section_name=None):
    '''Parse one action line of a deid file and append it to the config.

    An action line is "<ACTION> <FIELD> [VALUE...]". ADD, REPLACE and
    JITTER require a value; BLANK, KEEP and REMOVE take none. The entry is
    appended to config[section]. Lines whose action passes the initial
    check but is none of these six are left out of the config.

    Parameters
    =========
    section: a valid section name from the deid config file
    line: the line content to parse for the section/action
    config: the growing/current config dictionary
    section_name: optionally, a section name (not used by this function)

    Returns
    =======
    config: the same config object, possibly with one more entry

    The process exits (status 1) after logging an error when the line does
    not start with a known action, or a required field or value is missing.
    '''
    if not line.upper().startswith(actions):
        bot.error("%s is not a valid action line." % line)
        sys.exit(1)

    # Tokens are separated by single spaces
    parts = line.split(' ')

    # The check above is case-insensitive, so normalize the action here;
    # otherwise e.g. "blank PatientID" would be accepted and then dropped
    action = parts.pop(0).replace(' ', '').upper()

    # What field is the action for?
    if not parts:
        bot.error("%s requires a FIELD value, but not found." % (action))
        sys.exit(1)

    field = parts.pop(0)

    # Actions that require a value
    if action in ("ADD", "REPLACE", "JITTER"):
        if not parts:
            bot.error("%s requires a VALUE, but not found" % (action))
            sys.exit(1)

        # The value is the remainder of the line, without a trailing
        # "# comment" and without the padding that precedes it
        value = ' '.join(parts).split('#')[0].strip()
        bot.debug("Adding %s" % line)
        config[section].append({"action": action,
                                "field": field,
                                "value": value})

    # Actions that don't require a value
    elif action in ("BLANK", "KEEP", "REMOVE"):
        bot.debug("%s: adding %s" % (section, line))
        config[section].append({"action": action, "field": field})

    return config
def get_identifiers(dicom_files,
                    force=True,
                    config=None,
                    expand_sequences=True,
                    skip_fields=None):
    '''Extract the fields of one or more dicom files into a lookup by file name.

    Parameters
    ==========
    dicom_files: the dicom file(s) to extract from; a non-list value is
                 wrapped into a one-item list
    force: forwarded to read_file as ``force`` (default True)
    config: path to a json config; if None, "config.json" in the module
            folder (``here``) is used. Its 'get' section supplies 'skip'.
    expand_sequences: forwarded to get_fields
    skip_fields: a field or list of fields added to the config's skip list

    Returns
    =======
    ids: dictionary mapping each input file to the result of get_fields
    '''
    if config is None:
        config = "%s/config.json" %(here)

    # Only a message is logged here; the read below is still attempted
    if not os.path.exists(config):
        bot.error("Cannot find config %s, exiting" %(config))

    config = read_json(config, ordered_dict=True)['get']

    if not isinstance(dicom_files, list):
        dicom_files = [dicom_files]

    # Count after normalizing, so a single path is reported as 1 file and
    # not as the length of its string
    bot.debug('Extracting identifiers for %s dicom' %(len(dicom_files)))

    ids = dict()  # identifiers

    # Fields to leave out: the configured ones plus any requested by the caller
    skip = config['skip']
    if skip_fields is not None:
        if not isinstance(skip_fields, list):
            skip_fields = [skip_fields]
        skip = skip + skip_fields

    for dicom_file in dicom_files:
        # Honor the caller's force setting (it was previously always True)
        dicom = read_file(dicom_file, force=force)
        ids[dicom_file] = get_fields(dicom,
                                     skip=skip,
                                     expand_sequences=expand_sequences)
    return ids
def add_section(config, section, section_name=None):
    '''Make sure a section (and optionally a named level in it) exists.

    An unnamed section is created as an empty list. A named section is
    created as an OrderedDict holding an empty list under the name. If the
    section already exists, only a missing name is added to it.

    Parameters
    ==========
    config: the config (dict) parsed thus far
    section: the section name to add
    section_name: an optional name, added as a level

    Returns
    =======
    config: the same config object

    The process exits (status 1) after logging an error when section is
    None, when a 'filter' section has no name, or when the section is not
    in ``sections``.
    '''
    if section is None:
        bot.error(
            'You must define a section (e.g. %header) before any action.')
        sys.exit(1)

    if section_name is None and section == 'filter':
        bot.error("You must provide a name for a filter section.")
        sys.exit(1)

    if section not in sections:
        bot.error("%s is not a valid section." % section)
        sys.exit(1)

    named = section_name is not None

    # Existing section: at most a new name is added
    if section in config:
        if named and section_name not in config[section]:
            config[section][section_name] = []
        return config

    # New section: a named one gets one more level (dict)
    if named:
        config[section] = OrderedDict()
        config[section][section_name] = []
        bot.debug("Adding section %s %s" % (section, section_name))
    else:
        config[section] = []
        bot.debug("Adding section %s" % section)
    return config
def get_files(contenders, check=True, pattern=None, force=False, tempdir=None):
    """Yield file paths found from a list of files and/or directories.

    Directories are expanded with recursive_find; anything else is taken
    as a single file. A candidate ending in ".zip" is extracted and the
    first regular file among the archive's members is used in its place
    (only a single file per zip is supported). This is a generator.

    Parameters
    ==========
    contenders: a list of files or directories (contenders!)
    check: boolean to indicate if we should validate dicoms (default True)
    pattern: forwarded to recursive_find as ``pattern``
    force: forwarded to validate_dicoms as ``force``
    tempdir: folder to extract zip archives into. If None, a new temporary
             directory is created for each archive; it is not removed by
             this function because the yielded path lives in it.
    """
    # Local import: only needed here, for archives without a given tempdir
    import tempfile

    if not isinstance(contenders, list):
        contenders = [contenders]

    for contender in contenders:
        if os.path.isdir(contender):
            dicom_files = recursive_find(contender, pattern=pattern)
        else:
            dicom_files = [contender]

        for dicom_file in dicom_files:
            # Guard before touching the path: splitext(None) would raise
            if dicom_file is None:
                continue

            if os.path.splitext(dicom_file)[1] == ".zip":
                archive_path = dicom_file
                extract_dir = tempdir if tempdir is not None else tempfile.mkdtemp()

                with zipfile.ZipFile(archive_path, "r") as archive:
                    archive.extractall(extract_dir)
                    # Look only at this archive's members, so files left in
                    # extract_dir by an earlier archive cannot be picked up
                    extracted = [
                        os.path.join(extract_dir, member)
                        for member in archive.namelist()
                    ]

                # A default is required: a bare next() raising StopIteration
                # inside a generator surfaces as RuntimeError
                dicom_file = next(
                    (path for path in extracted if os.path.isfile(path)), None
                )
                if dicom_file is None:
                    bot.warning("No file found in archive %s, skipping." % archive_path)
                    continue

            if check:
                validated_files = validate_dicoms(dicom_file, force=force)
            else:
                validated_files = [dicom_file]

            for validated_file in validated_files:
                bot.debug("Found contender file %s" % (validated_file))
                yield validated_file
def _get_clean_name(self, output_folder, extension="dcm"): """return a full path to an output file, with custom folder and extension. If the output folder isn't yet created, make it. Parameters ========== output_folder: the output folder to create, will be created if doesn't exist. extension: the extension of the file to create a name for, should not start with "." """ if output_folder is None: output_folder = self.output_folder if not os.path.exists(output_folder): bot.debug("Creating output folder %s" % output_folder) os.makedirs(output_folder) basename = re.sub("[.]dicom|[.]dcm", "", os.path.basename(self.dicom_file)) return "%s/cleaned-%s.%s" % (output_folder, basename, extension)
def get_deid(tag=None, exit_on_fail=True, quiet=False, load=False):
    '''Resolve a deid file from a tag or path, optionally loading it.

    A tag names a file "deid.<tag>" under ``data_base``; for example the
    tag "dicom" refers to "deid.dicom". An existing path is used as is
    (made absolute). An already loaded deid (a dict) is returned unchanged.

    Parameters
    ==========
    tag: a tag, a path to a deid file, or a loaded deid (dict). None means
         the default tag 'dicom'.
    exit_on_fail: if exactly True (default) the process exits with status 1
                  when the file is not found; otherwise None is returned
    quiet: only when exactly False (default) an error is logged for a
           missing file
    load: if exactly True, return load_deid(path) instead of the path

    Returns
    =======
    the path to the deid file, the loaded deid, or None (see above)
    '''
    # no tag/path means load default
    if tag is None:
        tag = 'dicom'

    # If it's already loaded
    if isinstance(tag, dict):
        bot.debug('deid is already loaded.')
        return tag

    # An existing path wins over interpreting the value as a tag
    deid = (
        os.path.abspath(tag)
        if os.path.exists(tag)
        else "%s/deid.%s" % (data_base, tag)
    )

    if not os.path.exists(deid):
        if quiet is False:
            bot.error("Cannot find %s" % (deid))
        if exit_on_fail is True:
            sys.exit(1)
        return None

    return load_deid(deid) if load is True else deid
def get_private(dicom):
    '''Collect the private data elements of a dicom, including nested ones.

    The dataset is walked breadth-first: every element with VR "SQ" has
    the datasets of its value queued for the same inspection.

    Parameters
    ==========
    dicom: the dataset to inspect

    Returns
    =======
    private_tags: list of data elements whose tag reports is_private
    '''
    pending = [dicom]
    private_tags = []

    while pending:
        ds = pending.pop(0)

        for tag in sorted(ds.keys()):
            with tag_in_exception(tag):
                if tag not in ds:
                    continue

                try:
                    element = ds[tag]

                    if element.tag.is_private:
                        bot.debug(element.name)
                        private_tags.append(element)

                    # Queue the items of a sequence for later inspection
                    if tag in ds and element.VR == "SQ":
                        for nested in element.value:
                            pending.append(nested)

                except IndexError:
                    bot.debug("tag %s key present without value" % tag)
                except NotImplementedError:
                    bot.debug('tag %s is invalid, skipping' % tag)

    return private_tags
def validate_dicoms(dcm_files, force=False):
    '''Test opening one or more dicom files and return those that can be read.

    Each file is opened in binary mode and handed to read_file. A file that
    raises any ordinary exception while being opened or read is reported
    with a warning and left out of the result.

    :param dcm_files: one or more dicom files to test; a non-list value is
                      wrapped into a one-item list
    :param force: forwarded to read_file as ``force`` (default False)
    :returns: list of the input files that were read without error
    '''
    if not isinstance(dcm_files, list):
        dcm_files = [dcm_files]

    valids = []
    bot.debug("Checking %s dicom files for validation." % (len(dcm_files)))

    for dcm_file in dcm_files:
        try:
            # Only readability matters; the parsed dataset is discarded
            with open(dcm_file, 'rb') as filey:
                read_file(filey, force=force)
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate instead of being treated as a bad file
        except Exception:
            bot.warning(
                'Cannot read input file {0!s}, skipping.'.format(dcm_file))
        else:
            valids.append(dcm_file)

    bot.info("Found %s valid dicom files" % (len(valids)))
    return valids
def extract_values_list(dicom, actions, fields=None):
    """Extract a list of values from a dicom based on a list of group actions.

    Two actions are understood:

      FIELD: the whole value of every matching field is collected, except
             "" and None
      SPLIT: the string form of every matching field value is split and
             the pieces of at least ``minlength`` characters are collected.
             The action's optional "value" holds ";"-separated parameters
             "by=<separator>" (default a single space, optionally quoted)
             and "minlength=<int>" (default 1).

    Any other action produces a warning and contributes nothing.

    Parameters
    ==========
    dicom: the dicom to extract values from
    actions: list of dictionaries with "action", "field" and optionally
             "value"
    fields: already extracted fields, to save re-parsing; if empty or
            None they are obtained with get_fields(dicom)

    Returns
    =======
    a list with the unique values collected (order not defined)
    """
    values = set()

    # The function can be provided fields to save re-parsing
    if not fields:
        fields = get_fields(dicom)

    for action in actions:
        # Extract some subset of fields based on action
        subset = expand_field_expression(
            field=action["field"], dicom=dicom, contenders=fields
        )

        # Just grab the entire value string for a field, no parsing
        if action["action"] == "FIELD":
            for field in subset.values():
                if field.element.value not in ["", None]:
                    values.add(field.element.value)

        # Split action, can optionally have a "by" and/or minlength parameter
        elif action["action"] == "SPLIT":
            bot.debug("Parsing action %s" % action)

            # Defaults: split on a single space, keep pieces of any length >= 1
            split_by = " "
            minlength = 1

            if "value" in action:
                for param in action["value"].split(";"):
                    # A trailing or doubled ";" leaves an empty parameter
                    if not param.strip():
                        continue

                    # Split on the first "=" only, so the separator itself
                    # may contain "=" (e.g. by="=")
                    param_name, equals, param_val = param.partition("=")
                    if not equals:
                        bot.warning("Skipping malformed SPLIT parameter %s" % param)
                        continue

                    param_name = param_name.strip()
                    param_val = param_val.strip()

                    # Set a custom parameter length
                    if param_name == "minlength":
                        minlength = int(param_val)
                        bot.debug("Minimum length set to %s" % minlength)

                    elif param_name == "by":
                        separator = param_val.strip("'").strip('"')
                        # str.split rejects an empty separator, keep the default
                        if separator:
                            split_by = separator
                            bot.debug("Splitting value set to %s" % split_by)
                        else:
                            bot.warning("Empty SPLIT separator, keeping %r" % split_by)

            for field in subset.values():
                for new_value in str(field.element.value).split(split_by):
                    if len(new_value) >= minlength:
                        values.add(new_value)

        else:
            bot.warning(
                "Unrecognized action %s for values list extraction." % action["action"]
            )

    return list(values)
def get_files(contenders, check=True, pattern=None, force=False):
    '''Return a single list of file paths from files and/or directories.

    Directories are expanded with recursive_find; anything else is added
    as a single file.

    :param contenders: a file or directory, or a list of them
    :param check: if True (default) the collected list is passed through
                  validate_dicoms and its result is returned
    :param pattern: forwarded to recursive_find as ``pattern``
    :param force: forwarded to validate_dicoms as ``force``
    :returns: list of file paths
    '''
    if not isinstance(contenders, list):
        contenders = [contenders]

    dcm_files = []
    for contender in contenders:
        if not os.path.isdir(contender):
            bot.debug("Adding single contender file %s" % (contender))
            dcm_files.append(contender)
            continue

        found = recursive_find(contender, pattern=pattern)
        bot.debug("Found %s contender files in %s"
                  % (len(found), os.path.basename(contender)))
        dcm_files.extend(found)

    return validate_dicoms(dcm_files, force=force) if check else dcm_files
def parse_config_action(section, line, config, section_name=None):
    """Parse one action line of a deid file and append it to the config.

    An action line is "<ACTION> <FIELD> [VALUE...]". ADD, REPLACE and
    JITTER require a value, REMOVE may have one, BLANK and KEEP take none.
    The entry is appended to config[section]. Lines whose action passes
    the initial check but is none of these six are left out of the config.

    Parameters
    =========
    section: a valid section name from the deid config file
    line: the line content to parse for the section/action
    config: the growing/current config dictionary
    section_name: optionally, a section name (not used by this function)

    Returns
    =======
    config: the same config object, possibly with one more entry

    bot.exit is called with a message when the line does not start with a
    known action, or a required field or value is missing.
    """
    if not line.upper().startswith(actions):
        bot.exit("%s is not a valid action line." % line)

    # Tokens are separated by single spaces
    parts = line.split(" ")

    # The check above is case-insensitive, so normalize the action here;
    # otherwise e.g. "blank PatientID" would be accepted and then dropped
    action = parts.pop(0).replace(" ", "").upper()

    # What field is the action for?
    if not parts:
        bot.exit("%s requires a FIELD value, but not found." % action)

    field = parts.pop(0)
    entry = {"action": action, "field": field}

    # Actions that require a value
    if action in ("ADD", "REPLACE", "JITTER"):
        if not parts:
            bot.exit("%s requires a VALUE, but not found" % action)
        entry["value"] = _remove_comments(parts)

    # REMOVE can optionally have a value (e.g. a func: returning a boolean)
    elif action == "REMOVE":
        if parts:
            entry["value"] = _remove_comments(parts)

    # Anything but BLANK and KEEP (which take no value) is not stored
    elif action not in ("BLANK", "KEEP"):
        return config

    bot.debug("%s: adding %s" % (section, line))
    config[section].append(entry)
    return config
def load_deid(path=None):
    '''Load and parse a (user) deid configuration file.

    The path is resolved with find_deid, so what is accepted for it (file,
    folder, or nothing) is decided there. To have several deid files in
    one directory, the file extension can name the module, e.g.
    deid.dicom, deid.nifti.

    The file is read line by line:

      - blank lines and lines starting with "#" are ignored
      - a line starting with "format" (any case) sets config['format']
      - a line starting with "%" opens a section, optionally followed by
        a section name
      - a line starting with one of ``actions`` is parsed into the current
        section; inside a "filter" section a LABEL line also takes the
        lines that follow it, up to the next LABEL, section or end of
        file, as its members
      - anything else is skipped with a debug message

    Parameters
    ==========
    path: a path to a deid file

    Returns
    =======
    config: a parsed deid (dictionary) with valid sections

    The process exits (status 1) after logging an error for an unknown
    format, an unknown section, or an action before any section.
    '''
    path = find_deid(path)

    # Read in spec, clean up extra spaces and newlines, drop empty lines
    spec = [x.strip('\n').strip(' ') for x in read_file(path)]
    spec = [x for x in spec if x]

    config = OrderedDict()
    section = None
    section_name = None

    while spec:

        # Clean up white trailing/leading space
        line = spec.pop(0).strip()

        # Comment
        if line.startswith("#"):
            continue

        # Starts with Format?
        elif re.match('format', line, re.I):
            # Same case-insensitivity as the match above, so "format dicom"
            # does not come out as "formatdicom"
            fmt = re.sub(r'FORMAT|\s+', '', line, flags=re.I).lower()
            if fmt not in formats:
                bot.error("%s is not a valid format." % fmt)
                sys.exit(1)

            # Set format
            config['format'] = fmt
            bot.debug("FORMAT set to %s" % fmt)

        # A new section?
        elif line.startswith('%'):

            # Remove any comments
            line = line.split('#', 1)[0].strip()

            # Is there a section name?
            section_name = None
            parts = line.split(' ')
            if len(parts) > 1:
                section_name = ' '.join(parts[1:])

            section = re.sub(r'[%]|\s+', '', parts[0]).lower()
            if section not in sections:
                bot.error("%s is not a valid section." % section)
                sys.exit(1)

            config = add_section(config=config,
                                 section=section,
                                 section_name=section_name)

        # An action (replace, blank, remove, keep, jitter)
        elif line.upper().startswith(actions):

            # There is nowhere to put an action before the first section
            if section is None:
                bot.error(
                    'You must define a section (e.g. %header) before any action.')
                sys.exit(1)

            # Start of a filter group
            if line.upper().startswith('LABEL') and section == "filter":

                # Members are the lines up to the next LABEL, the next
                # section, or the end of the file (a LABEL may be last)
                members = []
                while spec:
                    upcoming = spec[0].upper().strip()
                    if upcoming.startswith(('LABEL', '%')):
                        break
                    members.append(spec.pop(0))

                # Add the filter label to the config
                config = parse_label(config=config,
                                     section=section,
                                     label=line,
                                     section_name=section_name,
                                     members=members)

            # Parse the action
            else:
                config = parse_action(section=section,
                                      section_name=section_name,
                                      line=line,
                                      config=config)
        else:
            bot.debug("%s not recognized to be in valid format, skipping." % line)

    return config
def get_shared_identifiers(dicom_files, force=True, config=None,
                           aggregate=None, expand_sequences=True):
    """Extract the identifiers shared across a set of dicom files.

    Intended for cases when a set of images (dicom) are being compressed
    into one file and the file (still) should have some searchable
    metadata. A field whose value differs between files is removed from
    the result, unless it is listed in ``aggregate``, in which case its
    unique values are collected in a list.

    NOTE: a field that first appears in a later file (absent from the
    earlier ones) is still included in the result.

    Parameters
    ==========
    dicom_files: the dicom file(s) to extract from; a non-list value is
                 wrapped into a one-item list
    force: forwarded to read_file as ``force`` (default True)
    config: path to a json config; if None, "config.json" in the module
            folder (``here``) is used. Its "get" section supplies "skip".
    aggregate: list of fields whose differing values are kept as a list
    expand_sequences: forwarded to get_fields

    Returns
    =======
    ids: dictionary of field -> shared value. Aggregated fields hold a
         list of unique values, unwrapped when there is only one.
    """
    if aggregate is None:
        aggregate = []

    if config is None:
        config = "%s/config.json" % (here)

    # Only a message is logged here; the read below is still attempted
    if not os.path.exists(config):
        bot.error("Cannot find config %s, exiting" % (config))

    config = read_json(config, ordered_dict=True)["get"]

    if not isinstance(dicom_files, list):
        dicom_files = [dicom_files]

    # Count after normalizing, so a single path is reported as 1 file and
    # not as the length of its string
    bot.debug("Extracting shared identifiers for %s dicom" % (len(dicom_files)))

    ids = dict()  # identifiers

    # Fields already found to differ; never re-added from later files,
    # regardless of how get_fields applies the skip list
    differing = set()

    # The configured skip list; differing fields are appended as we go
    skip = config["skip"]

    for dicom_file in dicom_files:
        # Honor the caller's force setting (it was previously always True)
        dicom = read_file(dicom_file, force=force)

        # Get list of fields, expanded sequences are flattened
        fields = get_fields(dicom, skip=skip, expand_sequences=expand_sequences)

        for key, val in fields.items():
            if key in differing:
                continue

            # First time the field is seen
            if key not in ids:
                ids[key] = [val] if key in aggregate else val

            # Items to aggregate are appended, not removed
            elif key in aggregate:
                if val not in ids[key]:
                    ids[key].append(val)

            # Keep only if equal between files
            elif ids[key] != val:
                del ids[key]
                differing.add(key)
                skip.append(key)

    # For any aggregates that are one item, unwrap again
    for field in aggregate:
        if field in ids and len(ids[field]) == 1:
            ids[field] = ids[field][0]

    return ids