def scrub_pixels(bid):
    '''scrub pixels (not currently triggered) will be run to scrub pixel
       data before identifiers are extracted from the header.
    '''
    from .get import get_identifiers
    batch = Batch.objects.get(id=bid)
    images = batch.image_set.all()
    batch.change_images_status('PROCESSING')

    # from deid.dicom import scrub_pixels

    for dcm in images:

        dcm_file = dcm.image.path
        dicom_uid = os.path.basename(dcm_file)
        dicom = dcm.load_dicom()

        if dicom.get("BurnedInAnnotation") is not None:

            # We shouldn't be in this function if False, but check again anyway
            if ANONYMIZE_PIXELS is True:
                print("Anonymization will be done here.")
            else:
                message = "%s has pixel identifiers, anonymize pixels is off, but added to batch. Removing!" % dcm_file
                dcm.delete()  # if django-cleanup not in apps, will not delete image file
                batch = add_batch_error(message, batch)

    # At the end, move on to processing headers
    return get_identifiers(bid=batch.id)
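
# The loop above only stubs the actual scrubbing ("Anonymization will be done
# here."). A minimal sketch of how that step could work with deid's
# DicomCleaner (the commented-out deid import above hints at this library).
# This is an assumption about the wiring, not the project's implementation,
# and _scrub_one is a hypothetical helper:

def _scrub_one(dcm_file, output_folder):
    '''sketch: detect and black out burned-in annotations in one file.'''
    from deid.dicom import DicomCleaner
    client = DicomCleaner(output_folder=output_folder)
    detected = client.detect(dcm_file)   # applies the deid pixel filters
    if detected['flagged']:
        client.clean()                   # blank the flagged pixel regions
        return client.save_dicom()       # write and return the cleaned file
    return dcm_file                      # nothing flagged, keep the original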
def replace_identifiers(bid, run_upload_storage=False):
    '''replace identifiers is called from get_identifiers, given that the
       user has asked to anonymize_restful. This function does the
       replacement, and then triggers the function to send to storage.
    '''
    batch = Batch.objects.get(id=bid)
    batch.qa['ProcessStartTime'] = time.time()
    batch_ids = BatchIdentifiers.objects.get(batch=batch)

    # 1) use response from API to generate new fields
    working = deepcopy(batch_ids.ids)
    prepared = prepare_identifiers(response=batch_ids.response, ids=working)
    updated = deepcopy(prepared)

    # 2) use response from API to anonymize all fields in batch.ids
    #    clean_identifiers(ids, deid=None, image_type=None, default=None)
    #    deid as None will use default "deid.dicom" provided in application
    #    specifying a custom file/tag will use this filter first (in addition)
    deid = STUDY_DEID
    cleaned = clean_identifiers(ids=updated, default="KEEP", deid=deid)

    # Save progress
    batch_ids.cleaned = cleaned
    batch_ids.updated = updated
    batch_ids.save()

    # Get updated files
    dicom_files = batch.get_image_paths()
    output_folder = batch.get_path()
    updated_files = replace_ids(dicom_files=dicom_files,
                                deid=deid,
                                ids=updated,           # ids[item] lookup
                                overwrite=True,        # overwrites copied files
                                output_folder=output_folder,
                                strip_sequences=True,
                                remove_private=True)

    # Get shared information
    aggregate = ["BodyPartExamined", "Modality", "StudyDescription"]
    shared_ids = get_shared_identifiers(dicom_files=updated_files,
                                        aggregate=aggregate)
    batch_ids.shared = shared_ids
    batch_ids.save()

    # Rename each image based on its anonymized suid
    # S6M0<MRN-SUID>_<JITTERED-REPORT-DATE>_<ACCESSIONNUMBER-SUID>
    for dcm in batch.image_set.all():
        item_id = os.path.basename(dcm.image.path)
        try:
            dicom = dcm.load_dicom()
            if item_id in updated:
                item_suid = updated[item_id]['item_id']
                dcm = dcm.rename(item_suid)  # added to [prefix][dcm.name]
                dcm.save()
            else:
                # If we don't have the id, don't risk uploading
                message = "%s for Image Id %s not found in lookup: skipping." % (item_id, dcm.id)
                batch = add_batch_error(message, batch)
                dcm.delete()
        except Exception:
            message = "%s for Image Id %s file read error: skipping." % (item_id, dcm.id)
            batch = add_batch_error(message, batch)
            dcm.delete()

    batch.qa['ProcessFinishTime'] = time.time()

    # We don't get here if the call above failed
    change_status(batch, "DONEPROCESSING")
    batch.save()

    if run_upload_storage is True:
        return upload_storage(batch_ids=[bid])

    updated_files = batch.get_image_paths()
    return updated_files
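
# The clean/replace step above mirrors deid's header pipeline. A minimal
# standalone sketch of that pipeline, assuming the deid version used by this
# codebase (the kwargs match the replace_ids call above); the function name
# and inputs are hypothetical:

def _deid_header_sketch(dicom_files, output_folder):
    '''sketch: extract ids, then write de-identified copies with deid.'''
    from deid.dicom import get_identifiers as deid_get_ids
    from deid.dicom import replace_identifiers as deid_replace_ids

    ids = deid_get_ids(dicom_files)             # {filename: {field: value}}
    # a real pipeline updates ids here, e.g. swapping in DASHER suids
    return deid_replace_ids(dicom_files=dicom_files,
                            ids=ids,
                            output_folder=output_folder,
                            overwrite=False,        # keep originals intact
                            strip_sequences=True,   # drop sequence fields
                            remove_private=True)    # drop private tags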
def import_dicomdir(dicom_dir, run_get_identifiers=True):
    '''import dicom directory manages importing a valid dicom set into
       the application, and is a celery job triggered by the watcher.
       Here we also flag (and exclude) images that have a header value
       that indicates pixel identifiers.
    '''
    start_time = time.time()

    if not os.path.exists(dicom_dir):
        bot.warning('Cannot find %s' % dicom_dir)
        return

    try:
        dicom_files = ls_fullpath(dicom_dir)
    except NotADirectoryError:
        bot.error('%s is not a directory, skipping.' % dicom_dir)
        return

    bot.debug("Importing %s, found %s .dcm files" % (dicom_dir, len(dicom_files)))

    # The batch --> the folder with a set of dicoms tied to one request
    dcm_folder = os.path.basename(dicom_dir)
    batch, created = Batch.objects.get_or_create(uid=dcm_folder)
    batch.logs['STARTING_IMAGE_COUNT'] = len(dicom_files)

    # Data quality check: keep a record of study dates
    study_dates = dict()
    size_bytes = sum(os.path.getsize(f) for f in dicom_files)
    messages = []  # print all unique messages / warnings at end

    # Add in each dicom file to the series
    for dcm_file in dicom_files:
        try:
            # The dicom folder will be named based on the accession#
            dcm = read_file(dcm_file, force=True)
            dicom_uid = os.path.basename(dcm_file)

            # Keep track of studyDate
            study_date = dcm.get('StudyDate')
            if study_date not in study_dates:
                study_dates[study_date] = 0
            study_dates[study_date] += 1

            flag, flag_group, reason = has_burned_pixels(dicom_file=dcm_file,
                                                         quiet=True,
                                                         deid=STUDY_DEID)

            # If the image is flagged, we don't include it, and move on
            continue_processing = True
            if flag is True:
                if flag_group not in ["whitelist"]:
                    continue_processing = False
                    message = "%s is flagged in %s: %s, skipping" % (dicom_uid, flag_group, reason)
                    batch = add_batch_warning(message, batch, quiet=True)
                    message = "BurnedInAnnotation found for batch %s" % batch.uid
                    if message not in messages:
                        messages.append(message)

            if continue_processing is True:
                # Create the Image object in the database
                # A dicom instance number must be unique for its batch
                dicom = Image.objects.create(batch=batch, uid=dicom_uid)

                # Save the dicom file to storage
                # basename = "%s/%s" % (batch.id, os.path.basename(dcm_file))
                dicom = save_image_dicom(dicom=dicom, dicom_file=dcm_file)  # Also saves

                # Generate image name based on [SUID] added later
                # accessionnumberSUID.seriesnumber.imagenumber
                name = "%s_%s.dcm" % (dcm.get('SeriesNumber'), dcm.get('InstanceNumber'))
                dicom.name = name
                dicom.save()

            # Only remove files successfully imported
            # os.remove(dcm_file)

        # Note that on error we don't remove files
        except InvalidDicomError:
            message = "InvalidDicomError: %s skipping." % dcm_file
            batch = add_batch_error(message, batch)
        except KeyError:
            message = "KeyError: %s is possibly invalid, skipping." % dcm_file
            batch = add_batch_error(message, batch)
        except Exception as e:
            message = "Exception: %s, for %s, skipping." % (e, dcm_file)
            batch = add_batch_error(message, batch)

    # Print summary messages all at once
    for message in messages:
        bot.warning(message)

    if len(study_dates) > 1:
        message = "%s study dates found for %s" % (len(study_dates), dicom_dir)
        batch = add_batch_error(message, batch)

    # Save batch thus far
    batch.qa['StudyDate'] = study_dates
    batch.qa['StartTime'] = start_time
    batch.qa['SizeBytes'] = size_bytes
    batch.save()

    # If there were no errors on import, we should remove the directory
    # if not batch.has_error:
    #     # Should only be called given no error, and should trigger error if not empty
    #     os.rmdir(dicom_dir)

    # At the end, submit the dicoms to be anonymized as a batch
    count = batch.image_set.count()
    if count == 0:
        # No images for further processing
        batch.status = "EMPTY"
        batch.qa['FinishTime'] = time.time()
        message = "%s is flagged EMPTY, no images pass filter" % batch.id
        batch = add_batch_warning(message, batch)
        batch.save()
        return

    if ANONYMIZE_PIXELS is True:
        bot.warning("Anonymization of pixels is not yet implemented. Images were skipped.")
        # When this is implemented, the function will be modified to add these
        # images to the batch, which will then first be sent through a function
        # to scrub pixels before header data is looked at.
        # scrub_pixels(bid=batch.id)

    if run_get_identifiers is True:
        bot.debug("get_identifiers submit batch %s with %s dicoms." % (batch.uid, count))
        return get_identifiers(bid=batch.id)

    bot.debug("Finished batch %s with %s dicoms" % (batch.uid, count))
    return batch
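
# `ls_fullpath` used above is a local utility not shown in this module. A
# minimal equivalent sketch, assuming it simply lists a directory with full
# paths (the ext filter is an illustrative extra, not confirmed by source):

def _ls_fullpath_sketch(dirname, ext=None):
    '''sketch: return full paths of files in dirname.'''
    files = os.listdir(dirname)  # raises NotADirectoryError, handled above
    if ext is not None:
        files = [f for f in files if f.endswith(ext)]
    return [os.path.join(dirname, f) for f in files]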
def get_identifiers(bid, study=None, run_replace_identifiers=True):
    '''get identifiers is the celery task to get identifiers for all images
       in a batch. A batch is a set of dicom files that may include more
       than one series/study. This is done by way of sending one restful
       call to the DASHER endpoint. If ANONYMIZE_RESTFUL is False under
       settings, this function doesn't run.
    '''
    batch = Batch.objects.get(id=bid)
    if study is None:
        study = SOM_STUDY

    if ANONYMIZE_RESTFUL is not True:
        bot.debug("Restful de-identification skipped [ANONYMIZE_RESTFUL is False]")
        change_status(batch, "DONEPROCESSING")
        change_status(batch.image_set.all(), "DONEPROCESSING")
        return

    images = batch.image_set.all()

    # Process all dicoms at once, one call to the API
    dicom_files = batch.get_image_paths()
    batch.change_images_status('PROCESSING')
    batch.save()  # redundant

    try:
        # We are uploading a zip, so it doesn't make sense to preserve
        # image level metadata
        ids = get_ids(dicom_files=dicom_files, expand_sequences=False)
    except FileNotFoundError:
        batch.status = "ERROR"
        message = "batch %s is missing dicom files and should be reprocessed" % batch.id
        batch = add_batch_warning(message, batch)
        batch.save()
        return  # without ids we cannot continue

    # Prepare identifiers with only minimal required
    # This function expects many items for one entity, returns
    # request['identifiers'] --> [ entity-with-study-item ]
    request = prepare_identifiers_request(ids)  # force: True

    bot.debug("som.client making request to anonymize batch %s" % bid)

    # Run with retrying, in case of an issue with token refresh
    result = None
    try:
        result = run_client(study, request)
    except Exception:
        # On any error, don't continue and don't launch a new job
        message = "error with client, stopping job."
        batch = add_batch_error(message, batch)
        batch.status = "ERROR"
        batch.qa['FinishTime'] = time.time()
        batch.save()

    # Create a batch for all results
    if result is not None:
        if "results" in result:
            batch_ids, created = BatchIdentifiers.objects.get_or_create(batch=batch)
            batch_ids.response = result['results']
            batch_ids.ids = ids
            batch_ids.save()
            batch.qa['DasherFinishTime'] = time.time()
            if run_replace_identifiers is True:
                return replace_identifiers(bid=bid)
            return batch_ids
        message = "'results' field not found in response: %s" % result
        batch = add_batch_error(message, batch)
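
# The run_client call above is retried "in case of an issue with token
# refresh". A minimal sketch of such a wrapper using the `retrying` package
# and som's identifiers Client; the import path, Client kwargs, and
# deidentify call are assumptions, not confirmed by this module:

from retrying import retry

@retry(wait_exponential_multiplier=1000,
       wait_exponential_max=10000,
       stop_max_attempt_number=5)
def _run_client_sketch(study, request):
    '''sketch: call DASHER via som's client, with exponential backoff.'''
    from som.api.identifiers import Client  # assumed import path
    client = Client(study=study)
    return client.deidentify(ids=request, study=study)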