def generate_compressed_file(files, filename=None, mode="w:gz", archive_basename=None):
    '''generate a tar.gz file (default) including a set of files.

       Parameters
       ==========
       files: list of file paths to add. Paths that do not exist are
           skipped (best effort, matching previous behavior).
       filename: the archive to write. If None, a random uuid-based
           name ending in .tar.gz is generated.
       mode: mode string passed to tarfile.open (default "w:gz").
       archive_basename: folder name files are placed under inside the
           archive. If None, derived from the archive filename.

       Returns
       =======
       the archive filename, or None when no files could be added (the
       empty archive is removed from disk in that case).
    '''
    if filename is None:
        filename = "%s.tar.gz" % str(uuid.uuid4())
    bot.debug("Compressing %s files into %s" % (len(files), filename))

    if archive_basename is None:
        archive_basename = os.path.basename(filename).split('.')[0]

    images_added = 0
    # Context manager guarantees the archive handle is closed even if
    # tar.add raises something other than FileNotFoundError (the
    # original tarfile.open/close pair leaked the handle on error)
    with tarfile.open(filename, mode) as tar:
        for name in files:
            try:
                # Make the archive flat with the images
                basename = "%s/%s" % (archive_basename, os.path.basename(name))
                tar.add(name, arcname=basename)
                images_added += 1
            except FileNotFoundError:
                # best effort: missing inputs are skipped silently
                pass

    if images_added == 0:
        # Fix: don't leave an empty archive behind on disk when we are
        # about to tell the caller there is no archive
        if os.path.exists(filename):
            os.remove(filename)
        filename = None
    return filename
def get_notifier():
    '''get notifier will return a basic pyinotify watch manager based
       on the user's inotify watch paths in settings. If pyinotify is
       not installed, logs an error and returns None.
    '''
    try:
        import pyinotify
    except ImportError as e:
        bot.error("pyinotify is not installed.")
        return None

    level = get_level()
    manager = pyinotify.WatchManager()

    for path, mask, processor_cls in settings.INOTIFIER_WATCH_PATHS:
        # Split "pkg.module.ClassName" into the module path and class name
        parts = processor_cls.split('.')
        module_path = '.'.join(parts[0:-1])
        class_name = parts[-1]
        # NOTE(review): get_level() result is passed as __import__'s
        # 'level' argument (relative-import depth) — confirm intended
        module = __import__(module_path, globals(), locals(), [class_name], level)
        Processor = getattr(module, class_name)
        manager.add_watch(path, mask, proc_fun=Processor())
        bot.debug("Adding watch on %s, processed by %s" % (path, processor_cls))

    return pyinotify.Notifier(manager)
def watcher_message(message, request=None):
    '''Report a message, attaching it to the request when one is given.

       If request is defined, the message is added to it via the
       messages framework; otherwise it goes to the debug log.
    '''
    if request is None:
        bot.debug(message)
    else:
        messages.info(request, message)
def get_pid_file(quiet=False):
    '''get_pid_file will return a path to write the pid file, based on
       the configuration (user settings). Falls back to /tmp when
       settings.BASE_DIR is not defined. Set quiet=True to suppress
       the debug log line.
    '''
    try:
        base_dir = settings.BASE_DIR
    except AttributeError:
        base_dir = "/tmp"
    pid_file = os.path.join(base_dir, "watcher.pid")

    if not quiet:
        if os.path.exists(pid_file):
            bot.debug("pid file is at %s" % (pid_file))
        else:
            bot.debug("pid file set to %s" % (pid_file))
    return pid_file
def handle(self, *args, **options):
    '''Print accumulated error logs for batches that have errors.

       When one or more batch ids are provided in options['bid'], only
       those batches (with has_error=True) are inspected; otherwise all
       batches flagged with errors are reported. Exits with status 1
       when no matching batches exist.
    '''
    bids = options['bid']
    if bids:
        bot.debug("Inspecting for errors for %s batch ids" % len(bids))
        batches = Batch.objects.filter(id__in=bids, has_error=True)
    else:
        batches = Batch.objects.filter(has_error=True)

    if not batches:
        bot.info("There are no batches with error.")
        sys.exit(1)

    for batch in batches:
        bot.info("\n# %s" % batch.uid)
        for error in batch.logs['errors']:
            bot.info(error)
def import_dicomdir(dicom_dir, run_get_identifiers=True):
    '''import dicom directory manages importing a valid dicom set into
       the application, and is a celery job triggered by the watcher.
       Here we also flag (and disclude) images that have a header value
       that indicates pixel identifiers.

       Parameters
       ==========
       dicom_dir: full path to a folder of .dcm files; the folder
           basename becomes the Batch uid.
       run_get_identifiers: when True (default), submit the imported
           batch to get_identifiers at the end.

       Returns
       =======
       the result of get_identifiers, the Batch itself, or None (empty
       batch, bad directory, or missing path).
    '''
    start_time = time.time()
    if not os.path.exists(dicom_dir):
        bot.warning('Cannot find %s' % dicom_dir)
        return

    try:
        dicom_files = ls_fullpath(dicom_dir)
    except NotADirectoryError:
        bot.error('%s is not a directory, skipping.' % dicom_dir)
        return

    bot.debug("Importing %s, found %s .dcm files" % (dicom_dir, len(dicom_files)))

    # The batch --> the folder with a set of dicoms tied to one request
    dcm_folder = os.path.basename(dicom_dir)
    batch, created = Batch.objects.get_or_create(uid=dcm_folder)
    batch.logs['STARTING_IMAGE_COUNT'] = len(dicom_files)

    # Data quality check: keep a record of study dates
    study_dates = dict()
    size_bytes = sum(os.path.getsize(f) for f in dicom_files)
    messages = []  # print all unique messages / warnings at end

    # Add in each dicom file to the series
    for dcm_file in dicom_files:
        try:
            # The dicom folder will be named based on the accession#
            dcm = read_file(dcm_file, force=True)
            dicom_uid = os.path.basename(dcm_file)

            # Keep track of studyDate
            study_date = dcm.get('StudyDate')
            if study_date not in study_dates:
                study_dates[study_date] = 0
            study_dates[study_date] += 1

            flag, flag_group, reason = has_burned_pixels(
                dicom_file=dcm_file, quiet=True, deid=STUDY_DEID)

            # If the image is flagged, we don't include and move on
            continue_processing = True
            if flag is True:
                if flag_group not in ["whitelist"]:
                    continue_processing = False
                    message = "%s is flagged in %s: %s, skipping" % (
                        dicom_uid, flag_group, reason)
                    batch = add_batch_warning(message, batch, quiet=True)
                    message = "BurnedInAnnotation found for batch %s" % batch.uid
                    if message not in messages:
                        messages.append(message)

            if continue_processing is True:
                # Create the Image object in the database
                # A dicom instance number must be unique for its batch
                dicom = Image.objects.create(batch=batch, uid=dicom_uid)

                # Save the dicom file to storage
                dicom = save_image_dicom(dicom=dicom, dicom_file=dcm_file)  # Also saves

                # Generate image name based on [SUID] added later
                # accessionnumberSUID.seriesnumber.imagenumber,
                name = "%s_%s.dcm" % (dcm.get('SeriesNumber'),
                                      dcm.get('InstanceNumber'))
                dicom.name = name
                dicom.save()

                # Only remove files successfully imported
                #os.remove(dcm_file)

        # Note that on error we don't remove files
        except InvalidDicomError:
            message = "InvalidDicomError: %s skipping." % (dcm_file)
            batch = add_batch_error(message, batch)
        except KeyError:
            message = "KeyError: %s is possibly invalid, skipping." % (dcm_file)
            batch = add_batch_error(message, batch)
        except Exception as e:
            # Fix: the message was previously built here and then silently
            # discarded — record it on the batch like the other handlers do
            message = "Exception: %s, for %s, skipping." % (e, dcm_file)
            batch = add_batch_error(message, batch)

    # Print summary messages all at once
    for message in messages:
        bot.warning(message)

    if len(study_dates) > 1:
        # Fix: format string was "% study dates found for %s" — the stray
        # "% s" conversion mangled the count into "<N>tudy dates..." — and it
        # referenced the loop-leftover dcm_file instead of the directory
        message = "%s study dates found for %s" % (len(study_dates), dicom_dir)
        batch = add_batch_error(message, batch)

    # Save batch thus far
    batch.qa['StudyDate'] = study_dates
    batch.qa['StartTime'] = start_time
    batch.qa['SizeBytes'] = size_bytes
    batch.save()

    # If there were no errors on import, we should remove the directory
    #if not batch.has_error:
    # Should only be called given no error, and should trigger error if not empty
    #os.rmdir(dicom_dir)

    # At the end, submit the dicoms to be anonymized as a batch
    count = batch.image_set.count()
    if count == 0:
        # No images for further processing
        batch.status = "EMPTY"
        batch.qa['FinishTime'] = time.time()
        message = "%s is flagged EMPTY, no images pass filter" % (batch.id)
        batch = add_batch_warning(message, batch)
        batch.save()
        return

    if ANONYMIZE_PIXELS is True:
        bot.warning(
            "Anonimization of pixels is not yet implemented. Images were skipped."
        )
        # When this is implemented, the function will be modified to add these
        # images to the batch, which will then be first sent through a function
        # to scrub pixels before header data is looked at.
        # scrub_pixels(bid=batch.id)

    if run_get_identifiers is True:
        bot.debug("get_identifiers submit batch %s with %s dicoms."
                  % (batch.uid, count))
        return get_identifiers(bid=batch.id)

    bot.debug("Finished batch %s with %s dicoms" % (batch.uid, count))
    return batch
def get_identifiers(bid, study=None, run_replace_identifiers=True):
    '''get identifiers is the celery task to get identifiers for
       all images in a batch. A batch is a set of dicom files that may
       include more than one series/study. This is done by way of
       sending one restful call to the DASHER endpoint. If
       ANONYMIZE_RESTFUL is False under settings, this function
       doesn't run.

       Parameters
       ==========
       bid: the id of the Batch to process
       study: the som study name; defaults to SOM_STUDY when None
       run_replace_identifiers: when True (default), chain directly
           into replace_identifiers on success.

       Returns
       =======
       the result of replace_identifiers, the BatchIdentifiers, or
       None on error / when restful anonymization is disabled.
    '''
    batch = Batch.objects.get(id=bid)
    if study is None:
        study = SOM_STUDY

    if ANONYMIZE_RESTFUL is not True:
        bot.debug(
            "Restful de-identification skipped [ANONYMIZE_RESTFUL is False]")
        change_status(batch, "DONEPROCESSING")
        change_status(batch.image_set.all(), "DONEPROCESSING")
        return

    # Process all dicoms at once, one call to the API
    dicom_files = batch.get_image_paths()
    batch.change_images_status('PROCESSING')
    batch.save()  # redundant

    try:
        # we are uploading a zip, doesn't make sense to preserve
        # image level metadata
        ids = get_ids(dicom_files=dicom_files, expand_sequences=False)
    except FileNotFoundError:
        batch.status = "ERROR"
        message = "batch %s is missing dicom files and should be reprocessed" % (
            batch.id)
        batch = add_batch_warning(message, batch)
        batch.save()
        # Fix: without this return, execution fell through and 'ids' was
        # undefined below, raising NameError instead of stopping cleanly
        return

    # Prepare identifiers with only minimal required
    # This function expects many items for one entity, returns
    # request['identifiers'] --> [ entity-with-study-item ]
    request = prepare_identifiers_request(ids)  # force: True

    bot.debug("som.client making request to anonymize batch %s" % (bid))

    # Run with retrying, in case issue with token refresh
    result = None
    try:
        result = run_client(study, request)
    except Exception:
        # Fix: narrowed from a bare except (which also swallowed
        # KeyboardInterrupt/SystemExit); on any error, don't continue,
        # don't launch new job
        message = "error with client, stopping job."
        batch = add_batch_error(message, batch)
        batch.status = "ERROR"
        batch.qa['FinishTime'] = time.time()
        batch.save()

    # Create a batch for all results
    if result is not None:
        if "results" in result:
            batch_ids, created = BatchIdentifiers.objects.get_or_create(
                batch=batch)
            batch_ids.response = result['results']
            batch_ids.ids = ids
            batch_ids.save()
            batch.qa['DasherFinishTime'] = time.time()
            if run_replace_identifiers is True:
                return replace_identifiers(bid=bid)
            else:
                return batch_ids
        else:
            message = "'results' field not found in response: %s" % result
            batch = add_batch_error(message, batch)