def get_notifier():
    '''get_notifier returns a pyinotify Notifier built from a basic watch
       manager, based on the user's inotify watch paths in settings.
       If pyinotify is not installed, returns None.
    '''
    try:
        import pyinotify
    except ImportError:
        bot.error("pyinotify is not installed.")
        return None

    level = get_level()
    wm = pyinotify.WatchManager()

    for path, mask, processor_cls in settings.INOTIFIER_WATCH_PATHS:
        cls_path = '.'.join(processor_cls.split('.')[0:-1])
        cls = processor_cls.split('.')[-1]
        mod = __import__(cls_path, globals(), locals(), [cls], level)
        Processor = getattr(mod, cls)
        wm.add_watch(path, mask, proc_fun=Processor())
        bot.debug("Adding watch on %s, processed by %s" % (path, processor_cls))

    notifier = pyinotify.Notifier(wm)
    return notifier
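

# Illustrative sketch (not part of the original module): one way to drive the
# notifier returned by get_notifier(). pyinotify's Notifier.loop() blocks and
# dispatches filesystem events to the Processor classes configured in
# settings.INOTIFIER_WATCH_PATHS. The function name below is illustrative.
def _example_run_notifier():
    notifier = get_notifier()
    if notifier is None:
        bot.error("Watcher not started: pyinotify is unavailable.")
        return
    notifier.loop()  # blocks, calling each watch's proc_fun on its events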


def update_cached(subfolder=None):
    '''update the queue (Batch objects with status QUEUE), intended to be run
       when there are new folders to find and queue. First preference goes to
       a folder supplied to the function, then to application defaults.
       Returns None if no input folders can be determined.
    '''
    CHECK_FOLDERS = None

    # First preference goes to variable given at runtime
    if subfolder is not None:
        CHECK_FOLDERS = subfolder

    # Second preference goes to DATA_INPUT_FOLDERS
    if CHECK_FOLDERS is None and DATA_INPUT_FOLDERS not in ['', None]:
        CHECK_FOLDERS = DATA_INPUT_FOLDERS

    # Final preference goes to the data subfolder. We don't parse root;
    # the base of data has directories that need to be organized.
    if CHECK_FOLDERS is None:
        if DATA_SUBFOLDER is not None:
            CHECK_FOLDERS = "%s/%s" % (DATA_BASE, DATA_SUBFOLDER)
        else:
            bot.error("Specify DATA_INPUT_FOLDERS in settings for cached jobs.")
            return

    if not isinstance(CHECK_FOLDERS, list):
        CHECK_FOLDERS = [CHECK_FOLDERS]

    count = 0
    current = [x.uid for x in Batch.objects.all()]

    for base in CHECK_FOLDERS:
        print('Checking base %s' % base)
        if os.path.exists(base) and os.path.isdir(base):

            # If the base doesn't end in a date, queue the folder itself
            if not re.search('[0-9]{10}$', base):
                contenders = [base]
            else:
                contenders = get_contenders(base=base, current=current)

            for contender in contenders:
                dicom_dir = "%s/%s" % (base, contender)
                dcm_folder = os.path.basename(dicom_dir)
                batch, created = Batch.objects.get_or_create(uid=dcm_folder)
                if created is True:
                    batch.status = "QUEUE"
                    batch.logs['DICOM_DIR'] = dicom_dir
                    count += 1
                    batch.save()

    print("Added %s contenders for processing queue." % count)
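

# Illustrative sketch (not part of the original module): update_cached accepts
# a single folder or a list of folders; either is wrapped into CHECK_FOLDERS
# above. The paths and function name below are hypothetical.
def _example_queue_new_folders():
    update_cached(subfolder="/data/incoming")                    # one folder
    update_cached(subfolder=["/data/incoming", "/data/reruns"])  # several folders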


def clean_up(bid, remove_batch=False):
    '''clean_up checks a batch for errors, and if none exist, clears the
       entries from the database. If no errors occurred, the original folder
       would have been deleted after dicom import.
    '''
    try:
        batch = Batch.objects.get(id=bid)
    except Batch.DoesNotExist:
        bot.error("In clean_up: Batch %s does not exist." % bid)
        return None

    # Force clean up for now, we don't have much server space
    has_error = batch.has_error
    has_error = False

    if not has_error:
        images = batch.image_set.all()
        for image in images:
            image.image.delete()  # deletes the image file from storage
            image.delete()        # deletes the Image object
        if remove_batch is True:
            batch.delete()  # django-cleanup will delete files on delete
    else:
        bot.warning("Batch %s has error, will not be cleaned up." % batch.id)
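

# Illustrative sketch (not part of the original module): clean_up is intended
# to run once a batch reaches a terminal state (e.g. "DONE" after upload). It
# removes stored image files, the Image objects, and optionally the Batch.
# The function name below is illustrative.
def _example_clean_finished_batches():
    for batch in Batch.objects.filter(status="DONE"):
        clean_up(bid=batch.id, remove_batch=True)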


def import_dicomdir(dicom_dir, run_get_identifiers=True):
    '''import_dicomdir manages importing a valid dicom set into the
       application, and is a celery job triggered by the watcher. Here we
       also flag (and exclude) images that have a header value that
       indicates pixel identifiers.
    '''
    start_time = time.time()

    if os.path.exists(dicom_dir):
        try:
            dicom_files = ls_fullpath(dicom_dir)
        except NotADirectoryError:
            bot.error('%s is not a directory, skipping.' % dicom_dir)
            return

        bot.debug("Importing %s, found %s .dcm files" % (dicom_dir, len(dicom_files)))

        # The batch --> the folder with a set of dicoms tied to one request
        dcm_folder = os.path.basename(dicom_dir)
        batch, created = Batch.objects.get_or_create(uid=dcm_folder)
        batch.logs['STARTING_IMAGE_COUNT'] = len(dicom_files)

        # Data quality check: keep a record of study dates
        study_dates = dict()
        size_bytes = sum(os.path.getsize(f) for f in dicom_files)
        messages = []  # print all unique messages / warnings at end

        # Add in each dicom file to the series
        for dcm_file in dicom_files:
            try:
                # The dicom folder will be named based on the accession#
                dcm = read_file(dcm_file, force=True)
                dicom_uid = os.path.basename(dcm_file)

                # Keep track of StudyDate
                study_date = dcm.get('StudyDate')
                if study_date not in study_dates:
                    study_dates[study_date] = 0
                study_dates[study_date] += 1

                flag, flag_group, reason = has_burned_pixels(dicom_file=dcm_file,
                                                             quiet=True,
                                                             deid=STUDY_DEID)

                # If the image is flagged, we don't include it, and move on
                continue_processing = True
                if flag is True:
                    if flag_group not in ["whitelist"]:
                        continue_processing = False
                        message = "%s is flagged in %s: %s, skipping" % (dicom_uid,
                                                                         flag_group,
                                                                         reason)
                        batch = add_batch_warning(message, batch, quiet=True)
                        message = "BurnedInAnnotation found for batch %s" % batch.uid
                        if message not in messages:
                            messages.append(message)

                if continue_processing is True:
                    # Create the Image object in the database
                    # A dicom instance number must be unique for its batch
                    dicom = Image.objects.create(batch=batch, uid=dicom_uid)

                    # Save the dicom file to storage
                    # basename = "%s/%s" %(batch.id,os.path.basename(dcm_file))
                    dicom = save_image_dicom(dicom=dicom, dicom_file=dcm_file)  # Also saves

                    # Generate image name based on [SUID] added later
                    # accessionnumberSUID.seriesnumber.imagenumber,
                    name = "%s_%s.dcm" % (dcm.get('SeriesNumber'), dcm.get('InstanceNumber'))
                    dicom.name = name
                    dicom.save()

                    # Only remove files successfully imported
                    #os.remove(dcm_file)

            # Note that on error we don't remove files
            except InvalidDicomError:
                message = "InvalidDicomError: %s skipping." % dcm_file
                batch = add_batch_error(message, batch)
            except KeyError:
                message = "KeyError: %s is possibly invalid, skipping." % dcm_file
                batch = add_batch_error(message, batch)
            except Exception as e:
                message = "Exception: %s, for %s, skipping." % (e, dcm_file)
                batch = add_batch_error(message, batch)

        # Print summary messages all at once
        for message in messages:
            bot.warning(message)

        if len(study_dates) > 1:
            message = "%s study dates found for %s" % (len(study_dates), dicom_dir)
            batch = add_batch_error(message, batch)

        # Save batch thus far
        batch.qa['StudyDate'] = study_dates
        batch.qa['StartTime'] = start_time
        batch.qa['SizeBytes'] = size_bytes
        batch.save()

        # If there were no errors on import, we should remove the directory
        #if not batch.has_error:

            # Should only be called given no error, and should trigger error if not empty
            #os.rmdir(dicom_dir)

        # At the end, submit the dicoms to be anonymized as a batch
        count = batch.image_set.count()
        if count > 0:
            if ANONYMIZE_PIXELS is True:
                bot.warning("Anonymization of pixels is not yet implemented. Images were skipped.")
                # When this is implemented, the function will be modified to add these images
                # to the batch, which will then be first sent through a function to
                # scrub pixels before header data is looked at.
                # scrub_pixels(bid=batch.id)

            #else:
            if run_get_identifiers is True:
                bot.debug("get_identifiers submit batch %s with %s dicoms." % (batch.uid, count))
                return get_identifiers(bid=batch.id)
            else:
                bot.debug("Finished batch %s with %s dicoms" % (batch.uid, count))
                return batch
        else:
            # No images for further processing
            batch.status = "EMPTY"
            batch.qa['FinishTime'] = time.time()
            message = "%s is flagged EMPTY, no images pass filter" % batch.id
            batch = add_batch_warning(message, batch)
            batch.save()
            return
    else:
        bot.warning('Cannot find %s' % dicom_dir)
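

# Illustrative sketch (not part of the original module): the watcher hands each
# completed folder to import_dicomdir. If the function is registered as a celery
# task (as the docstring suggests), it would normally be queued asynchronously;
# calling it directly, as here, runs the import in-process. The path and
# function name below are hypothetical.
def _example_import_folder():
    import_dicomdir(dicom_dir="/data/incoming/IR661B54", run_get_identifiers=True)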


def upload_storage(batch_ids=None):
    '''upload_storage will, as a batch, send all batches with DONEPROCESSING
       status to Google Cloud Storage.
    '''
    from sendit.settings import (GOOGLE_CLOUD_STORAGE,
                                 SEND_TO_GOOGLE,
                                 GOOGLE_PROJECT_NAME,
                                 GOOGLE_STORAGE_COLLECTION)

    if batch_ids is None:
        batches = Batch.objects.filter(status="DONEPROCESSING")
    else:
        batches = Batch.objects.filter(status="DONEPROCESSING", id__in=batch_ids)

    # All variables must be defined for sending!
    if GOOGLE_CLOUD_STORAGE in [None, ""]:
        SEND_TO_GOOGLE = False
    if GOOGLE_PROJECT_NAME in [None, ""]:
        SEND_TO_GOOGLE = False
    if GOOGLE_STORAGE_COLLECTION in [None, ""]:
        SEND_TO_GOOGLE = False

    if SEND_TO_GOOGLE is True:
        from deid.identifiers import get_timestamp

        try:
            client = get_client(bucket_name=GOOGLE_CLOUD_STORAGE,
                                project_name=GOOGLE_PROJECT_NAME)
        # Client is unreachable, usually network is being stressed
        # this is why we instantiate in batches to upload
        except:  # OSError and ServiceUnavailable
            bot.error("Cannot connect to client.")
            return

        # Create/get BigQuery dataset, collection should be IRB
        dataset = client.get_or_create_dataset(GOOGLE_STORAGE_COLLECTION)

        # Create a table based on ...
        table = client.get_or_create_table(dataset=dataset,    # All tables named dicom
                                           table_name='dicom',
                                           schema=dicom_schema)

        for batch in batches:
            valid = True
            batch.qa['UploadStartTime'] = time.time()

            # Reuse the name for this batch's BatchIdentifiers record
            batch_ids = BatchIdentifiers.objects.get(batch=batch)

            # Retrieve only images that aren't in PHI folder
            images = batch.get_finished()

            # Stop if no images pass filters
            if len(images) == 0:
                change_status(batch, "EMPTY")
                message = "batch %s has no images for processing, stopping upload" % batch.id
                batch = add_batch_warning(message, batch)
                batch.save()
                continue

            # IR0001fa6_20160525_IR661B54.tar.gz
            # (coded MRN?)_jittereddate_studycode
            required_fields = ['AccessionNumber', 'PatientID']
            for required_field in required_fields:
                if required_field not in batch_ids.shared:
                    change_status(batch, "ERROR")
                    message = "batch ids %s do not have shared PatientID or AccessionNumber, stopping upload" % batch.id
                    batch = add_batch_warning(message, batch)
                    batch.save()
                    valid = False
            if valid is False:
                continue

            # Add additional shared metadata
            studycode = batch_ids.shared['AccessionNumber']
            coded_mrn = batch_ids.shared['PatientID']
            batch_ids.shared['CodedPatientID'] = coded_mrn
            batch_ids.shared['ContentType'] = 'application/gzip'
            batch_ids.shared['CodedAccessionNumberID'] = studycode
            batch_ids.shared['NumberOfSeries'] = batch.qa['NumberOfSeries']
            batch_ids.shared['Series'] = batch.qa['Series']
            batch_ids.shared['RemovedSeries'] = batch.qa['FlaggedSeries']

            timestamp = get_timestamp(batch_ids.shared['StudyDate'],
                                      format="%Y%m%d")
            compressed_filename = "%s/%s_%s_%s.tar.gz" % (batch.get_path(),
                                                          coded_mrn,
                                                          timestamp,
                                                          studycode)
            compressed_file = generate_compressed_file(files=images,  # mode="w:gz"
                                                       filename=compressed_filename)

            # File will be None if no files added
            if compressed_file is None:
                change_status(batch, "ERROR")
                message = "batch %s problem compressing file, stopping upload" % batch.id
                batch = add_batch_error(message, batch)
                batch.save()
                valid = False
                continue

            # We prepare shared metadata for one item
            batch_ids.shared['IMAGE_COUNT'] = len(images)
            batch.logs['IMAGE_COUNT'] = len(images)
            batch_ids.save()
            batch.save()

            if valid is True:
                metadata = deepcopy(batch_ids.shared)
                metadata['DicomHeader'] = json.dumps(metadata)
                metadata = {compressed_file: metadata}
                bot.log("Uploading %s with %s images to Google Storage %s" % (os.path.basename(compressed_file),
                                                                              len(images),
                                                                              GOOGLE_CLOUD_STORAGE))

                # We only expect to have one entity per batch
                kwargs = {"items": [compressed_file],
                          "table": table,
                          "study": SOM_STUDY,
                          "metadata": metadata,
                          "batch": False}  # upload in batches at END

                # Batch metadata
                upload_dataset(client=client, k=kwargs)

                # Clean up compressed file
                if os.path.exists(compressed_file):
                    os.remove(compressed_file)

                # Finish and record time elapsed
                change_status(batch, "DONE")
                batch.qa['UploadFinishTime'] = time.time()
                total_time = batch.qa['UploadFinishTime'] - batch.qa['UploadStartTime']
                bot.info("Total time for %s: %s images is %f min" % (batch.uid,
                                                                     batch.image_set.count(),
                                                                     total_time / 60))
                batch.qa['ElapsedTime'] = total_time
                batch.save()

        # After image upload, metadata can be uploaded on one batch
        # If this isn't optimal, change "batch" in kwargs to False
        return client.batch.runInsert(table)
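

# Illustrative sketch (not part of the original code): upload_storage silently
# skips the upload unless SEND_TO_GOOGLE is True and all three storage settings
# are non-empty. A hypothetical sendit/settings.py fragment might look like
# (all values below are example values, not real configuration):
#
#   SEND_TO_GOOGLE = True
#   GOOGLE_CLOUD_STORAGE = "irb-dicom-bucket"       # bucket name
#   GOOGLE_PROJECT_NAME = "som-irb-project"         # project name
#   GOOGLE_STORAGE_COLLECTION = "IRB000001"         # collection / IRB identifier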


def upload_storage(batch_ids=None):
    '''upload_storage will, as a batch, send all batches with DONEPROCESSING
       status to Google Cloud Storage.
    '''
    from sendit.settings import (GOOGLE_CLOUD_STORAGE,
                                 SEND_TO_GOOGLE,
                                 GOOGLE_PROJECT_NAME,
                                 GOOGLE_PROJECT_ID_HEADER,
                                 GOOGLE_STORAGE_COLLECTION)

    if batch_ids is None:
        batches = Batch.objects.filter(status="DONEPROCESSING")
    else:
        batches = Batch.objects.filter(status="DONEPROCESSING", id__in=batch_ids)

    # All variables must be defined for sending!
    if GOOGLE_CLOUD_STORAGE in [None, ""]:
        SEND_TO_GOOGLE = False
    if GOOGLE_PROJECT_NAME in [None, ""]:
        SEND_TO_GOOGLE = False
    if GOOGLE_STORAGE_COLLECTION in [None, ""]:
        SEND_TO_GOOGLE = False

    if SEND_TO_GOOGLE is True:
        from deid.identifiers import get_timestamp

        # I'm not sure we need this
        #if GOOGLE_PROJECT_ID_HEADER is not None:
        #    client.headers["x-goog-project-id"] = GOOGLE_PROJECT_ID_HEADER

        try:
            client = get_client(bucket_name=GOOGLE_CLOUD_STORAGE,
                                project_name=GOOGLE_PROJECT_NAME)
        # Client is unreachable, usually network is being stressed
        except:  # OSError and ServiceUnavailable
            bot.error("Cannot connect to client.")
            return

        collection = client.create_collection(uid=GOOGLE_STORAGE_COLLECTION)

        for batch in batches:
            valid = True

            # Reuse the name for this batch's BatchIdentifiers record
            batch_ids = BatchIdentifiers.objects.get(batch=batch)

            # Retrieve only images that aren't in PHI folder
            images = batch.get_finished()

            # Stop if no images pass filters
            if len(images) == 0:
                change_status(batch, "EMPTY")
                message = "batch %s has no images for processing, stopping upload" % batch.id
                batch = add_batch_warning(message, batch)
                batch.save()
                continue

            # IR0001fa6_20160525_IR661B54.tar.gz
            # (coded MRN?)_jittereddate_studycode
            required_fields = ['AccessionNumber', 'PatientID']
            for required_field in required_fields:
                if required_field not in batch_ids.shared:
                    change_status(batch, "ERROR")
                    message = "batch ids %s do not have shared PatientID or AccessionNumber, stopping upload" % batch.id
                    batch = add_batch_warning(message, batch)
                    batch.save()
                    valid = False
            if valid is False:
                continue

            studycode = batch_ids.shared['AccessionNumber']
            coded_mrn = batch_ids.shared['PatientID']
            timestamp = get_timestamp(batch_ids.shared['StudyDate'],
                                      format="%Y%m%d")
            compressed_filename = "%s/%s_%s_%s.tar.gz" % (batch.get_path(),
                                                          coded_mrn,
                                                          timestamp,
                                                          studycode)
            compressed_file = generate_compressed_file(files=images,  # mode="w:gz"
                                                       filename=compressed_filename)

            # File will be None if no files added
            if compressed_file is None:
                change_status(batch, "ERROR")
                message = "batch %s problem compressing file, stopping upload" % batch.id
                batch = add_batch_error(message, batch)
                batch.save()
                valid = False
                continue

            # We prepare shared metadata for one item
            batch_ids.shared['IMAGE_COUNT'] = len(images)
            batch.logs['IMAGE_COUNT'] = len(images)
            batch_ids.save()
            batch.save()

            if valid is True:
                items_metadata = batch_ids.shared
                items = {compressed_file: items_metadata}
                cleaned = deepcopy(batch_ids.cleaned)
                metadata = prepare_entity_metadata(cleaned_ids=cleaned)
                bot.log("Uploading %s with %s images to Google Storage %s" % (os.path.basename(compressed_file),
                                                                              len(images),
                                                                              GOOGLE_CLOUD_STORAGE))

                # We only expect to have one entity per batch
                uid = list(metadata.keys())[0]
                kwargs = {"images": [compressed_file],
                          "collection": collection,
                          "uid": uid,
                          "entity_metadata": metadata[uid],
                          "images_metadata": items}

                # Batch metadata
                upload_dataset(client=client, k=kwargs)

                # Clean up compressed file
                if os.path.exists(compressed_file):
                    os.remove(compressed_file)

                # Finish and record time elapsed
                change_status(batch, "DONE")
                batch.qa['FinishTime'] = time.time()
                total_time = batch.qa['FinishTime'] - batch.qa['StartTime']
                bot.info("Total time for %s: %s images is %f min" % (batch.uid,
                                                                     batch.image_set.count(),
                                                                     total_time / 60))
                batch.qa['ElapsedTime'] = total_time
                batch.save()
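

# Illustrative sketch (not part of the original module): upload_storage can be
# called with no argument to upload every batch in DONEPROCESSING status, or
# with a list of database ids to restrict the set. The ids and function name
# below are hypothetical.
def _example_upload_finished_batches():
    upload_storage()                      # all batches with status DONEPROCESSING
    upload_storage(batch_ids=[101, 102])  # only these (hypothetical) batch ids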