def prepare_identifiers_request(ids, force=True):
    '''prepare_identifiers_request takes in the output from deid
       get_identifiers, and returns the minimal request to send to DASHER.
       The assumption is that one batch of ids ==> one entity / Accession Number
    '''
    # Enforce application default
    entity_source = entity_options['id_source']['name']
    entity_field = entity_options['id_source']['field']
    item_source = item_options['id_source']['name']
    item_field = item_options['id_source']['field']
    entity_times = entity_options['id_timestamp']
    item_times = item_options['id_timestamp']

    # Entity --> Patient
    entity = {"id_source": entity_source, "items": []}

    # Item --> Study (to represent all images)
    new_item = {"id_source": item_source}

    for item_id, item in ids.items():

        # Entity ID
        if "id" not in entity:
            entity_id = item[entity_field].replace('-', '')
            entity["id"] = entity_id

        # Entity timestamp
        if "id_timestamp" not in entity:
            entity_ts = get_listfirst(item=item, group=entity_times['date'])  # 20021202
            if entity_ts is not None:
                timestamp = get_timestamp(item_date=entity_ts)  # 2002-12-02T00:00:00Z
                entity['id_timestamp'] = timestamp

        # Study timestamp
        if "id_timestamp" not in new_item:
            item_ts = get_listfirst(item=item, group=item_times['date'])
            if item_ts is not None:
                timestamp = get_timestamp(item_date=item_ts)
                new_item["id_timestamp"] = timestamp

        # Study ID (accession#)
        if "id" not in new_item:
            new_item["id"] = item[item_field]

    # We are only including one study item to represent all images
    entity["items"].append(new_item)

    # Expected format of dasher
    ids = {"identifiers": [entity]}
    return ids
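
# Usage sketch (illustration only, not part of the module). It assumes the
# application defaults map the entity to PatientID and the item (study) to
# AccessionNumber, with PatientBirthDate as the entity date, as the inline
# comments above suggest; the field names and values here are hypothetical.
def _example_prepare_identifiers_request():
    ids = {"image1.dcm": {"PatientID": "1234-5678",
                          "AccessionNumber": "IR661B54",
                          "PatientBirthDate": "20021202"}}
    request = prepare_identifiers_request(ids)
    # request would look roughly like:
    # {"identifiers": [{"id_source": "PatientID",
    #                   "id": "12345678",                       # dashes removed
    #                   "id_timestamp": "2002-12-02T00:00:00Z",
    #                   "items": [{"id_source": "AccessionNumber",
    #                              "id": "IR661B54"}]}]}
    return request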
def get_entity_timestamp(dicom, date_field=None):
    '''get_entity_timestamp will return a timestamp from the dicom header
       based on the PatientBirthDate (default) if a field is not provided.
    '''
    if date_field is None:
        date_field = "PatientBirthDate"
    item_date = dicom.get(date_field)
    return get_timestamp(item_date=item_date)
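
# Usage sketch (illustration only): the default date field is
# PatientBirthDate, but any DICOM date field can be passed instead.
def _example_get_entity_timestamp(dicom):
    return get_entity_timestamp(dicom, date_field="StudyDate")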
def jitter_timestamp(field, value, item):
    '''if present, jitter a timestamp in dicom field "field" by the number
       of days specified by "value". The value can be positive or negative.
    '''
    value = to_int(value)
    original = item.get(field, None)
    if original is not None:
        jittered = get_timestamp(item_date=original,
                                 jitter_days=value,
                                 format="%Y%m%d")
        bot.debug("JITTER %s + (%s): %s" % (original, value, jittered))
        item[field] = jittered
    return item
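
# Usage sketch (illustration only): shift a StudyDate header back ten days.
# The item is any dict-like header; the hypothetical result assumes
# get_timestamp returns the jittered date in the same %Y%m%d format.
def _example_jitter_timestamp():
    item = {"StudyDate": "20160525"}
    item = jitter_timestamp(field="StudyDate", value=-10, item=item)
    # item["StudyDate"] ==> "20160515"
    return item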
def get_item_timestamp(dicom, date_field=None, time_field=None):
    '''get_item_timestamp will return the UTC time for an instance.
       This is derived from the InstanceCreationDate and InstanceCreationTime.
       If the time is not set, only the date is used.
       # testing function
       https://gist.github.com/vsoch/23d6b313bd231cad855877dc544c98ed
    '''
    if time_field is None:
        time_field = "InstanceCreationTime"
    if date_field is None:
        date_field = "InstanceCreationDate"
    item_time = dicom.get(time_field, "")
    item_date = dicom.get(date_field)
    return get_timestamp(item_date=item_date,
                         item_time=item_time)
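
# Usage sketch (illustration only), assuming pydicom as the dicom backend;
# the helper accepts anything with a dict-like .get() over header fields.
def _example_get_item_timestamp():
    from pydicom import dcmread
    dicom = dcmread("image1.dcm")        # hypothetical file
    return get_item_timestamp(dicom)     # InstanceCreationDate/Time -> UTC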
def upload_storage(batch_ids=None):
    '''upload_storage will, as a batch, send all batches with DONEPROCESSING
       status to Google Cloud Storage.
    '''
    from sendit.settings import (GOOGLE_CLOUD_STORAGE,
                                 SEND_TO_GOOGLE,
                                 GOOGLE_PROJECT_NAME,
                                 GOOGLE_STORAGE_COLLECTION)

    if batch_ids is None:
        batches = Batch.objects.filter(status="DONEPROCESSING")
    else:
        batches = Batch.objects.filter(status="DONEPROCESSING",
                                       id__in=batch_ids)

    # All variables must be defined for sending!
    if GOOGLE_CLOUD_STORAGE in [None, ""]:
        SEND_TO_GOOGLE = False
    if GOOGLE_PROJECT_NAME in [None, ""]:
        SEND_TO_GOOGLE = False
    if GOOGLE_STORAGE_COLLECTION in [None, ""]:
        SEND_TO_GOOGLE = False

    if SEND_TO_GOOGLE is True:
        from deid.identifiers import get_timestamp

        try:
            client = get_client(bucket_name=GOOGLE_CLOUD_STORAGE,
                                project_name=GOOGLE_PROJECT_NAME)

        # The client is unreachable usually when the network is being
        # stressed; this is why we instantiate it per batch of uploads
        except Exception:  # OSError and ServiceUnavailable
            bot.error("Cannot connect to client.")
            return

        # Create/get BigQuery dataset, collection should be IRB
        dataset = client.get_or_create_dataset(GOOGLE_STORAGE_COLLECTION)

        # Create a table based on the dicom schema
        table = client.get_or_create_table(dataset=dataset,  # All tables named dicom
                                           table_name='dicom',
                                           schema=dicom_schema)

        for batch in batches:
            valid = True
            batch.qa['UploadStartTime'] = time.time()
            batch_ids = BatchIdentifiers.objects.get(batch=batch)

            # Retrieve only images that aren't in the PHI folder
            images = batch.get_finished()

            # Stop if no images pass filters
            if len(images) == 0:
                change_status(batch, "EMPTY")
                message = "batch %s has no images for processing, stopping upload" % (batch.id)
                batch = add_batch_warning(message, batch)
                batch.save()
                continue

            # IR0001fa6_20160525_IR661B54.tar.gz
            # (coded MRN?)_jittereddate_studycode
            required_fields = ['AccessionNumber', 'PatientID']
            for required_field in required_fields:
                if required_field not in batch_ids.shared:
                    change_status(batch, "ERROR")
                    message = "batch ids %s do not have shared PatientID or AccessionNumber, stopping upload" % (batch.id)
                    batch = add_batch_warning(message, batch)
                    batch.save()
                    valid = False

            if valid is False:
                continue

            # Add additional shared metadata
            studycode = batch_ids.shared['AccessionNumber']
            coded_mrn = batch_ids.shared['PatientID']
            batch_ids.shared['CodedPatientID'] = coded_mrn
            batch_ids.shared['ContentType'] = 'application/gzip'
            batch_ids.shared['CodedAccessionNumberID'] = studycode
            batch_ids.shared['NumberOfSeries'] = batch.qa['NumberOfSeries']
            batch_ids.shared['Series'] = batch.qa['Series']
            batch_ids.shared['RemovedSeries'] = batch.qa['FlaggedSeries']

            timestamp = get_timestamp(batch_ids.shared['StudyDate'],
                                      format="%Y%m%d")

            compressed_filename = "%s/%s_%s_%s.tar.gz" % (batch.get_path(),
                                                          coded_mrn,
                                                          timestamp,
                                                          studycode)
            compressed_file = generate_compressed_file(files=images,  # mode="w:gz"
                                                       filename=compressed_filename)

            # File will be None if no files were added
            if compressed_file is None:
                change_status(batch, "ERROR")
                message = "batch %s problem compressing file, stopping upload" % (batch.id)
                batch = add_batch_error(message, batch)
                batch.save()
                valid = False
                continue

            # We prepare shared metadata for one item
            batch_ids.shared['IMAGE_COUNT'] = len(images)
            batch.logs['IMAGE_COUNT'] = len(images)
            batch_ids.save()
            batch.save()

            if valid is True:
                metadata = deepcopy(batch_ids.shared)
                # Serialize the shared header before the DicomHeader key exists
                metadata['DicomHeader'] = json.dumps(metadata)
                metadata = {compressed_file: metadata}
                bot.log("Uploading %s with %s images to Google Storage %s" % (
                    os.path.basename(compressed_file),
                    len(images),
                    GOOGLE_CLOUD_STORAGE))

                # We only expect to have one entity per batch
                kwargs = {"items": [compressed_file],
                          "table": table,
                          "study": SOM_STUDY,
                          "metadata": metadata,
                          "batch": False}  # upload in batches at END

                # Batch metadata
                upload_dataset(client=client, k=kwargs)

                # Clean up compressed file
                if os.path.exists(compressed_file):
                    os.remove(compressed_file)

            # Finish and record time elapsed
            change_status(batch, "DONE")
            batch.qa['UploadFinishTime'] = time.time()
            total_time = batch.qa['UploadFinishTime'] - batch.qa['UploadStartTime']
            bot.info("Total time for %s: %s images is %f min" % (batch.uid,
                                                                 batch.image_set.count(),
                                                                 total_time / 60))
            batch.qa['ElapsedTime'] = total_time
            batch.save()

        # After image upload, metadata can be uploaded on one batch
        # If this isn't optimal, change "batch" in kwargs to False
        return client.batch.runInsert(table)
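
# Filename convention sketch (illustration only), spelling out the example
# from the comments above: <coded MRN>_<jittered date>_<study code>.tar.gz.
def _example_compressed_filename():
    coded_mrn = "IR0001fa6"    # coded PatientID
    timestamp = "20160525"     # jittered StudyDate, %Y%m%d
    studycode = "IR661B54"     # coded AccessionNumber
    # ==> "IR0001fa6_20160525_IR661B54.tar.gz"
    return "%s_%s_%s.tar.gz" % (coded_mrn, timestamp, studycode)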
def upload_storage(batch_ids=None):
    '''upload_storage will, as a batch, send all batches with DONEPROCESSING
       status to Google Cloud Storage.
    '''
    from sendit.settings import (GOOGLE_CLOUD_STORAGE,
                                 SEND_TO_GOOGLE,
                                 GOOGLE_PROJECT_NAME,
                                 GOOGLE_PROJECT_ID_HEADER,
                                 GOOGLE_STORAGE_COLLECTION)

    if batch_ids is None:
        batches = Batch.objects.filter(status="DONEPROCESSING")
    else:
        batches = Batch.objects.filter(status="DONEPROCESSING",
                                       id__in=batch_ids)

    # All variables must be defined for sending!
    if GOOGLE_CLOUD_STORAGE in [None, ""]:
        SEND_TO_GOOGLE = False
    if GOOGLE_PROJECT_NAME in [None, ""]:
        SEND_TO_GOOGLE = False
    if GOOGLE_STORAGE_COLLECTION in [None, ""]:
        SEND_TO_GOOGLE = False

    if SEND_TO_GOOGLE is True:
        from deid.identifiers import get_timestamp

        # I'm not sure we need this
        #if GOOGLE_PROJECT_ID_HEADER is not None:
        #    client.headers["x-goog-project-id"] = GOOGLE_PROJECT_ID_HEADER

        try:
            client = get_client(bucket_name=GOOGLE_CLOUD_STORAGE,
                                project_name=GOOGLE_PROJECT_NAME)

        # Client is unreachable, usually when the network is being stressed
        except Exception:  # OSError and ServiceUnavailable
            bot.error("Cannot connect to client.")
            return

        collection = client.create_collection(uid=GOOGLE_STORAGE_COLLECTION)

        for batch in batches:
            valid = True
            batch_ids = BatchIdentifiers.objects.get(batch=batch)

            # Retrieve only images that aren't in the PHI folder
            images = batch.get_finished()

            # Stop if no images pass filters
            if len(images) == 0:
                change_status(batch, "EMPTY")
                message = "batch %s has no images for processing, stopping upload" % (batch.id)
                batch = add_batch_warning(message, batch)
                batch.save()
                continue

            # IR0001fa6_20160525_IR661B54.tar.gz
            # (coded MRN?)_jittereddate_studycode
            required_fields = ['AccessionNumber', 'PatientID']
            for required_field in required_fields:
                if required_field not in batch_ids.shared:
                    change_status(batch, "ERROR")
                    message = "batch ids %s do not have shared PatientID or AccessionNumber, stopping upload" % (batch.id)
                    batch = add_batch_warning(message, batch)
                    batch.save()
                    valid = False

            if valid is False:
                continue

            studycode = batch_ids.shared['AccessionNumber']
            coded_mrn = batch_ids.shared['PatientID']
            timestamp = get_timestamp(batch_ids.shared['StudyDate'],
                                      format="%Y%m%d")

            compressed_filename = "%s/%s_%s_%s.tar.gz" % (batch.get_path(),
                                                          coded_mrn,
                                                          timestamp,
                                                          studycode)
            compressed_file = generate_compressed_file(files=images,  # mode="w:gz"
                                                       filename=compressed_filename)

            # File will be None if no files were added
            if compressed_file is None:
                change_status(batch, "ERROR")
                message = "batch %s problem compressing file, stopping upload" % (batch.id)
                batch = add_batch_error(message, batch)
                batch.save()
                valid = False
                continue

            # We prepare shared metadata for one item
            batch_ids.shared['IMAGE_COUNT'] = len(images)
            batch.logs['IMAGE_COUNT'] = len(images)
            batch_ids.save()
            batch.save()

            if valid is True:
                items_metadata = batch_ids.shared
                items = {compressed_file: items_metadata}
                cleaned = deepcopy(batch_ids.cleaned)
                metadata = prepare_entity_metadata(cleaned_ids=cleaned)
                bot.log("Uploading %s with %s images to Google Storage %s" % (
                    os.path.basename(compressed_file),
                    len(images),
                    GOOGLE_CLOUD_STORAGE))

                # We only expect to have one entity per batch
                uid = list(metadata.keys())[0]
                kwargs = {"images": [compressed_file],
                          "collection": collection,
                          "uid": uid,
                          "entity_metadata": metadata[uid],
                          "images_metadata": items}

                # Batch metadata
                upload_dataset(client=client, k=kwargs)

                # Clean up compressed file
                if os.path.exists(compressed_file):
                    os.remove(compressed_file)

            # Finish and record time elapsed
            change_status(batch, "DONE")
            batch.qa['FinishTime'] = time.time()
            total_time = batch.qa['FinishTime'] - batch.qa['StartTime']
            bot.info("Total time for %s: %s images is %f min" % (batch.uid,
                                                                 batch.image_set.count(),
                                                                 total_time / 60))
            batch.qa['ElapsedTime'] = total_time
            batch.save()
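
# Usage sketch (illustration only): the task can sweep every batch that has
# finished processing, or be restricted to specific (hypothetical) batch ids.
def _example_upload_storage():
    upload_storage()                  # all batches with status DONEPROCESSING
    upload_storage(batch_ids=[123])   # only batch 123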