Example #1
def prepare_identifiers_request(ids, force=True):
    '''prepare_identifiers_request takes the output of deid's
    get_identifiers and returns the minimal request to send to DASHER.
    The assumption is that one batch of ids maps to one entity / Accession Number.
    '''

    # Enforce application default
    entity_source = entity_options['id_source']['name']
    entity_field = entity_options['id_source']['field']
    item_source = item_options['id_source']['name']
    item_field = item_options['id_source']['field']
    entity_times = entity_options['id_timestamp']
    item_times = item_options['id_timestamp']

    # Entity --> Patient
    entity = {"id_source": entity_source, "items": []}

    # Item --> Study (to represent all images)
    new_item = {"id_source": item_source}
    for item_id, item in ids.items():

        # Entity ID
        if "id" not in entity:
            entity_id = item[entity_field].replace('-', '')
            entity["id"] = entity_id

        # Entity timestamp
        if "id_timestamp" not in entity:
            entity_ts = get_listfirst(item=item,
                                      group=entity_times['date'])  # 20021202
            if entity_ts is not None:
                timestamp = get_timestamp(
                    item_date=entity_ts)  # 2002-12-02T00:00:00Z
                entity['id_timestamp'] = timestamp

        # Study Timestamp
        if "id_timestamp" not in new_item:
            item_ts = get_listfirst(item=item, group=item_times['date'])
            if item_ts is not None:
                timestamp = get_timestamp(item_date=item_ts)
                new_item["id_timestamp"] = timestamp

        # Study ID (accession#)
        if "id" not in new_item:
            new_item["id"] = item[item_field]

    # We are only including one study item to represent all images
    entity["items"].append(new_item)

    # Expected format of dasher
    ids = {"identifiers": [entity]}
    return ids
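
For reference, a request assembled by this function would take roughly the following shape. The id_source names and all values below are illustrative assumptions drawn from the code's structure, not output from a real run.

# Illustrative result of prepare_identifiers_request (values are made up).
# One entity (the patient) carries a single item (the study) that stands
# in for all images in the batch.
example_request = {
    "identifiers": [
        {
            "id_source": "PatientID",          # entity_options['id_source']['name'] (assumed)
            "id": "12345678",                  # dashes stripped from the entity field
            "id_timestamp": "2002-12-02T00:00:00Z",
            "items": [
                {
                    "id_source": "AccessionNumber",   # item_options['id_source']['name'] (assumed)
                    "id": "IR661B54",
                    "id_timestamp": "2002-12-02T00:00:00Z",
                }
            ],
        }
    ]
}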
Example #2
def get_entity_timestamp(dicom, date_field=None):
    '''get_entity_timestamp will return a timestamp from the dicom
    header, using PatientBirthDate by default when no date_field is
    provided.'''
    if date_field is None:
        date_field = "PatientBirthDate"
    item_date = dicom.get(date_field)
    return get_timestamp(item_date=item_date)
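
A minimal usage sketch, assuming the header is read with pydicom and that get_timestamp (from deid) accepts a DICOM-style YYYYMMDD date; the file path is hypothetical.

from pydicom import dcmread

dicom = dcmread("example.dcm")                  # hypothetical path
birth_ts = get_entity_timestamp(dicom)          # uses PatientBirthDate by default
study_ts = get_entity_timestamp(dicom, date_field="StudyDate")
# e.g. a header date '19721029' would yield an ISO timestamp such as
# '1972-10-29T00:00:00Z', depending on how get_timestamp formats it.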
Example #3
def jitter_timestamp(field, value, item):
    '''if present, jitter a timestamp in dicom
    field "field" by number of days specified by "value"
    The value can be positive or negative.
    '''
    value = to_int(value)
    original = item.get(field, None)
    if original is not None:
        jittered = get_timestamp(item_date=original,
                                 jitter_days=value,
                                 format="%Y%m%d")
        bot.debug("JITTER %s + (%s): %s" % (original, value, jittered))
        item[field] = jittered
    return item
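
A short sketch of applying the jitter to a parsed header dict, assuming deid's helpers (to_int, get_timestamp, bot) are importable as in the function above; the item and the 31-day offset are made up, and the exact output format depends on get_timestamp.

# Hypothetical header entry: shift StudyDate forward by 31 days.
item = {"StudyDate": "20160525"}
item = jitter_timestamp(field="StudyDate", value="31", item=item)
# item["StudyDate"] now holds the jittered date returned by get_timestamp,
# e.g. a value representing 2016-06-25. A negative value shifts it back.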
Example #4
def get_item_timestamp(dicom, date_field=None, time_field=None):
    '''get_item_timestamp will return the UTC time for an instance.
    This is derived from the InstanceCreationDate and InstanceCreationTime.
    If the time is not set, only the date is used.
    # testing function https://gist.github.com/vsoch/23d6b313bd231cad855877dc544c98ed
    '''
    if time_field is None:
        time_field = "InstanceCreationTime"
    if date_field is None:
        date_field = "InstanceCreationDate"

    item_time = dicom.get(time_field, "")
    item_date = dicom.get(date_field)

    return get_timestamp(item_date=item_date, item_time=item_time)
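
Analogous to the entity timestamp above, a hedged usage sketch; the path and the alternate tags are illustrative.

from pydicom import dcmread

dicom = dcmread("instance.dcm")                 # hypothetical path
created = get_item_timestamp(dicom)             # InstanceCreationDate / InstanceCreationTime
acquired = get_item_timestamp(dicom,
                              date_field="AcquisitionDate",
                              time_field="AcquisitionTime")
# If the time tag is absent, the empty-string default means only the
# date contributes to the returned timestamp.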
Example #5
def upload_storage(batch_ids=None):
    '''upload_storage will, as a batch, send all batches with DONEPROCESSING
    status to Google Cloud Storage.
    '''
    from sendit.settings import (GOOGLE_CLOUD_STORAGE,
                                 SEND_TO_GOOGLE,
                                 GOOGLE_PROJECT_NAME,
                                 GOOGLE_STORAGE_COLLECTION)

    if batch_ids is None:
        batches = Batch.objects.filter(status="DONEPROCESSING")
    else:
        batches = Batch.objects.filter(status="DONEPROCESSING", id__in=batch_ids)

    # All variables must be defined for sending!
    if GOOGLE_CLOUD_STORAGE in [None,""]:
        SEND_TO_GOOGLE = False

    if GOOGLE_PROJECT_NAME in [None,""]:
        SEND_TO_GOOGLE = False

    if GOOGLE_STORAGE_COLLECTION in [None,""]:
        SEND_TO_GOOGLE = False

    if SEND_TO_GOOGLE is True:
        from deid.identifiers import get_timestamp

        try:
            client = get_client(bucket_name=GOOGLE_CLOUD_STORAGE,
                                project_name=GOOGLE_PROJECT_NAME)

        # Client is unreachable, usually network is being stressed
        # this is why we instantiate in batches to upload 
        except: #OSError and ServiceUnavailable
            bot.error("Cannot connect to client.")
            return

        # Create/get BigQuery dataset, collection should be IRB
        dataset = client.get_or_create_dataset(GOOGLE_STORAGE_COLLECTION)

        # Create a table based on ...
        table = client.get_or_create_table(dataset=dataset,    # All tables named dicom
                                           table_name='dicom',
                                           schema=dicom_schema)
        
        for batch in batches:
            valid = True
            batch.qa['UploadStartTime'] = time.time()
            batch_ids = BatchIdentifiers.objects.get(batch=batch)
            # Retrieve only images that aren't in PHI folder
            images = batch.get_finished()
            # Stop if no images pass filters
            if len(images) == 0:        
                change_status(batch,"EMPTY")
                message = "batch %s has no images for processing, stopping upload" %(bid)
                batch = add_batch_warning(message,batch)
                batch.save()
                continue

            # IR0001fa6_20160525_IR661B54.tar.gz
            # (coded MRN?)_jittereddate_studycode
            required_fields = ['AccessionNumber', 'PatientID']
            for required_field in required_fields:
                if required_field not in batch_ids.shared:
                    change_status(batch,"ERROR")
                    message = "batch ids %s do not have shared PatientID or AccessionNumber, stopping upload" % (batch.id)
                    batch = add_batch_warning(message,batch)
                    batch.save()
                    valid = False

            # Skip this batch entirely if a required field is missing
            if valid is False:
                continue

            # Add additional shared metadata
            studycode = batch_ids.shared['AccessionNumber']
            coded_mrn = batch_ids.shared['PatientID']
            batch_ids.shared['CodedPatientID'] = coded_mrn
            batch_ids.shared['ContentType'] = 'application/gzip'
            batch_ids.shared['CodedAccessionNumberID'] = studycode
            batch_ids.shared['NumberOfSeries'] = batch.qa['NumberOfSeries']
            batch_ids.shared['Series'] = batch.qa['Series']
            batch_ids.shared['RemovedSeries'] = batch.qa['FlaggedSeries']
            timestamp = get_timestamp(batch_ids.shared['StudyDate'],
                                      format = "%Y%m%d")            
            compressed_filename = "%s/%s_%s_%s.tar.gz" %(batch.get_path(),
                                                         coded_mrn,
                                                         timestamp,
                                                         studycode)
            compressed_file = generate_compressed_file(files=images, # mode="w:gz"
                                                       filename=compressed_filename) 
            # File will be None if no files added
            if compressed_file is None:        
                change_status(batch,"ERROR")
                message = "batch %s problem compressing file, stopping upload" %(bid)
                batch = add_batch_error(message,batch)
                batch.save()
                valid = False
                continue

            # We prepare shared metadata for one item
            batch_ids.shared['IMAGE_COUNT'] = len(images)
            batch.logs['IMAGE_COUNT'] = len(images)
            batch_ids.save()
            batch.save()
            if valid is True:
                metadata = deepcopy(batch_ids.shared)
                metadata['DicomHeader'] = json.dumps(metadata)
                metadata = { compressed_file: metadata }
                bot.log("Uploading %s with %s images to Google Storage %s" %(os.path.basename(compressed_file),
                                                                         len(images),
                                                                         GOOGLE_CLOUD_STORAGE))
                # We only expect to have one entity per batch
                kwargs = {"items":[compressed_file],
                          "table":table,
                          "study": SOM_STUDY,
                          "metadata": metadata,
                          "batch": False} # upload in batches at END

                # Batch metadata    
                upload_dataset(client=client, k=kwargs)

                # Clean up compressed file
                if os.path.exists(compressed_file):
                    os.remove(compressed_file)

                # Finish and record time elapsed
                change_status(batch,"DONE")

            batch.qa['UploadFinishTime'] = time.time()
            total_time = batch.qa['UploadFinishTime'] - batch.qa['UploadStartTime']
            bot.info("Total time for %s: %s images is %f min" %(batch.uid,
                                                                batch.image_set.count(),
                                                                total_time/60))
            batch.qa['ElapsedTime'] = total_time
            batch.save()

        # After image upload, metadata can be uploaded on one batch
        # If this isn't optimal, change "batch" in kwargs to False
        return client.batch.runInsert(table)
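
How this gets triggered depends on the surrounding sendit pipeline, but based on the signature alone a caller could invoke it either for specific batches or as a sweep; the ids below are hypothetical.

# Upload two specific batches (primary-key ids are made up).
upload_storage(batch_ids=[42, 43])

# Or sweep everything currently marked DONEPROCESSING.
upload_storage()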
Example #6
def upload_storage(batch_ids=None):
    '''upload_storage will, as a batch, send all batches with DONEPROCESSING
    status to Google Cloud Storage.
    '''
    from sendit.settings import (GOOGLE_CLOUD_STORAGE, SEND_TO_GOOGLE,
                                 GOOGLE_PROJECT_NAME, GOOGLE_PROJECT_ID_HEADER,
                                 GOOGLE_STORAGE_COLLECTION)

    if batch_ids is None:
        batches = Batch.objects.filter(status="DONEPROCESSING")
    else:
        batches = Batch.objects.filter(status="DONEPROCESSING",
                                       id__in=batch_ids)

    # All variables must be defined for sending!
    if GOOGLE_CLOUD_STORAGE in [None, ""]:
        SEND_TO_GOOGLE = False

    if GOOGLE_PROJECT_NAME in [None, ""]:
        SEND_TO_GOOGLE = False

    if GOOGLE_STORAGE_COLLECTION in [None, ""]:
        SEND_TO_GOOGLE = False

    if SEND_TO_GOOGLE is True:
        from deid.identifiers import get_timestamp

        # I'm not sure we need this
        #if GOOGLE_PROJECT_ID_HEADER is not None:
        #    client.headers["x-goog-project-id"] = GOOGLE_PROJECT_ID_HEADER
        try:
            client = get_client(bucket_name=GOOGLE_CLOUD_STORAGE,
                                project_name=GOOGLE_PROJECT_NAME)
        # Client is unreachable, usually network is being stressed

        except:  #OSError and ServiceUnavailable
            bot.error("Cannot connect to client.")
            return

        collection = client.create_collection(uid=GOOGLE_STORAGE_COLLECTION)
        for batch in batches:
            valid = True
            batch_ids = BatchIdentifiers.objects.get(batch=batch)

            # Retrieve only images that aren't in PHI folder
            images = batch.get_finished()

            # Stop if no images pass filters
            if len(images) == 0:
                change_status(batch, "EMPTY")
                message = "batch %s has no images for processing, stopping upload" % (
                    batch.id)
                batch = add_batch_warning(message, batch)
                batch.save()
                continue

            # IR0001fa6_20160525_IR661B54.tar.gz
            # (coded MRN?)_jittereddate_studycode
            required_fields = ['AccessionNumber', 'PatientID']
            for required_field in required_fields:
                if required_field not in batch_ids.shared:
                    change_status(batch, "ERROR")
                    message = "batch ids %s do not have shared PatientID or AccessionNumber, stopping upload" % (
                        batch.id)
                    batch = add_batch_warning(message, batch)
                    batch.save()
                    valid = False

            # Skip this batch entirely if a required field is missing
            if valid is False:
                continue

            studycode = batch_ids.shared['AccessionNumber']
            coded_mrn = batch_ids.shared['PatientID']
            timestamp = get_timestamp(batch_ids.shared['StudyDate'],
                                      format="%Y%m%d")

            compressed_filename = "%s/%s_%s_%s.tar.gz" % (
                batch.get_path(), coded_mrn, timestamp, studycode)
            compressed_file = generate_compressed_file(
                files=images,  # mode="w:gz"
                filename=compressed_filename)

            # File will be None if no files added
            if compressed_file is None:
                change_status(batch, "ERROR")
                message = "batch %s problem compressing file, stopping upload" % (
                    bid)
                batch = add_batch_error(message, batch)
                batch.save()
                valid = False
                continue

            # We prepare shared metadata for one item
            batch_ids.shared['IMAGE_COUNT'] = len(images)
            batch.logs['IMAGE_COUNT'] = len(images)
            batch_ids.save()
            batch.save()
            if valid is True:
                items_metadata = batch_ids.shared
                items = {compressed_file: items_metadata}
                cleaned = deepcopy(batch_ids.cleaned)
                metadata = prepare_entity_metadata(cleaned_ids=cleaned)
                bot.log("Uploading %s with %s images to Google Storage %s" %
                        (os.path.basename(compressed_file), len(images),
                         GOOGLE_CLOUD_STORAGE))
                # We only expect to have one entity per batch
                uid = list(metadata.keys())[0]
                kwargs = {
                    "images": [compressed_file],
                    "collection": collection,
                    "uid": uid,
                    "entity_metadata": metadata[uid],
                    "images_metadata": items
                }

                # Batch metadata
                upload_dataset(client=client, k=kwargs)

                # Clean up compressed file
                if os.path.exists(compressed_file):
                    os.remove(compressed_file)

                # Finish and record time elapsed
                change_status(batch, "DONE")

            batch.qa['FinishTime'] = time.time()
            total_time = batch.qa['FinishTime'] - batch.qa['StartTime']
            bot.info("Total time for %s: %s images is %f min" %
                     (batch.uid, batch.image_set.count(), total_time / 60))
            batch.qa['ElapsedTime'] = total_time
            batch.save()
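
For comparison with Example #5, which appears to write one row per batch into a BigQuery-style 'dicom' table, this variant builds a per-entity payload for upload_dataset. The shape below is reconstructed from the code; every identifier and count is illustrative, not real data.

# Illustrative shape of the kwargs handed to upload_dataset in this variant
# (all identifiers, paths, and counts are made up).
kwargs = {
    "images": ["/data/batch-1/IR0001fa6_20160525_IR661B54.tar.gz"],
    "collection": "<object returned by client.create_collection(...)>",
    "uid": "IR0001fa6",                         # single entity uid from prepare_entity_metadata
    "entity_metadata": {"PatientSex": "F"},     # hypothetical cleaned entity fields
    "images_metadata": {
        "/data/batch-1/IR0001fa6_20160525_IR661B54.tar.gz": {
            "AccessionNumber": "IR661B54",
            "PatientID": "IR0001fa6",
            "IMAGE_COUNT": 24,
        }
    },
}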