Example 1
def batch_upload(upload: Tuple[dict, str, bytes, str]):
    """ Used for mapping an s3_upload function.  the tuple is unpacked, can only have one parameter. """
    ret = {'exception': None, 'traceback': None}
    try:
        if len(upload) != 4:
            # upload should have length 4; this is for debugging if it doesn't
            print(upload)
        chunk, chunk_path, new_contents, study_object_id = upload
        del upload

        if "b'" in chunk_path:
            raise Exception(chunk_path)

        s3_upload(chunk_path,
                  codecs.decode(new_contents, "zip"),
                  study_object_id,
                  raw_path=True)
        print("data uploaded!", chunk_path)

        if isinstance(chunk, ChunkRegistry):
            # If the contents are being appended to an existing ChunkRegistry object
            chunk.file_size = len(new_contents)
            chunk.update_chunk_hash(new_contents)

        else:
            # If a new ChunkRegistry object is being created
            # Convert the IDs used in the S3 file names into primary keys for making ChunkRegistry FKs
            participant_pk, study_pk = Participant.objects.filter(
                patient_id=chunk['user_id']).values_list('pk',
                                                         'study_id').get()
            if chunk['survey_id']:
                survey_pk = Survey.objects.filter(
                    object_id=chunk['survey_id']).values_list('pk',
                                                              flat=True).get()
            else:
                survey_pk = None

            ChunkRegistry.register_chunked_data(
                chunk['data_type'],
                chunk['time_bin'],
                chunk['chunk_path'],
                new_contents,  # unlikely to be huge
                study_pk,
                participant_pk,
                survey_pk,
            )

    # it broke. print stacktrace for debugging
    except Exception as e:
        traceback.print_exc()
        ret['traceback'] = sys.exc_info()
        ret['exception'] = e

    return ret
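
How this mapping-style signature is driven is not shown above; the following is a minimal sketch under the assumption that the caller prepares a list of (chunk, chunk_path, new_contents, study_object_id) tuples and maps them over a thread pool, as the later examples do. The upload_all/pending_uploads names and the pool size are hypothetical.

from multiprocessing.pool import ThreadPool

def upload_all(pending_uploads):
    # pending_uploads: iterable of (chunk, chunk_path, new_contents, study_object_id) tuples.
    # pool.map passes exactly one argument per call, which is why batch_upload unpacks a tuple.
    pool = ThreadPool(4)  # hypothetical pool size
    try:
        results = pool.map(batch_upload, pending_uploads, chunksize=1)
    finally:
        pool.close()
        pool.terminate()
    # surface any errors captured inside the worker threads
    return [r for r in results if r['exception'] is not None]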
Example 2
def handle_database_query(study_id, query, registry=None):
    """
    Runs the database query and returns a QuerySet.
    """
    chunk_fields = ["pk", "participant_id", "data_type", "chunk_path", "time_bin", "chunk_hash",
                    "participant__patient_id", "study_id", "survey_id", "survey__object_id"]

    chunks = ChunkRegistry.get_chunks_time_range(study_id, **query)

    if not registry:
        return chunks.values(*chunk_fields)

    # If there is a registry, we need to filter the chunks
    else:
        # Get all chunks whose path and hash are both in the registry
        possible_registered_chunks = (
            chunks
                .filter(chunk_path__in=registry, chunk_hash__in=registry.values())
                .values('pk', 'chunk_path', 'chunk_hash')
        )

        # determine the chunks that should not be present in the download
        # (pks whose registry hash matches the hash stored in the database, i.e. files the caller already has)
        registered_chunk_pks = [
            c['pk'] for c in possible_registered_chunks if registry[c['chunk_path']] == c['chunk_hash']
        ]

        # add the exclude and return the queryset
        unregistered_chunks = chunks.exclude(pk__in=registered_chunk_pks)
        return unregistered_chunks.values(*chunk_fields)
def handle_database_query(study_id: int,
                          query_dict: dict,
                          registry_dict: dict = None) -> QuerySet:
    """ Runs the database query and returns a QuerySet. """
    chunks = ChunkRegistry.get_chunks_time_range(study_id, **query_dict)

    if not registry_dict:
        return chunks.values(*chunk_fields)

    # If there is a registry, we need to filter the chunks
    else:
        # Get all chunks whose path and hash are both in the registry
        possible_registered_chunks = (chunks.filter(
            chunk_path__in=registry_dict,
            chunk_hash__in=registry_dict.values()).values(
                'pk', 'chunk_path', 'chunk_hash'))

        # determine the chunks that should not be present in the download
        # (pks whose registry hash matches the hash stored in the database, i.e. files the caller already has)
        registered_chunk_pks = [
            c['pk'] for c in possible_registered_chunks
            if registry_dict[c['chunk_path']] == c['chunk_hash']
        ]

        # add the exclude and return the queryset
        return chunks.exclude(pk__in=registered_chunk_pks).values(
            *chunk_fields)
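
A rough usage sketch of the registry path (the study id, the time-window keys in the query dict, and the path/hash values below are hypothetical; the registry maps a chunk path the caller already holds to the hash it holds for it):

# Hypothetical call: return only the chunks the caller does not already have an identical copy of.
already_downloaded = {
    "CHUNKED_DATA/someStudy/somePatient/gps/1570000000000.csv": "someChunkHash",
}
query = {"start": "2019-10-01T00:00:00", "end": "2019-10-02T00:00:00"}  # assumed shape; passed through to get_chunks_time_range
fresh_chunks = handle_database_query(study_id=1, query_dict=query, registry_dict=already_downloaded)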
def batch_upload(upload: Tuple[Union[ChunkRegistry, dict], str, bytes, str]):
    """ Used for mapping an s3_upload function.  the tuple is unpacked, can only have one parameter. """

    ret = {'exception': None, 'traceback': None}
    with make_error_sentry(sentry_type=SentryTypes.data_processing):
        try:
            chunk, chunk_path, new_contents, study_object_id = upload
            del upload
            new_contents = decompress(new_contents)

            if "b'" in chunk_path:
                raise Exception(chunk_path)

            # for use with test script to avoid network uploads
            # with open("processing_tests/" + GLOBAL_TIMESTAMP, 'ba') as f:
            #     f.write(b"\n\n")
            #     f.write(new_contents)
            #     return ret

            s3_upload(chunk_path, new_contents, study_object_id, raw_path=True)

            # if the chunk object is a chunk registry then we are updating an old one,
            # otherwise we are creating a new one.
            if isinstance(chunk, ChunkRegistry):
                # If the contents are being appended to an existing ChunkRegistry object
                chunk.file_size = len(new_contents)
                chunk.update_chunk(new_contents)
            else:
                ChunkRegistry.register_chunked_data(**chunk,
                                                    file_contents=new_contents)

        # it broke. print stacktrace for debugging
        except Exception as e:
            traceback.print_exc()
            ret['traceback'] = sys.exc_info()
            ret['exception'] = e

            # using an error sentry we can easily report a real error with a real stack trace! :D
            raise

    return ret
Example 5
def batch_upload(upload):
    """ Used for mapping an s3_upload function. """
    ret = {'exception': None, 'traceback': None}
    try:
        if len(upload) != 4:
            # upload should have length 4; this is for debugging if it doesn't
            print(upload)
        chunk, chunk_path, new_contents, study_object_id = upload
        del upload
        new_contents = new_contents.decode("zip")
        s3_upload(chunk_path, new_contents, study_object_id, raw_path=True)
        print("data uploaded!", chunk_path)
        if isinstance(chunk, ChunkRegistry):
            # If the contents are being appended to an existing ChunkRegistry object
            chunk.low_memory_update_chunk_hash(new_contents)
        else:
            # If a new ChunkRegistry object is being created
            # Convert the IDs used in the S3 file names into primary keys for making ChunkRegistry FKs
            participant_pk, study_pk = Participant.objects.filter(
                patient_id=chunk['user_id']).values_list('pk',
                                                         'study_id').get()
            if chunk['survey_id']:
                survey_pk = Survey.objects.filter(
                    object_id=chunk['survey_id']).values_list('pk',
                                                              flat=True).get()
            else:
                survey_pk = None
            ChunkRegistry.register_chunked_data(
                chunk['data_type'],
                chunk['time_bin'],
                chunk['chunk_path'],
                new_contents,  # unlikely to be huge
                study_pk,
                participant_pk,
                survey_pk,
            )
    except Exception as e:
        ret['traceback'] = format_exc()
        ret['exception'] = e
    return ret
Example 6
def process_one_file(
        file_for_processing: FileForProcessing, survey_id_dict: dict, all_binified_data: DefaultDict,
        ftps_to_remove: set
):
    """ This function is the inner loop of the chunking process. """

    if file_for_processing.exception:
        file_for_processing.raise_data_processing_error()

    # there are two cases: chunkable data that can be stuck into "time bins" for each hour, and
    # files that do not need to be "binified" and pretty much just go into the ChunkRegistry unmodified.
    if file_for_processing.chunkable:
        newly_binified_data, survey_id_hash = process_csv_data(file_for_processing)

        # survey answers store the survey id in the file name (truly ancient design decision).
        if file_for_processing.data_type in SURVEY_DATA_FILES:
            survey_id_dict[survey_id_hash] = resolve_survey_id_from_file_name(
                file_for_processing.file_to_process.s3_file_path)

        if newly_binified_data:
            append_binified_csvs(
                all_binified_data, newly_binified_data, file_for_processing.file_to_process
            )
        else:  # delete empty files from FilesToProcess
            ftps_to_remove.add(file_for_processing.file_to_process.id)
        return

    else:
        # case: unchunkable data file
        timestamp = clean_java_timecode(
            file_for_processing.file_to_process.s3_file_path.rsplit("/", 1)[-1][:-4]
        )
        # Since we aren't binning the data by hour, just create a ChunkRegistry that
        # points to the already existing S3 file.
        try:
            ChunkRegistry.register_unchunked_data(
                file_for_processing.data_type,
                timestamp,
                file_for_processing.file_to_process.s3_file_path,
                file_for_processing.file_to_process.study.pk,
                file_for_processing.file_to_process.participant.pk,
                file_for_processing.file_contents,
            )
            ftps_to_remove.add(file_for_processing.file_to_process.id)
        except ValidationError as ve:
            if len(ve.messages) != 1:
                # case: the error case (below) is very specific, we only want that singular error.
                raise

            # case: an unchunkable file was re-uploaded, causing a duplicate file path collision
            # we detect this specific case and update the registry with the new file size
            # (hopefully it doesn't actually change)
            if 'Chunk registry with this Chunk path already exists.' in ve.messages:
                ChunkRegistry.update_registered_unchunked_data(
                    file_for_processing.data_type,
                    file_for_processing.file_to_process.s3_file_path,
                    file_for_processing.file_contents,
                )
                ftps_to_remove.add(file_for_processing.file_to_process.id)
            else:
                # any other error: re-raise it
                raise
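
For context, a minimal sketch of the outer loop this inner function is written for; it mirrors the accumulator setup in the fuller examples below, and the files_for_processing iterable of FileForProcessing objects is assumed to come from the retrieval step.

from collections import defaultdict, deque

def process_files(files_for_processing):
    # accumulators shared by every call to process_one_file
    all_binified_data = defaultdict(lambda: (deque(), deque()))
    ftps_to_remove = set()
    survey_id_dict = {}

    for file_for_processing in files_for_processing:
        process_one_file(file_for_processing, survey_id_dict, all_binified_data, ftps_to_remove)

    # upload the binified data, then drop the processed FileToProcess rows
    more_ftps_to_remove, number_bad_files = upload_binified_data(
        all_binified_data, ErrorHandler(), survey_id_dict)
    ftps_to_remove.update(more_ftps_to_remove)
    FileToProcess.objects.filter(pk__in=ftps_to_remove).delete()
    return number_bad_files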
Example 7
def do_process_user_file_chunks(count: int, error_handler: ErrorHandler,
                                skip_count: int, participant: Participant):
    """
    Run through the files to process, pull their data, put it into s3 bins. Run the file through
    the appropriate logic path based on file type.

    If a file is empty, put its ftp object into the ftps_to_remove set; we can't delete objects
    in-place while iterating over the db.

    All files except for the audio recording files are in the form of CSVs. Most of those files
    can be separated by "time bin" (separated into one-hour chunks) and concatenated and sorted
    trivially. A few files (call log, identifier file, and wifi log) require some triage
    beforehand.  The debug log cannot be correctly sorted by time for all elements, because it
    was not actually expected to be used by researchers, but is apparently quite useful.

    Any errors are themselves concatenated using the passed in error handler.

    In a single call to this function, count files will be processed, starting from file number
    skip_count. The first skip_count files are expected to be files that have previously errored
    in file processing.
    """
    # Declare a defaultdict containing a tuple of two double ended queues (deque, pronounced "deck")
    all_binified_data = defaultdict(lambda: (deque(), deque()))
    ftps_to_remove = set()
    # The ThreadPool enables downloading multiple files simultaneously from the network, and continuing
    # to download files as other files are being processed, making the code as a whole run faster.
    pool = ThreadPool(CONCURRENT_NETWORK_OPS)
    survey_id_dict = {}

    # A Django query with a slice (e.g. .all()[x:y]) makes a LIMIT query, so it
    # only gets from the database those FTPs that are in the slice.
    print(participant.as_native_python())
    print(len(participant.files_to_process.exclude(deleted=True).all()))
    print(count)
    print(skip_count)

    files_to_process = participant.files_to_process.exclude(deleted=True).all()

    for data in pool.map(batch_retrieve_for_processing,
                         files_to_process[skip_count:count + skip_count],
                         chunksize=1):
        with error_handler:
            # If we encountered any errors in retrieving the files for processing, they have been
            # lumped together into data['exception']. Raise them here to the error handler and
            # move to the next file.
            if data['exception']:
                print("\n" + data['ftp']['s3_file_path'])
                print(data['traceback'])
                ################################################################
                # YOU ARE SEEING THIS EXCEPTION WITHOUT A STACK TRACE
                # BECAUSE IT OCCURRED INSIDE POOL.MAP ON ANOTHER THREAD
                ################################################################
                raise data['exception']

            if data['chunkable']:
                newly_binified_data, survey_id_hash = process_csv_data(data)
                if data['data_type'] in SURVEY_DATA_FILES:
                    survey_id_dict[
                        survey_id_hash] = resolve_survey_id_from_file_name(
                            data['ftp']["s3_file_path"])

                if newly_binified_data:
                    append_binified_csvs(all_binified_data,
                                         newly_binified_data, data['ftp'])
                else:  # delete empty files from FilesToProcess
                    ftps_to_remove.add(data['ftp']['id'])
                continue

            # if not data['chunkable']
            else:
                timestamp = clean_java_timecode(
                    data['ftp']["s3_file_path"].rsplit("/", 1)[-1][:-4])
                # Since we aren't binning the data by hour, just create a ChunkRegistry that
                # points to the already existing S3 file.
                ChunkRegistry.register_unchunked_data(
                    data['data_type'],
                    timestamp,
                    data['ftp']['s3_file_path'],
                    data['ftp']['study'].pk,
                    data['ftp']['participant'].pk,
                    data['file_contents'],
                )
                ftps_to_remove.add(data['ftp']['id'])

    pool.close()
    pool.terminate()
    more_ftps_to_remove, number_bad_files = upload_binified_data(
        all_binified_data, error_handler, survey_id_dict)
    ftps_to_remove.update(more_ftps_to_remove)
    # Actually delete the processed FTPs from the database
    FileToProcess.objects.filter(pk__in=ftps_to_remove).delete()
    # Garbage collect to free up memory
    gc.collect()
    return number_bad_files
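
The comment about slices above describes standard Django behaviour: slicing an unevaluated QuerySet adds LIMIT/OFFSET to the generated SQL rather than fetching everything. A small sketch of how one might confirm it, using the same queryset as above (the slice bounds are arbitrary):

qs = participant.files_to_process.exclude(deleted=True)
page = qs[10:30]         # not evaluated yet; the bounds are just recorded
print(str(page.query))   # the SQL ends with something like "... LIMIT 20 OFFSET 10"
rows = list(page)        # evaluation happens here and fetches only those 20 rows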
def chunk_file_lambda_handler(event, context):

    error_handler = ErrorHandler()

    # Declare a defaultdict containing a tuple of two double ended queues (deque, pronounced "deck")
    all_binified_data = defaultdict(lambda: (deque(), deque()))
    ftps_to_remove = set()
    # The ThreadPool enables downloading multiple files simultaneously from the network, and continuing
    # to download files as other files are being processed, making the code as a whole run faster.
    survey_id_dict = {}

    for record in event['Records']:
        full_s3_path = record['s3']['object']['key']
        key_values = full_s3_path.split('/')

        # there was a bug that would incorrectly prepend the s3_path with the study_object_id;
        # let's try to undo that here
        if 'RAW_DATA' not in key_values[0]:

            key_values = key_values[key_values.index('RAW_DATA'):]

            #logger.error('S3 path {0} does not appear to be in RAW_DATA'.format(full_s3_path))
            #return {
            #'statusCode': 200,
            #'body': json.dumps('False positive!')
            #}

        study_object_id = key_values[1]
        participant_id = key_values[2]

        try:
            participant = Participant.objects.get(patient_id=participant_id)
        except Participant.DoesNotExist as e:
            logger.error(
                'Could not find participant {0} for file to process {1}'.
                format(participant_id, full_s3_path))
            return {'statusCode': 200, 'body': json.dumps('Lambda failed!')}

        # a file with no extension means we may need to create RSA keys for this participant
        # let's investigate!
        _, file_extension = os.path.splitext(full_s3_path)
        if not file_extension:

            # first check to see if a key pair already exists
            key_paths = construct_s3_key_paths(study_object_id, participant_id)
            logger.info(
                'Look to see if keys already exist at: {}'.format(key_paths))

            if s3_exists(key_paths['private'], study_object_id, raw_path=True) and \
               s3_exists(key_paths['public'], study_object_id, raw_path=True):

                logger.error('Key pair already exists for {0}: {1}'.format(
                    study_object_id, participant_id))

                return {
                    'statusCode':
                    200,
                    'body':
                    json.dumps('Key pair already exists for {0}: {1}'.format(
                        study_object_id, participant_id))
                }

            else:
                logger.info('Generating key pair for {0}: {1}'.format(
                    study_object_id, participant_id))
                create_client_key_pair(participant_id, study_object_id)

                return {
                    'statusCode':
                    200,
                    'body':
                    json.dumps('Created key pair for {0}: {1}'.format(
                        study_object_id, participant_id))
                }

        else:

            try:
                file_to_process = participant.files_to_process.get(
                    s3_file_path=full_s3_path)
            except FileToProcess.MultipleObjectsReturned as e:
                # sometimes there are multiple entries in files_to_process with the same s3 path;
                # I am not sure why this happens, but I think it is OK to just
                # take the first and delete the others
                the_first_file = True
                for fp in participant.files_to_process.filter(
                        s3_file_path=full_s3_path):
                    if the_first_file:
                        file_to_process = fp
                        the_first_file = False
                    else:
                        ftps_to_remove.add(fp.id)
            except FileToProcess.DoesNotExist as e:
                logger.error(
                    'Could not find file to process {0} for participant {1}'.
                    format(full_s3_path, participant_id))
                return {
                    'statusCode': 200,
                    'body': json.dumps('Lambda failed!')
                }

            # some paths were corrupted by prepending the study object id to the correct path; strip that prefix here
            if not file_to_process.s3_file_path.startswith('RAW_DATA'):
                path_vals = file_to_process.s3_file_path.split('/')
                file_to_process.s3_file_path = '/'.join(
                    path_vals[path_vals.index('RAW_DATA'):])
                file_to_process.save()

            data = batch_retrieve_for_processing(file_to_process)

            #with error_handler:
            # If we encountered any errors in retrieving the files for processing, they have been
            # lumped together into data['exception']. Raise them here to the error handler and
            # move to the next file.
            if data['exception']:
                logger.error("\n" + data['ftp']['s3_file_path'])
                logger.error(data['traceback'])
                ################################################################
                # YOU ARE SEEING THIS EXCEPTION WITHOUT A STACK TRACE
                # BECAUSE IT OCCURRED INSIDE POOL.MAP, ON ANOTHER THREAD
                ################################################################
                raise data['exception']

            if data['chunkable']:
                # print "1a"
                newly_binified_data, survey_id_hash = process_csv_data(data)
                # print data, "\n1b"
                if data['data_type'] in SURVEY_DATA_FILES:
                    # print survey_id_hash
                    survey_id_dict[
                        survey_id_hash] = resolve_survey_id_from_file_name(
                            data['ftp']["s3_file_path"])

                if newly_binified_data:
                    # print "1c"
                    append_binified_csvs(all_binified_data,
                                         newly_binified_data, data['ftp'])
                else:  # delete empty files from FilesToProcess
                    # print "1d"
                    ftps_to_remove.add(data['ftp']['id'])

            else:  # if not data['chunkable']
                # print "2a"
                timestamp = clean_java_timecode(
                    data['ftp']["s3_file_path"].rsplit("/", 1)[-1][:-4])

                chunked_file_path = construct_s3_chunk_path_from_raw_data_path(
                    data['ftp']['s3_file_path'])

                try:
                    s3_move(data['ftp']['s3_file_path'],
                            chunked_file_path,
                            data['ftp']['study'].object_id,
                            raw_path=True)

                    # print "2a"
                    # Since we aren't binning the data by hour, just create a ChunkRegistry that
                    # points to the already existing S3 file.
                    ChunkRegistry.register_unchunked_data(
                        data['data_type'],
                        timestamp,
                        chunked_file_path,
                        data['ftp']['study'].pk,
                        data['ftp']['participant'].pk,
                    )
                    # print "2b"
                except Exception:
                    print(
                        "Could not find s3 file {0}, removing it as a file to process"
                        .format(data['ftp']['s3_file_path']))

                ftps_to_remove.add(data['ftp']['id'])

    #print(newly_binified_data)
    # print 3
    more_ftps_to_remove, number_bad_files = upload_binified_data(
        all_binified_data, error_handler, survey_id_dict)
    # print "X"
    ftps_to_remove.update(more_ftps_to_remove)
    # Actually delete the processed FTPs from the database
    FileToProcess.objects.filter(pk__in=ftps_to_remove).delete()
    # print "Y"

    # delete the file that triggered the lambda
    #s3_delete(full_s3_path,'',raw_path=True)

    # Garbage collect to free up memory
    gc.collect()
    # print "Z"
    return {'statusCode': 200, 'body': json.dumps('Hello from Lambda!')}
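
For local testing, a minimal sketch of the S3 event shape the handler reads (only Records[].s3.object.key is accessed above; the key, study object id, and patient id below are hypothetical):

fake_event = {
    'Records': [
        {'s3': {'object': {'key': 'RAW_DATA/someStudyObjectId/somePatientId/gps/1570000000000.csv'}}}
    ]
}
response = chunk_file_lambda_handler(fake_event, context=None)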
Example 9
pipeline_region = get_current_region()
ssm_client = get_boto_client('ssm', pipeline_region)
error_sentry = make_error_sentry("data",
                                 tags={"pipeline_frequency": "manually"})
batch_client = get_boto_client('batch', pipeline_region)
yesterday = timezone.now() - timedelta(days=1)

refresh_data_access_credentials('manually',
                                ssm_client=ssm_client,
                                webserver=False)

################################################################################################
# if you are running this on an ubuntu machine you have to sudo apt-get -y install cloud-utils #
################################################################################################

for study in Study.objects.all():
    with error_sentry:
        # we only want to run the pipeline for data that has been uploaded; report all errors to Sentry
        for patient_id in ChunkRegistry.get_updated_users_for_study(
                study, yesterday):
            create_one_job('manually',
                           study,
                           patient_id,
                           batch_client,
                           webserver=False)
            print("creating job for", study.name)

# raise errors
if error_sentry.errors:
    error_sentry.raise_errors()
Example 10
def do_process_user_file_chunks(count: int, error_handler: ErrorHandler, skip_count: int,
                                participant: Participant):
    """
    Run through the files to process, pull their data, put it into s3 bins. Run the file through
    the appropriate logic path based on file type.

    If a file is empty, put its ftp object into the ftps_to_remove set; we can't delete objects
    in-place while iterating over the db.

    All files except for the audio recording files are in the form of CSVs. Most of those files
    can be separated by "time bin" (separated into one-hour chunks) and concatenated and sorted
    trivially. A few files (call log, identifier file, and wifi log) require some triage
    beforehand.  The debug log cannot be correctly sorted by time for all elements, because it
    was not actually expected to be used by researchers, but is apparently quite useful.

    Any errors are themselves concatenated using the passed in error handler.

    In a single call to this function, count files will be processed, starting from file number
    skip_count. The first skip_count files are expected to be files that have previously errored
    in file processing.
    """
    # Declare a defaultdict containing a tuple of two double ended queues (deque, pronounced "deck")
    all_binified_data = defaultdict(lambda: (deque(), deque()))
    ftps_to_remove = set()
    # The ThreadPool enables downloading multiple files simultaneously from the network, and continuing
    # to download files as other files are being processed, making the code as a whole run faster.
    pool = ThreadPool(CONCURRENT_NETWORK_OPS)
    survey_id_dict = {}

    # A Django query with a slice (e.g. .all()[x:y]) makes a LIMIT query, so it
    # only gets from the database those FTPs that are in the slice.
    # print(participant.as_unpacked_native_python())
    print(len(participant.files_to_process.exclude(deleted=True).all()))
    print(count)
    print(skip_count)

    files_to_process = participant.files_to_process.exclude(deleted=True).all()

    for data in pool.map(batch_retrieve_for_processing,
                         files_to_process[skip_count:count+skip_count],
                         chunksize=1):
        with error_handler:
            if data['exception']:
                raise_data_processing_error(data)

            if data['chunkable']:
                # case: chunkable data files
                newly_binified_data, survey_id_hash = process_csv_data(data)
                if data['data_type'] in SURVEY_DATA_FILES:
                    survey_id_dict[survey_id_hash] = resolve_survey_id_from_file_name(data['ftp']["s3_file_path"])

                if newly_binified_data:
                    append_binified_csvs(all_binified_data, newly_binified_data, data['ftp'])
                else:  # delete empty files from FilesToProcess
                    ftps_to_remove.add(data['ftp']['id'])
                continue
            else:
                # case: unchunkable data file
                timestamp = clean_java_timecode(data['ftp']["s3_file_path"].rsplit("/", 1)[-1][:-4])
                # Since we aren't binning the data by hour, just create a ChunkRegistry that
                # points to the already existing S3 file.
                try:
                    ChunkRegistry.register_unchunked_data(
                        data['data_type'],
                        timestamp,
                        data['ftp']['s3_file_path'],
                        data['ftp']['study'].pk,
                        data['ftp']['participant'].pk,
                        data['file_contents'],
                    )
                    ftps_to_remove.add(data['ftp']['id'])
                except ValidationError as ve:
                    if len(ve.messages) != 1:
                        # case: the error case (below) is very specific, we only want that singular error.
                        raise

                    # case: an unchunkable file was re-uploaded, causing a duplicate file path collision
                    # we detect this specific case and update the registry with the new file size
                    # (hopefully it doesn't actually change)
                    if 'Chunk registry with this Chunk path already exists.' in ve.messages:
                        ChunkRegistry.update_registered_unchunked_data(
                            data['data_type'],
                            data['ftp']['s3_file_path'],
                            data['file_contents'],
                        )
                        ftps_to_remove.add(data['ftp']['id'])
                    else:
                        # any other error: re-raise it
                        raise

    pool.close()
    pool.terminate()
    more_ftps_to_remove, number_bad_files = upload_binified_data(all_binified_data, error_handler, survey_id_dict)
    ftps_to_remove.update(more_ftps_to_remove)
    # Actually delete the processed FTPs from the database
    FileToProcess.objects.filter(pk__in=ftps_to_remove).delete()
    # Garbage collect to free up memory
    gc.collect()
    return number_bad_files
Example 11
def migrate_chunk_registries():
    # Calculate the number of chunks that will be used to go through all of MChunkSet()
    
    d_chunk_list = []
    num_registries_handled = 0
    num_bulk_creates = 0
    for i, m_chunk in enumerate(MChunkSet.iterator()):
        if i % 10000 == 0:
            print i, "..."
        with error_handler:
            try:
                d_study_info = study_id_dict[m_chunk.study_id]
            except KeyError:
                msg = 'Study {} referenced in chunk but does not exist, creating it.'.format(m_chunk['study_id'])
                print(msg)
                create_dummy_study(m_chunk.study_id)
                # raise NoSuchDatabaseObject(msg)
            try:
                d_user_info = user_id_dict[m_chunk.user_id]
            except KeyError:
                msg = 'User {} referenced in chunk but does not exist.'.format(m_chunk['user_id'])
                print(msg)
                continue
                # raise NoSuchDatabaseObject(msg)
            
            # some chunks have survey_ids that are string representations of objectids, fix.
            # (and sometimes this can be an empty string, handle that too.)
            if m_chunk.survey_id and isinstance(m_chunk.survey_id, (str, unicode)):
                m_chunk.survey_id = ObjectId(m_chunk.survey_id)
            
            if not m_chunk.survey_id:
                d_survey_pk = None
            elif m_chunk.survey_id in survey_id_dict:
                d_survey_pk = survey_id_dict[m_chunk.survey_id]['pk']
            else:
                print('Survey {} referenced in chunk but does not exist, creating it.'.format(m_chunk.survey_id))
                new_survey = create_dummy_survey(m_chunk.survey_id, m_chunk.study_id)
                d_survey_pk = new_survey.pk

            d_chunk = DChunk(
                is_chunkable=m_chunk.is_chunkable,
                chunk_path=m_chunk.chunk_path,
                chunk_hash=m_chunk.chunk_hash or '',
                data_type=m_chunk.data_type,
                time_bin=m_chunk.time_bin,
                study_id=d_study_info['pk'],
                participant_id=d_user_info['pk'],
                survey_id=d_survey_pk,
                deleted=d_study_info['deleted'],
            )
            
            # d_chunk.full_clean()  # Don't bother full cleaning, it is slow and unnecessary here.
            d_chunk_list.append(d_chunk)
            
            num_registries_handled += 1
            if num_registries_handled % CHUNK_SIZE == 0:
                # Bulk create every CHUNK_SIZE objects, and print progress to stdout every 10 bulk creates
                num_bulk_creates += 1
                if num_bulk_creates % 10 == 0:
                    print(num_bulk_creates * CHUNK_SIZE)
                    
                # there are a lot of unique chunk path issues
                try:
                    DChunk.objects.bulk_create(d_chunk_list)
                except IntegrityError as e:
                    # This can't happen, because chunk_path has unique=False at the time of the
                    # migration, and only has unique=True set later in a separate Django migration.
                    if "UNIQUE" in e.message:
                        for d_chunk in d_chunk_list:
                            if DChunk.objects.filter(chunk_path=d_chunk.chunk_path).exists():
                                try:
                                    print("duplicate path:",)
                                    duplicate_chunk_path_severity(d_chunk.chunk_path)
                                    print("...nevermind.")
                                except Exception as e2:
                                    print(d_chunk.chunk_path)
                                    print(e2.message)
                                    # raise e2
                            else:
                                d_chunk.save()
                    else:
                        raise e
                finally:
                    d_chunk_list = []
    
    DChunk.objects.bulk_create(d_chunk_list)
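
Stripped of the Mongo-specific details, the pattern above is chunked bulk_create with a per-row fallback when a uniqueness collision shows up. A minimal generic sketch (the bulk_create_in_batches helper, its model argument, and the batch size are stand-ins, not part of the migration):

from django.db import IntegrityError

BATCH_SIZE = 10000  # stand-in for CHUNK_SIZE

def bulk_create_in_batches(model, rows):
    """ Create unsaved model instances in batches, falling back to one-by-one
    saves for any batch that raises a uniqueness (IntegrityError) collision. """
    batch = []
    for row in rows:
        batch.append(row)
        if len(batch) == BATCH_SIZE:
            _flush(model, batch)
            batch = []
    if batch:
        _flush(model, batch)

def _flush(model, batch):
    try:
        model.objects.bulk_create(batch)
    except IntegrityError:
        # bulk_create fails as a whole if any row collides, so retry row by row
        # and skip only the rows that actually violate the unique constraint.
        for row in batch:
            try:
                row.save()
            except IntegrityError:
                pass  # duplicate; skip it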