Example #1
def process_one_file(
        file_for_processing: FileForProcessing, survey_id_dict: dict, all_binified_data: DefaultDict,
        ftps_to_remove: set
):
    """ This function is the inner loop of the chunking process. """

    if file_for_processing.exception:
        file_for_processing.raise_data_processing_error()

    # there are two cases: chunkable data that can be stuck into "time bins" for each hour, and
    # files that do not need to be "binified" and pretty much just go into the ChunkRegistry unmodified.
    if file_for_processing.chunkable:
        newly_binified_data, survey_id_hash = process_csv_data(file_for_processing)

        # survey answers store the survey id in the file name (truly ancient design decision).
        if file_for_processing.data_type in SURVEY_DATA_FILES:
            survey_id_dict[survey_id_hash] = resolve_survey_id_from_file_name(
                file_for_processing.file_to_process.s3_file_path)

        if newly_binified_data:
            append_binified_csvs(
                all_binified_data, newly_binified_data, file_for_processing.file_to_process
            )
        else:  # delete empty files from FilesToProcess
            ftps_to_remove.add(file_for_processing.file_to_process.id)
        return

    else:
        # case: unchunkable data file
        timestamp = clean_java_timecode(
            file_for_processing.file_to_process.s3_file_path.rsplit("/", 1)[-1][:-4]
        )
        # Since we aren't binning the data by hour, just create a ChunkRegistry that
        # points to the already existing S3 file.
        try:
            ChunkRegistry.register_unchunked_data(
                file_for_processing.data_type,
                timestamp,
                file_for_processing.file_to_process.s3_file_path,
                file_for_processing.file_to_process.study.pk,
                file_for_processing.file_to_process.participant.pk,
                file_for_processing.file_contents,
            )
            ftps_to_remove.add(file_for_processing.file_to_process.id)
        except ValidationError as ve:
            if len(ve.messages) != 1:
                # case: the error handled below is very specific; we only want that singular error.
                raise

            # case: an unchunkable file was re-uploaded, causing a duplicate file path collision
            # we detect this specific case and update the registry with the new file size
            # (hopefully it doesn't actually change)
            if 'Chunk registry with this Chunk path already exists.' in ve.messages:
                ChunkRegistry.update_registered_unchunked_data(
                    file_for_processing.data_type,
                    file_for_processing.file_to_process.s3_file_path,
                    file_for_processing.file_contents,
                )
                ftps_to_remove.add(file_for_processing.file_to_process.id)
            else:
                # any other error: re-raise
                raise
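
For context, process_one_file is only the inner loop; a caller is expected to build the shared accumulators, iterate over the retrieved files, and then flush the bins. Below is a minimal, hypothetical driver sketch (the name process_all_files is ours) assuming the helpers used in the other examples: ErrorHandler, upload_binified_data, and FileToProcess.

from collections import defaultdict, deque

def process_all_files(files_for_processing, error_handler):
    # Shared accumulators that process_one_file mutates in place.
    all_binified_data = defaultdict(lambda: (deque(), deque()))
    ftps_to_remove = set()
    survey_id_dict = {}

    for file_for_processing in files_for_processing:
        with error_handler:
            process_one_file(
                file_for_processing, survey_id_dict, all_binified_data, ftps_to_remove
            )

    # Flush the hourly bins to S3, then delete the FileToProcess rows that are finished.
    more_ftps_to_remove, number_bad_files = upload_binified_data(
        all_binified_data, error_handler, survey_id_dict)
    ftps_to_remove.update(more_ftps_to_remove)
    FileToProcess.objects.filter(pk__in=ftps_to_remove).delete()
    return number_bad_files
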
Example #2
def chunk_file_lambda_handler(event, context):

    error_handler = ErrorHandler()

    # Declare a defaultdict containing a tuple of two double ended queues (deque, pronounced "deck")
    all_binified_data = defaultdict(lambda: (deque(), deque()))
    ftps_to_remove = set()
    survey_id_dict = {}

    for record in event['Records']:
        full_s3_path = record['s3']['object']['key']
        key_values = full_s3_path.split('/')

        # there was a bug that would incorrectly prepend the s3_path with the study_object_id;
        # let's try to undo that here
        if 'RAW_DATA' not in key_values[0]:

            key_values = key_values[key_values.index('RAW_DATA'):]

            #logger.error('S3 path {0} does not appear to be in RAW_DATA'.format(full_s3_path))
            #return {
            #'statusCode': 200,
            #'body': json.dumps('False positive!')
            #}

        study_object_id = key_values[1]
        participant_id = key_values[2]

        try:
            participant = Participant.objects.get(patient_id=participant_id)
        except Participant.DoesNotExist:
            logger.error('Could not find participant {0} for file to process {1}'.format(
                participant_id, full_s3_path))
            return {'statusCode': 200, 'body': json.dumps('Lambda failed!')}

        # a file with no extension means we may need to create RSA keys for this participant;
        # let's investigate!
        _, file_extension = os.path.splitext(full_s3_path)
        if not file_extension:

            # first check to see if a key pair already exists
            key_paths = construct_s3_key_paths(study_object_id, participant_id)
            logger.info(
                'Look to see if keys already exist at: {}'.format(key_paths))

            if s3_exists(key_paths['private'], study_object_id, raw_path=True) and \
               s3_exists(key_paths['public'], study_object_id, raw_path=True):

                logger.error('Key pair already exists for {0}: {1}'.format(
                    study_object_id, participant_id))

                return {
                    'statusCode': 200,
                    'body': json.dumps('Key pair already exists for {0}: {1}'.format(
                        study_object_id, participant_id))
                }

            else:
                logger.info('Generating key pair for {0}: {1}'.format(
                    study_object_id, participant_id))
                create_client_key_pair(participant_id, study_object_id)

                return {
                    'statusCode': 200,
                    'body': json.dumps('Created key pair for {0}: {1}'.format(
                        study_object_id, participant_id))
                }

        else:

            try:
                file_to_process = participant.files_to_process.get(
                    s3_file_path=full_s3_path)
            except FileToProcess.MultipleObjectsReturned:
                # Sometimes there are multiple entries in files_to_process with the same s3 path.
                # It is unclear why this happens, but it should be safe to keep the first entry
                # and delete the others.
                the_first_file = True
                for fp in participant.files_to_process.filter(s3_file_path=full_s3_path):
                    if the_first_file:
                        file_to_process = fp
                        the_first_file = False
                    else:
                        ftps_to_remove.add(fp.id)
            except FileToProcess.DoesNotExist:
                logger.error('Could not find file to process {0} for participant {1}'.format(
                    full_s3_path, participant_id))
                return {'statusCode': 200, 'body': json.dumps('Lambda failed!')}

            # some paths were corrupted by prepending the study object id to the correct path, remove this
            if 'RAW_DATA' not in file_to_process.s3_file_path[0:8]:
                path_vals = file_to_process.s3_file_path.split('/')
                file_to_process.s3_file_path = '/'.join(
                    path_vals[path_vals.index('RAW_DATA'):])
                file_to_process.save()

            data = batch_retrieve_for_processing(file_to_process)

            # If we encountered any errors in retrieving the file for processing, they have been
            # lumped together into data['exception']. Log the original traceback and re-raise.
            if data['exception']:
                logger.error("\n" + data['ftp']['s3_file_path'])
                logger.error(data['traceback'])
                # The re-raised exception loses its original stack trace because it was captured
                # during batch retrieval; the traceback logged above is the real one.
                raise data['exception']

            if data['chunkable']:
                newly_binified_data, survey_id_hash = process_csv_data(data)
                if data['data_type'] in SURVEY_DATA_FILES:
                    survey_id_dict[survey_id_hash] = resolve_survey_id_from_file_name(
                        data['ftp']["s3_file_path"])

                if newly_binified_data:
                    append_binified_csvs(all_binified_data, newly_binified_data, data['ftp'])
                else:  # delete empty files from FilesToProcess
                    ftps_to_remove.add(data['ftp']['id'])

            else:  # if not data['chunkable']
                timestamp = clean_java_timecode(
                    data['ftp']["s3_file_path"].rsplit("/", 1)[-1][:-4])

                chunked_file_path = construct_s3_chunk_path_from_raw_data_path(
                    data['ftp']['s3_file_path'])

                try:
                    s3_move(data['ftp']['s3_file_path'],
                            chunked_file_path,
                            data['ftp']['study'].object_id,
                            raw_path=True)

                    # Since we aren't binning the data by hour, just create a ChunkRegistry that
                    # points to the already existing S3 file.
                    ChunkRegistry.register_unchunked_data(
                        data['data_type'],
                        timestamp,
                        chunked_file_path,
                        data['ftp']['study'].pk,
                        data['ftp']['participant'].pk,
                    )
                except Exception:
                    logger.error("Could not find s3 file {0}, removing it as a file to process".format(
                        data['ftp']['s3_file_path']))

                ftps_to_remove.add(data['ftp']['id'])

    more_ftps_to_remove, number_bad_files = upload_binified_data(
        all_binified_data, error_handler, survey_id_dict)
    ftps_to_remove.update(more_ftps_to_remove)
    # Actually delete the processed FTPs from the database
    FileToProcess.objects.filter(pk__in=ftps_to_remove).delete()

    # delete the file that triggered the lambda
    #s3_delete(full_s3_path,'',raw_path=True)

    # Garbage collect to free up memory
    gc.collect()
    # print "Z"
    return {'statusCode': 200, 'body': json.dumps('Hello from Lambda!')}
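
For reference, the handler only reads the object key out of each S3 record, and the indexing of key_values above implies a RAW_DATA/<study_object_id>/<patient_id>/... layout. A minimal, hypothetical event for exercising the handler locally:

# Hypothetical S3 event; only event['Records'][i]['s3']['object']['key'] is read.
# The placeholder segments stand in for real study/participant identifiers.
fake_event = {
    'Records': [
        {'s3': {'object': {'key': 'RAW_DATA/<study_object_id>/<patient_id>/gps/1584367200000.csv'}}},
    ]
}
# chunk_file_lambda_handler(fake_event, context=None)
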
Example #3
def do_process_user_file_chunks(count: int, error_handler: ErrorHandler,
                                skip_count: int, participant: Participant):
    """
    Run through the files to process, pull their data, put it into s3 bins. Run the file through
    the appropriate logic path based on file type.

    If a file is empty, its ftp object is added to the removal set; we can't delete objects
    in-place while iterating over the db.

    All files except for the audio recording files are in the form of CSVs. Most of those files
    can be separated by "time bin" (separated into one-hour chunks) and concatenated and sorted
    trivially. A few files (call log, identifier file, and wifi log) require some triage
    beforehand. The debug log cannot be correctly sorted by time for all elements, because it
    was not actually expected to be used by researchers, but is apparently quite useful.

    Any errors are themselves concatenated using the passed in error handler.

    In a single call to this function, count files will be processed, starting from file number
    skip_count. The first skip_count files are expected to be files that have previously errored
    in file processing.
    """
    # Declare a defaultdict containing a tuple of two double ended queues (deque, pronounced "deck")
    all_binified_data = defaultdict(lambda: (deque(), deque()))
    ftps_to_remove = set()
    # The ThreadPool enables downloading multiple files simultaneously from the network, and continuing
    # to download files as other files are being processed, making the code as a whole run faster.
    pool = ThreadPool(CONCURRENT_NETWORK_OPS)
    survey_id_dict = {}

    # A Django query with a slice (e.g. .all()[x:y]) makes a LIMIT query, so it
    # only gets from the database those FTPs that are in the slice.
    print(participant.as_native_python())
    print(len(participant.files_to_process.exclude(deleted=True).all()))
    print(count)
    print(skip_count)

    files_to_process = participant.files_to_process.exclude(deleted=True).all()

    for data in pool.map(batch_retrieve_for_processing,
                         files_to_process[skip_count:count + skip_count],
                         chunksize=1):
        with error_handler:
            # If we encountered any errors in retrieving the files for processing, they have been
            # lumped together into data['exception']. Raise them here to the error handler and
            # move to the next file.
            if data['exception']:
                print("\n" + data['ftp']['s3_file_path'])
                print(data['traceback'])
                ################################################################
                # YOU ARE SEEING THIS EXCEPTION WITHOUT A STACK TRACE
                # BECAUSE IT OCCURRED INSIDE POOL.MAP ON ANOTHER THREAD
                ################################################################
                raise data['exception']

            if data['chunkable']:
                newly_binified_data, survey_id_hash = process_csv_data(data)
                if data['data_type'] in SURVEY_DATA_FILES:
                    survey_id_dict[survey_id_hash] = resolve_survey_id_from_file_name(
                        data['ftp']["s3_file_path"])

                if newly_binified_data:
                    append_binified_csvs(all_binified_data, newly_binified_data, data['ftp'])
                else:  # delete empty files from FilesToProcess
                    ftps_to_remove.add(data['ftp']['id'])
                continue

            # if not data['chunkable']
            else:
                timestamp = clean_java_timecode(
                    data['ftp']["s3_file_path"].rsplit("/", 1)[-1][:-4])
                # Since we aren't binning the data by hour, just create a ChunkRegistry that
                # points to the already existing S3 file.
                ChunkRegistry.register_unchunked_data(
                    data['data_type'],
                    timestamp,
                    data['ftp']['s3_file_path'],
                    data['ftp']['study'].pk,
                    data['ftp']['participant'].pk,
                    data['file_contents'],
                )
                ftps_to_remove.add(data['ftp']['id'])

    pool.close()
    pool.terminate()
    more_ftps_to_remove, number_bad_files = upload_binified_data(
        all_binified_data, error_handler, survey_id_dict)
    ftps_to_remove.update(more_ftps_to_remove)
    # Actually delete the processed FTPs from the database
    FileToProcess.objects.filter(pk__in=ftps_to_remove).delete()
    # Garbage collect to free up memory
    gc.collect()
    return number_bad_files
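
The "time bin" idea from the docstring boils down to truncating each row's timestamp to the hour. A small illustrative sketch of that arithmetic follows; it is not the real process_csv_data / binify implementation, and the constant name is an assumption.

CHUNK_TIMESLICE_QUANTUM = 60 * 60  # one hour in seconds (assumed constant name)

def hour_bin(unix_timestamp_seconds: int) -> int:
    # Every timestamp inside the same wall-clock hour maps to the same bin key.
    return unix_timestamp_seconds - (unix_timestamp_seconds % CHUNK_TIMESLICE_QUANTUM)

# e.g. hour_bin(3601) == hour_bin(7199) == 3600, so both rows land in the same hourly chunk.
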
Example #4
def do_process_user_file_chunks(count: int, error_handler: ErrorHandler, skip_count: int,
                                participant: Participant):
    """
    Run through the files to process, pull their data, put it into s3 bins. Run the file through
    the appropriate logic path based on file type.

    If a file is empty, its ftp object is added to the removal set; we can't delete objects
    in-place while iterating over the db.

    All files except for the audio recording files are in the form of CSVs. Most of those files
    can be separated by "time bin" (separated into one-hour chunks) and concatenated and sorted
    trivially. A few files (call log, identifier file, and wifi log) require some triage
    beforehand. The debug log cannot be correctly sorted by time for all elements, because it
    was not actually expected to be used by researchers, but is apparently quite useful.

    Any errors are themselves concatenated using the passed in error handler.

    In a single call to this function, count files will be processed, starting from file number
    skip_count. The first skip_count files are expected to be files that have previously errored
    in file processing.
    """
    # Declare a defaultdict containing a tuple of two double ended queues (deque, pronounced "deck")
    all_binified_data = defaultdict(lambda: (deque(), deque()))
    ftps_to_remove = set()
    # The ThreadPool enables downloading multiple files simultaneously from the network, and continuing
    # to download files as other files are being processed, making the code as a whole run faster.
    pool = ThreadPool(CONCURRENT_NETWORK_OPS)
    survey_id_dict = {}

    # A Django query with a slice (e.g. .all()[x:y]) makes a LIMIT query, so it
    # only gets from the database those FTPs that are in the slice.
    # print(participant.as_unpacked_native_python())
    print(len(participant.files_to_process.exclude(deleted=True).all()))
    print(count)
    print(skip_count)

    files_to_process = participant.files_to_process.exclude(deleted=True).all()

    for data in pool.map(batch_retrieve_for_processing,
                         files_to_process[skip_count:count+skip_count],
                         chunksize=1):
        with error_handler:
            if data['exception']:
                raise_data_processing_error(data)

            if data['chunkable']:
                # case: chunkable data files
                newly_binified_data, survey_id_hash = process_csv_data(data)
                if data['data_type'] in SURVEY_DATA_FILES:
                    survey_id_dict[survey_id_hash] = resolve_survey_id_from_file_name(data['ftp']["s3_file_path"])

                if newly_binified_data:
                    append_binified_csvs(all_binified_data, newly_binified_data, data['ftp'])
                else:  # delete empty files from FilesToProcess
                    ftps_to_remove.add(data['ftp']['id'])
                continue
            else:
                # case: unchunkable data file
                timestamp = clean_java_timecode(data['ftp']["s3_file_path"].rsplit("/", 1)[-1][:-4])
                # Since we aren't binning the data by hour, just create a ChunkRegistry that
                # points to the already existing S3 file.
                try:
                    ChunkRegistry.register_unchunked_data(
                        data['data_type'],
                        timestamp,
                        data['ftp']['s3_file_path'],
                        data['ftp']['study'].pk,
                        data['ftp']['participant'].pk,
                        data['file_contents'],
                    )
                    ftps_to_remove.add(data['ftp']['id'])
                except ValidationError as ve:
                    if len(ve.messages) != 1:
                        # case: the error handled below is very specific; we only want that singular error.
                        raise

                    # case: an unchunkable file was re-uploaded, causing a duplicate file path collision
                    # we detect this specific case and update the registry with the new file size
                    # (hopefully it doesn't actually change)
                    if 'Chunk registry with this Chunk path already exists.' in ve.messages:
                        ChunkRegistry.update_registered_unchunked_data(
                            data['data_type'],
                            data['ftp']['s3_file_path'],
                            data['file_contents'],
                        )
                        ftps_to_remove.add(data['ftp']['id'])
                    else:
                        # any other error: re-raise
                        raise

    pool.close()
    pool.terminate()
    more_ftps_to_remove, number_bad_files = upload_binified_data(all_binified_data, error_handler, survey_id_dict)
    ftps_to_remove.update(more_ftps_to_remove)
    # Actually delete the processed FTPs from the database
    FileToProcess.objects.filter(pk__in=ftps_to_remove).delete()
    # Garbage collect to free up memory
    gc.collect()
    return number_bad_files
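
For context, the count / skip_count contract described in the docstring implies a paging loop in the caller: previously errored files stay at the front of the queue, so skip_count grows by the number of bad files on each pass. A hypothetical sketch (the function name process_participant_files and the page size are ours; error aggregation and the final re-raise are omitted):

def process_participant_files(participant: Participant, page_size: int = 100) -> None:
    error_handler = ErrorHandler()
    skip_count = 0
    while participant.files_to_process.exclude(deleted=True).count() > skip_count:
        number_bad_files = do_process_user_file_chunks(
            count=page_size,
            error_handler=error_handler,
            skip_count=skip_count,
            participant=participant,
        )
        # Bad files remain at the head of the queryset; skip past them on the next pass.
        skip_count += number_bad_files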