Example #1
0
def process_file_chunks():
    """
    This is the function that is called from the command line. It runs through all new files
    that have been uploaded and 'chunks' them, handling the logic for skipping bad files and
    raising errors appropriately.
    This is primarily called manually during testing and debugging.
    """
    # Initialize the process and ensure there is no other process running at the same time
    error_handler = ErrorHandler()
    if FileProcessLock.islocked():
        raise ProcessingOverlapError(
            "Data processing overlapped with a previous data indexing run.")
    FileProcessLock.lock()

    try:
        number_bad_files = 0

        # Get the list of participants with open files to process
        participants = Participant.objects.filter(
            files_to_process__isnull=False).distinct()
        print("processing files for the following users: %s" %
              ",".join(participants.values_list('patient_id', flat=True)))

        for participant in participants:
            while True:
                previous_number_bad_files = number_bad_files
                starting_length = participant.files_to_process.exclude(
                    deleted=True).count()

                print(
                    "%s processing %s, %s files remaining" %
                    (datetime.now(), participant.patient_id, starting_length))

                # Process the desired number of files and calculate the number of unprocessed files
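                # Passing skip_count=number_bad_files skips over files that have already failed
                # during this run, so a bad file at the front of the queue is not retried on
                # every pass through the loop.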
                number_bad_files += do_process_user_file_chunks(
                    count=FILE_PROCESS_PAGE_SIZE,
                    error_handler=error_handler,
                    skip_count=number_bad_files,
                    participant=participant,
                )

                # If no files were processed, quit processing
                if (participant.files_to_process.exclude(deleted=True).count()
                        == starting_length
                        and previous_number_bad_files == number_bad_files):
                    # Cases:
                    #   every file broke, might as well fail here, and would cause infinite loop otherwise.
                    #   no new files.
                    break
    finally:
        FileProcessLock.unlock()

    error_handler.raise_errors()
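    # EverythingWentFine is raised (rather than returned) as a success sentinel, so the caller
    # can tell a clean run apart from a silent failure.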
    raise EverythingWentFine(DATA_PROCESSING_NO_ERROR_STRING)
def process_file_chunks_lambda():
    """
    Variant of process_file_chunks that runs through all new files that have been uploaded and
    'chunks' them by dispatching each one to chunk_file_lambda_handler with a synthetic S3 event.
    This is primarily called manually during testing and debugging.
    """
    # Initialize the process and ensure there is no other process running at the same time
    error_handler = ErrorHandler()
    if FileProcessLock.islocked():
        raise ProcessingOverlapError(
            "Data processing overlapped with a previous data indexing run.")
    FileProcessLock.lock()

    try:
        number_bad_files = 0

        # Get the list of participants with open files to process
        participants = Participant.objects.filter(
            files_to_process__isnull=False).distinct()
        print("processing files for the following users: %s" %
              ",".join(participants.values_list('patient_id', flat=True)))

        for participant in participants:
            for fp in participant.files_to_process.all():
                print(fp.s3_file_path)
                event = {
                    'Records': [{
                        's3': {
                            'object': {
                                'key': fp.s3_file_path
                            }
                        }
                    }]
                }
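                # The dict above mimics the minimal shape of an S3 PUT event notification; the
                # handler only reads event['Records'][n]['s3']['object']['key'], and it ignores
                # its context argument, so an empty list is passed in its place below.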

                chunk_file_lambda_handler(event, [])

    finally:
        FileProcessLock.unlock()

    error_handler.raise_errors()
    raise EverythingWentFine(DATA_PROCESSING_NO_ERROR_STRING)
def process_file_chunks():
    """ This is the function that is called from cron.  It runs through all new files that have
    been uploaded and 'chunks' them. Handles logic for skipping bad files, raising errors
    appropriately. """
    error_handler = ErrorHandler()
    if FileProcessLock.islocked():
        raise ProcessingOverlapError(
            "Data processing overlapped with a previous data indexing run.")
    FileProcessLock.lock()
    number_bad_files = 0

    user_ids = set(FilesToProcess(field="user_id"))
    print "processing files for the following users: %s" % ",".join(user_ids)
    for user_id in user_ids:
        while True:
            previous_number_bad_files = number_bad_files
            starting_length = FilesToProcess.count(user_id=user_id)

            print("%s processing %s, %s files remaining" % (
                datetime.now(), user_id, starting_length))

            number_bad_files += do_process_user_file_chunks(
                count=FILE_PROCESS_PAGE_SIZE,
                error_handler=error_handler,
                skip_count=number_bad_files,
                user_id=user_id)

            if starting_length == FilesToProcess.count(
                    user_id=user_id):  #zero files processed
                if previous_number_bad_files == number_bad_files:
                    # Cases:
                    #   every file broke, might as well fail here, and would cause infinite loop otherwise.
                    #   no new files.
                    break
                else:
                    continue
    FileProcessLock.unlock()
    error_handler.raise_errors()
    raise EverythingWentFine(DATA_PROCESSING_NO_ERROR_STRING)
Example #4
0
def make_error_sentry(sentry_type, tags=None, null=False):
    """ Creates an ErrorSentry, defaults to error limit 10.
    If the applicable sentry DSN is missing will return an ErrorSentry,
    but if null truthy a NullErrorHandler will be returned instead. """

    dsn = get_dsn_from_string(sentry_type)
    tags = tags or {}

    try:
        return ErrorSentry(
            dsn,
            sentry_client_kwargs={'tags': tags, 'transport': HTTPTransport},
            sentry_report_limit=10
        )
    except InvalidDsn as e:
        log_error(e)
        if null:
            return NullErrorHandler()
        else:
            return ErrorHandler()
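
A minimal usage sketch, not code from the project: it assumes the cronutils-style convention that
the objects returned by make_error_sentry can be used as context managers which capture exceptions
raised inside the block; the "data" sentry_type string and run_file_processing() are hypothetical
placeholders.

# Hypothetical usage sketch; see the assumptions noted above.
error_sentry = make_error_sentry("data", tags={"source": "file_processing"}, null=True)
with error_sentry:
    run_file_processing()  # hypothetical placeholder for the monitored work
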
def chunk_file_lambda_handler(event, context):
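    """ AWS Lambda entry point that chunks a single uploaded raw data file.
    The triggering S3 event supplies the file's key; the matching FileToProcess is looked up,
    the data is chunked (or, for non-chunkable data types, moved wholesale and registered in
    ChunkRegistry), and the processed entries are removed from FileToProcess. A key with no file
    extension is treated as a signal to generate the participant's RSA key pair. """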

    error_handler = ErrorHandler()

    # Declare a defaultdict containing a tuple of two double ended queues (deque, pronounced "deck")
    all_binified_data = defaultdict(lambda: (deque(), deque()))
    ftps_to_remove = set()
    # Maps survey id hashes found in the binified data to the survey id resolved from the
    # corresponding file name (populated below for survey data files).
    survey_id_dict = {}

    for record in event['Records']:
        full_s3_path = record['s3']['object']['key']
        key_values = full_s3_path.split('/')

        # there was a bug that would incorrectly prepend the study_object_id to the s3_path;
        # let's try to undo that here
        if 'RAW_DATA' not in key_values[0]:
            key_values = key_values[key_values.index('RAW_DATA'):]

            # logger.error('S3 path {0} does not appear to be in RAW_DATA'.format(full_s3_path))
            # return {
            #     'statusCode': 200,
            #     'body': json.dumps('False positive!')
            # }

        study_object_id = key_values[1]
        participant_id = key_values[2]

        try:
            participant = Participant.objects.get(patient_id=participant_id)
        except Participant.DoesNotExist as e:
            logger.error('Could not find participant {0} for file to process {1}'.format(
                participant_id, full_s3_path))
            return {'statusCode': 200, 'body': json.dumps('Lambda failed!')}

        # a file with no extension means we may need to create RSA keys for this participant;
        # let's investigate!
        _, file_extension = os.path.splitext(full_s3_path)
        if not file_extension:

            # first check to see if a key pair already exists
            key_paths = construct_s3_key_paths(study_object_id, participant_id)
            logger.info(
                'Looking to see if keys already exist at: {}'.format(key_paths))

            if s3_exists(key_paths['private'], study_object_id, raw_path=True) and \
               s3_exists(key_paths['public'], study_object_id, raw_path=True):

                logger.error('Key pair already exists for {0}: {1}'.format(
                    study_object_id, participant_id))

                return {
                    'statusCode': 200,
                    'body': json.dumps('Key pair already exists for {0}: {1}'.format(
                        study_object_id, participant_id))
                }

            else:
                logger.info('Generating key pair for {0}: {1}'.format(
                    study_object_id, participant_id))
                create_client_key_pair(participant_id, study_object_id)

                return {
                    'statusCode': 200,
                    'body': json.dumps('Created key pair for {0}: {1}'.format(
                        study_object_id, participant_id))
                }

        else:

            try:
                file_to_process = participant.files_to_process.get(
                    s3_file_path=full_s3_path)
            except FileToProcess.MultipleObjectsReturned as e:
                # Sometimes there are multiple entries in files_to_process with the same s3 path.
                # It is not clear why this happens, but it should be safe to keep the first
                # entry and mark the rest for deletion.
                the_first_file = True
                for fp in participant.files_to_process.filter(
                        s3_file_path=full_s3_path):
                    if the_first_file:
                        file_to_process = fp
                        the_first_file = False
                    else:
                        ftps_to_remove.add(fp.id)
            except FileToProcess.DoesNotExist as e:
                logger.error('Could not find file to process {0} for participant {1}'.format(
                    full_s3_path, participant_id))
                return {'statusCode': 200, 'body': json.dumps('Lambda failed!')}

            # some paths were corrupted by prepending the study object id to the correct path;
            # strip that here
            if not file_to_process.s3_file_path.startswith('RAW_DATA'):
                path_vals = file_to_process.s3_file_path.split('/')
                file_to_process.s3_file_path = '/'.join(
                    path_vals[path_vals.index('RAW_DATA'):])
                file_to_process.save()

            data = batch_retrieve_for_processing(file_to_process)
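            # data is a dict describing the retrieval: the 'ftp' record (with its study,
            # participant, and s3 path), whether the 'data_type' is 'chunkable', and any
            # 'exception'/'traceback' captured while fetching the file from S3.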

            # If we encountered any errors in retrieving the file for processing, they have been
            # stashed in data['exception'], along with the traceback captured where the error
            # actually occurred. Log both, then re-raise so the invocation is reported as a failure.
            if data['exception']:
                logger.error("\n" + data['ftp']['s3_file_path'])
                logger.error(data['traceback'])
                raise data['exception']

            if data['chunkable']:
                newly_binified_data, survey_id_hash = process_csv_data(data)

                # for survey data, record which survey this file belongs to
                if data['data_type'] in SURVEY_DATA_FILES:
                    survey_id_dict[survey_id_hash] = resolve_survey_id_from_file_name(
                        data['ftp']["s3_file_path"])

                if newly_binified_data:
                    append_binified_csvs(all_binified_data,
                                         newly_binified_data, data['ftp'])
                else:  # delete empty files from FilesToProcess
                    ftps_to_remove.add(data['ftp']['id'])

            else:  # if not data['chunkable']
                # the file name (minus its 4-character extension) is a Java timecode giving the
                # file's timestamp
                timestamp = clean_java_timecode(
                    data['ftp']["s3_file_path"].rsplit("/", 1)[-1][:-4])

                chunked_file_path = construct_s3_chunk_path_from_raw_data_path(
                    data['ftp']['s3_file_path'])

                try:
                    s3_move(data['ftp']['s3_file_path'],
                            chunked_file_path,
                            data['ftp']['study'].object_id,
                            raw_path=True)

                    # Since we aren't binning the data by hour, just create a ChunkRegistry that
                    # points to the already existing S3 file.
                    ChunkRegistry.register_unchunked_data(
                        data['data_type'],
                        timestamp,
                        chunked_file_path,
                        data['ftp']['study'].pk,
                        data['ftp']['participant'].pk,
                    )
                except Exception:
                    logger.error(
                        "Could not find s3 file {0}, removing it as a file to process"
                        .format(data['ftp']['s3_file_path']))

                ftps_to_remove.add(data['ftp']['id'])

    # Upload the binified data accumulated above
    more_ftps_to_remove, number_bad_files = upload_binified_data(
        all_binified_data, error_handler, survey_id_dict)
    ftps_to_remove.update(more_ftps_to_remove)
    # Actually delete the processed FTPs from the database
    FileToProcess.objects.filter(pk__in=ftps_to_remove).delete()

    # delete the file that triggered the lambda
    # s3_delete(full_s3_path, '', raw_path=True)

    # Garbage collect to free up memory
    gc.collect()
    # print "Z"
    return {'statusCode': 200, 'body': json.dumps('Hello from Lambda!')}