Example no. 1
0
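# Imports assumed by the snippet below (standard library and Django only); the project-specific
# helpers and models (process_csv_data, append_binified_csvs, ChunkRegistry, etc.) come from the
# surrounding codebase and are not reproduced here.
import gc
from collections import defaultdict, deque
from multiprocessing.pool import ThreadPool
from typing import DefaultDict

from django.core.exceptions import ValidationError

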
def process_one_file(
        file_for_processing: FileForProcessing, survey_id_dict: dict, all_binified_data: DefaultDict,
        ftps_to_remove: set
):
    """ This function is the inner loop of the chunking process. """

    if file_for_processing.exception:
        file_for_processing.raise_data_processing_error()

    # there are two cases: chunkable data that can be stuck into "time bins" for each hour, and
    # files that do not need to be "binified" and pretty much just go into the ChunkRegistry unmodified.
    if file_for_processing.chunkable:
        newly_binified_data, survey_id_hash = process_csv_data(file_for_processing)

        # survey answers store the survey id in the file name (truly ancient design decision).
        if file_for_processing.data_type in SURVEY_DATA_FILES:
            survey_id_dict[survey_id_hash] = resolve_survey_id_from_file_name(
                file_for_processing.file_to_process.s3_file_path)

        if newly_binified_data:
            append_binified_csvs(
                all_binified_data, newly_binified_data, file_for_processing.file_to_process
            )
        else:  # delete empty files from FilesToProcess
            ftps_to_remove.add(file_for_processing.file_to_process.id)
        return

    else:
        # case: unchunkable data file
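        # The trailing path component is expected to look like "<java millisecond timestamp>.<ext>",
        # so [:-4] strips the dot-plus-three-character extension before conversion.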
        timestamp = clean_java_timecode(
            file_for_processing.file_to_process.s3_file_path.rsplit("/", 1)[-1][:-4]
        )
        # Since we aren't binning the data by hour, just create a ChunkRegistry that
        # points to the already existing S3 file.
        try:
            ChunkRegistry.register_unchunked_data(
                file_for_processing.data_type,
                timestamp,
                file_for_processing.file_to_process.s3_file_path,
                file_for_processing.file_to_process.study.pk,
                file_for_processing.file_to_process.participant.pk,
                file_for_processing.file_contents,
            )
            ftps_to_remove.add(file_for_processing.file_to_process.id)
        except ValidationError as ve:
            if len(ve.messages) != 1:
                # case: the error handled below is very specific; we only want that single error.
                raise

            # case: an unchunkable file was re-uploaded, causing a duplicate file path collision
            # we detect this specific case and update the registry with the new file size
            # (hopefully it doesn't actually change)
            if 'Chunk registry with this Chunk path already exists.' in ve.messages:
                ChunkRegistry.update_registered_unchunked_data(
                    file_for_processing.data_type,
                    file_for_processing.file_to_process.s3_file_path,
                    file_for_processing.file_contents,
                )
                ftps_to_remove.add(file_for_processing.file_to_process.id)
            else:
                # any other error should be re-raised
                raise


def do_process_user_file_chunks(count: int, error_handler: ErrorHandler, skip_count: int,
                                participant: Participant):
    """
    Run through the files to process, pull their data, put it into s3 bins. Run the file through
    the appropriate logic path based on file type.

    If a file is empty, add its FileToProcess object to ftps_to_remove; we can't delete
    objects in-place while iterating over the db query.

    All files except the audio recordings are CSVs; most of them can be separated by "time bin"
    (one-hour chunks) and then concatenated and sorted trivially. A few files (the call log,
    identifier file, and wifi log) require some triage beforehand. The debug log cannot be
    correctly sorted by time for all elements because it was never actually expected to be used
    by researchers, but it has turned out to be quite useful.

    Any errors are themselves concatenated using the passed in error handler.

    In a single call to this function, count files will be processed, starting from file number
    skip_count. The first skip_count files are expected to be files that have previously errored
    in file processing.
    """
    # Declare a defaultdict containing a tuple of two double ended queues (deque, pronounced "deck")
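    # (presumably one deque collects the binified csv rows and the other the contributing
    # FileToProcess entries; see the append_binified_csvs calls below)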
    all_binified_data = defaultdict(lambda: (deque(), deque()))
    ftps_to_remove = set()
    # The ThreadPool enables downloading multiple files simultaneously from the network, and continuing
    # to download files as other files are being processed, making the code as a whole run faster.
    pool = ThreadPool(CONCURRENT_NETWORK_OPS)
    survey_id_dict = {}

    # A Django query with a slice (e.g. .all()[x:y]) makes a LIMIT query, so it
    # only gets from the database those FTPs that are in the slice.
    print("files to process:", participant.files_to_process.exclude(deleted=True).count())
    print("count:", count)
    print("skip_count:", skip_count)

    files_to_process = participant.files_to_process.exclude(deleted=True).all()

    for data in pool.map(batch_retrieve_for_processing,
                         files_to_process[skip_count:count+skip_count],
                         chunksize=1):
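        # Each file is handled inside the error handler's context, so an exception on one file is
        # collected by the handler rather than aborting the rest of the batch.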
        with error_handler:
            if data['exception']:
                raise_data_processing_error(data)

            if data['chunkable']:
                # case: chunkable data files
                newly_binified_data, survey_id_hash = process_csv_data(data)
                if data['data_type'] in SURVEY_DATA_FILES:
                    survey_id_dict[survey_id_hash] = resolve_survey_id_from_file_name(data['ftp']["s3_file_path"])

                if newly_binified_data:
                    append_binified_csvs(all_binified_data, newly_binified_data, data['ftp'])
                else:  # delete empty files from FilesToProcess
                    ftps_to_remove.add(data['ftp']['id'])
                continue
            else:
                # case: unchunkable data file
                timestamp = clean_java_timecode(data['ftp']["s3_file_path"].rsplit("/", 1)[-1][:-4])
                # Since we aren't binning the data by hour, just create a ChunkRegistry that
                # points to the already existing S3 file.
                try:
                    ChunkRegistry.register_unchunked_data(
                        data['data_type'],
                        timestamp,
                        data['ftp']['s3_file_path'],
                        data['ftp']['study'].pk,
                        data['ftp']['participant'].pk,
                        data['file_contents'],
                    )
                    ftps_to_remove.add(data['ftp']['id'])
                except ValidationError as ve:
                    if len(ve.messages) != 1:
                        # case: the error handled below is very specific; we only want that single error.
                        raise

                    # case: an unchunkable file was re-uploaded, causing a duplicate file path collision
                    # we detect this specific case and update the registry with the new file size
                    # (hopefully it doesn't actually change)
                    if 'Chunk registry with this Chunk path already exists.' in ve.messages:
                        ChunkRegistry.update_registered_unchunked_data(
                            data['data_type'],
                            data['ftp']['s3_file_path'],
                            data['file_contents'],
                        )
                        ftps_to_remove.add(data['ftp']['id'])
                    else:
                        # any other error should be re-raised
                        raise

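    # All pool.map results have already been consumed above, so the pool can be shut down here.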
    pool.close()
    pool.terminate()
    more_ftps_to_remove, number_bad_files = upload_binified_data(all_binified_data, error_handler, survey_id_dict)
    ftps_to_remove.update(more_ftps_to_remove)
    # Actually delete the processed FTPs from the database
    FileToProcess.objects.filter(pk__in=ftps_to_remove).delete()
    # Garbage collect to free up memory
    gc.collect()
    return number_bad_files
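

# A hypothetical driver sketched from the docstring above: page through a participant's files,
# processing `page_size` at a time and skipping the files that have already errored. The names
# process_participant_files, page_size, and previously_errored are illustrative only and are not
# part of the codebase.
def process_participant_files(participant: Participant, error_handler: ErrorHandler,
                              page_size: int = 100):
    previously_errored = 0
    # Keep going while there are undeleted files beyond the ones that have already failed.
    while participant.files_to_process.exclude(deleted=True).count() > previously_errored:
        number_bad_files = do_process_user_file_chunks(
            count=page_size,
            error_handler=error_handler,
            skip_count=previously_errored,
            participant=participant,
        )
        # Failed files stay in FilesToProcess, so treat them as part of the skip on the next pass.
        previously_errored += number_bad_files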