def reindex_all_files_to_process():
    """
    Totally clears the FilesToProcess DB, deletes all chunked files on S3,
    clears the ChunksRegistry DB, reads all relevant files on S3 to the
    FilesToProcess registry and then re-chunks them.

    Raises:
        Exception: always, as the very first statement. The rest of the body
            is deliberately disabled until it has been re-tested against the
            current database backend.
    """
    raise Exception(
        "This code has not been tested since converting database backends, that means 2018"
    )

    # ------------------------------------------------------------------
    # Everything below is unreachable until the guard above is removed.
    # ------------------------------------------------------------------
    FileProcessLock.lock()
    pool = None
    try:
        # Delete all preexisting FTP and ChunkRegistry objects
        print('{!s} purging FileToProcess: {:d}'.format(
            datetime.now(), FileToProcess.objects.count()))
        FileToProcess.objects.all().delete()
        print('{!s} purging ChunkRegistry: {:d}'.format(
            datetime.now(), ChunkRegistry.objects.count()))
        ChunkRegistry.objects.all().delete()

        pool = ThreadPool(CONCURRENT_NETWORK_OPS * 2)

        # Delete all preexisting chunked data files
        chunked_data_paths = s3_list_files(CHUNKS_FOLDER)
        print('{!s} deleting older chunked data: {:d}'.format(
            datetime.now(), len(chunked_data_paths)))
        pool.map(s3_delete, chunked_data_paths)
        del chunked_data_paths

        # Get a list of all S3 files to replace in the database
        print('{!s} pulling new files to process...'.format(datetime.now()))
        files_lists = pool.map(
            s3_list_files, Study.objects.values_list('object_id', flat=True))

        # For each such file, create an FTP object
        print("putting new files to process...")
        # pool.map returns one list per study, so its length is the study
        # count; this avoids a COUNT query on every iteration.
        study_total = len(files_lists)
        # Pre-bind the loop variable so the `del` below cannot raise a
        # NameError when there are no studies at all.
        study_files = None
        for position, study_files in enumerate(files_lists, start=1):
            print('{!s} {:d} of {:d}, {:d} files'.format(
                datetime.now(), position, study_total, len(study_files)))
            for fp in study_files:
                if fp[-4:] not in PROCESSABLE_FILE_EXTENSIONS:
                    continue
                # The path is split as <first segment>/<patient id>/<rest>;
                # the first segment is handed on unchanged as the second
                # argument of append_file_for_processing.
                patient_id = fp.split('/', 2)[1]
                participant_pk = Participant.objects.filter(
                    patient_id=patient_id).values_list('pk', flat=True).get()
                FileToProcess.append_file_for_processing(
                    fp, fp.split("/", 1)[0], participant_id=participant_pk)

        # Drop the (potentially large) listings before re-chunking starts.
        del files_lists, study_files
    finally:
        # Always shut the pool down and release the lock, even on failure;
        # otherwise a crash here leaves file processing locked.
        if pool is not None:
            pool.close()
            pool.terminate()
        FileProcessLock.unlock()

    # Rechunk the newly created FTPs. This must happen after the unlock,
    # because process_file_chunks refuses to run while the lock is held.
    print("{!s} processing data.".format(datetime.now()))
    process_file_chunks()
def process_file_chunks():
    """
    This is the function that is called from the command line. It runs
    through all new files that have been uploaded and 'chunks' them.
    Handles logic for skipping bad files, raising errors appropriately.
    This is primarily called manually during testing and debugging.

    Raises:
        ProcessingOverlapError: if the file process lock is already held.
        EverythingWentFine: on completion, when error_handler.raise_errors()
            did not raise. The function never returns normally.
    """
    # Initialize the process and ensure there is no other process running at the same time
    error_handler = ErrorHandler()
    if FileProcessLock.islocked():
        raise ProcessingOverlapError(
            "Data processing overlapped with a previous data indexing run.")
    FileProcessLock.lock()

    try:
        # Get the list of participants with open files to process
        participants = Participant.objects.filter(
            files_to_process__isnull=False).distinct()
        print("processing files for the following users: %s" %
              ",".join(participants.values_list('patient_id', flat=True)))

        for participant in participants:
            # The bad-file counter doubles as skip_count, which is passed
            # together with a single participant. It is therefore reset for
            # each participant so one participant's bad files do not cause
            # the next participant's files to be skipped.
            # NOTE(review): assumes skip_count offsets into this
            # participant's own pending files - confirm against
            # do_process_user_file_chunks.
            number_bad_files = 0

            while True:
                previous_number_bad_files = number_bad_files
                starting_length = participant.files_to_process.exclude(
                    deleted=True).count()

                print("%s processing %s, %s files remaining" %
                      (datetime.now(), participant.patient_id, starting_length))

                # Process the desired number of files and calculate the number of unprocessed files
                number_bad_files += do_process_user_file_chunks(
                    count=FILE_PROCESS_PAGE_SIZE,
                    error_handler=error_handler,
                    skip_count=number_bad_files,
                    participant=participant,
                )

                remaining = participant.files_to_process.exclude(
                    deleted=True).count()
                # If no files were processed, quit processing
                if (remaining == starting_length
                        and previous_number_bad_files == number_bad_files):
                    # Cases:
                    #   every file broke, might as well fail here, and would
                    #   cause an infinite loop otherwise.
                    #   no new files.
                    break
    finally:
        FileProcessLock.unlock()

    error_handler.raise_errors()
    raise EverythingWentFine(DATA_PROCESSING_NO_ERROR_STRING)
def reindex_specific_data_type(data_type):
    """
    Delete every chunk registered for `data_type` (database rows and the S3
    objects they point at), re-queue S3 files for processing and re-chunk.

    Args:
        data_type: value matched against ChunkRegistry.data_type and passed to
            data_stream_to_s3_file_name_string for validation.

    Raises:
        Exception: always, as the very first statement. The rest of the body
            is deliberately disabled until it has been re-tested against the
            current database backend.
    """
    raise Exception(
        "This code has not been tested since converting database backends")

    # ------------------------------------------------------------------
    # Everything below is unreachable until the guard above is removed.
    # ------------------------------------------------------------------
    FileProcessLock.lock()
    pool = None
    try:
        print("starting...")

        # Convert the data type; raise an error if something is wrong with it.
        # NOTE(review): the converted name is not used afterwards, so every
        # processable file is re-queued below, not just files of this data
        # type - confirm whether a filter on the file name was intended.
        data_stream_to_s3_file_name_string(data_type)

        # Get all chunk paths of the given data type
        relevant_chunks = ChunkRegistry.objects.filter(data_type=data_type)
        # list() ensures that the QuerySet is evaluated before all of its
        # elements are deleted (otherwise it would be empty)
        relevant_indexed_files = list(
            relevant_chunks.values_list('chunk_path', flat=True))

        # Delete the old ChunkRegistry objects
        print("purging old data...")
        relevant_chunks.delete()

        pool = ThreadPool(20)
        pool.map(s3_delete, relevant_indexed_files)

        print("pulling files to process...")
        files_lists = pool.map(
            s3_list_files, Study.objects.values_list('object_id', flat=True))

        # pool.map returns one list per study, so its length is the study
        # count; this avoids a COUNT query on every iteration.
        study_total = len(files_lists)
        # Pre-bind the loop variable so the `del` below cannot raise a
        # NameError when there are no studies at all.
        study_files = None
        for position, study_files in enumerate(files_lists, start=1):
            print('{!s} {:d} of {:d}, {:d} files'.format(
                datetime.now(), position, study_total, len(study_files)))
            for fp in study_files:
                if fp[-4:] not in PROCESSABLE_FILE_EXTENSIONS:
                    continue
                # The path is split as <first segment>/<patient id>/<rest>.
                patient_id = fp.split('/', 2)[1]
                participant_pk = Participant.objects.filter(
                    patient_id=patient_id).values_list('pk', flat=True).get()
                FileToProcess.append_file_for_processing(
                    fp, fp.split("/", 1)[0], participant_id=participant_pk)

        # Drop the (potentially large) listings before re-chunking starts.
        del files_lists, study_files
    finally:
        # Always shut the pool down and release the lock, even on failure;
        # otherwise a crash here leaves file processing locked.
        if pool is not None:
            pool.close()
            pool.terminate()
        FileProcessLock.unlock()

    # Must run after the unlock: process_file_chunks refuses to start while
    # the lock is held.
    print("{!s} processing data.".format(datetime.now()))
    process_file_chunks()
    print("Done.")
def process_file_chunks_lambda():
    """
    Feed every pending file of every participant through
    chunk_file_lambda_handler, one file at a time, while holding the file
    process lock.

    For each file a minimal event of the form
    {'Records': [{'s3': {'object': {'key': <s3_file_path>}}}]} is built and
    passed to the handler together with an empty list as its second argument.

    Raises:
        ProcessingOverlapError: if the file process lock is already held.
        EverythingWentFine: on completion, when error_handler.raise_errors()
            did not raise. The function never returns normally.
    """
    # Initialize the process and ensure there is no other process running at the same time
    error_handler = ErrorHandler()
    if FileProcessLock.islocked():
        raise ProcessingOverlapError(
            "Data processing overlapped with a previous data indexing run.")
    FileProcessLock.lock()

    try:
        # Get the list of participants with open files to process
        participants = Participant.objects.filter(
            files_to_process__isnull=False).distinct()
        print("processing files for the following users: %s" %
              ",".join(participants.values_list('patient_id', flat=True)))

        for participant in participants:
            for fp in participant.files_to_process.all():
                print(fp.s3_file_path)
                event = {
                    'Records': [{
                        's3': {
                            'object': {
                                'key': fp.s3_file_path
                            }
                        }
                    }]
                }
                chunk_file_lambda_handler(event, [])
    finally:
        FileProcessLock.unlock()

    # NOTE(review): error_handler is never handed to the lambda handler in
    # this function, so nothing here records errors on it before this call.
    error_handler.raise_errors()
    raise EverythingWentFine(DATA_PROCESSING_NO_ERROR_STRING)
def report_file_processing_locked_and_exit():
    """
    Creates a useful error report with information about the run time.

    Prints how long the file process lock has been held. If that is no longer
    than CELERY_ERROR_REPORT_TIMEOUT_SECONDS the function simply returns.
    Otherwise it raises ProcessingOverlapError with a message describing the
    overlap; once the lock has been held for more than four times the
    timeout, the message is prefixed with an overload warning and emailed to
    the system administrators before raising.

    Raises:
        ProcessingOverlapError: when the lock has been held for longer than
            CELERY_ERROR_REPORT_TIMEOUT_SECONDS.
    """
    elapsed_seconds = FileProcessLock.get_time_since_locked().total_seconds()
    print("timedelta %s" % elapsed_seconds)

    # Recent lock: nothing to report.
    if elapsed_seconds <= CELERY_ERROR_REPORT_TIMEOUT_SECONDS:
        return

    threshold_minutes = CELERY_ERROR_REPORT_TIMEOUT_SECONDS / 60
    hours_running = str(int(elapsed_seconds / 60 / 60))
    minutes_running = str(int(elapsed_seconds / 60 % 60))

    template = (
        "Data processing has overlapped with a prior data index run that started more than "
        "%s minutes ago.\nThat prior run has been going for %s hour(s), %s minute(s)"
    )
    error_msg = template % (threshold_minutes, hours_running, minutes_running)

    # Escalate by email once the prior run is far past the timeout.
    if elapsed_seconds > CELERY_ERROR_REPORT_TIMEOUT_SECONDS * 4:
        error_msg = "DATA PROCESSING OVERLOADED, CHECK SERVER.\n" + error_msg
        email_system_administrators(
            error_msg, "DATA PROCESSING OVERLOADED, CHECK SERVER")

    raise ProcessingOverlapError(error_msg)
parser.add_argument('--delete_survey', help='Removes all of the surveys for a specified study.', nargs=1, type=str) parser.add_argument('--get_survey', help='Use the mobile API to retrieve all surveys for participant.', nargs=1, type=str) parser.add_argument('--create_participant_survey', help='Create a survey for a participant using the contents of a json file', nargs=2, type=str) parser.add_argument('--send_participant_message', help='Create a survey for a participant with a single informational text question that includes the given string', nargs=2, type=str) args = parser.parse_args() if args.unlock_fileprocessing_lock: FileProcessLock.unlock() print('Unlocked') if args.write_survey_config: try: study = Study.objects.get(pk=int(args.write_survey_config[0])) except Study.DoesNotExist: print("Could not find study {0}".format(args.write_survey_config[0])) raise survey_config = {} for survey in study.surveys.filter(deleted=False): if survey.deleted:
def create_file_processing_tasks():
    """
    Queue one file-processing task per participant that has pending files,
    then poll the queued tasks every 5 seconds until none is started or
    waiting any more.

    The file process lock is taken before queuing and released after the
    polling loop ends. If the lock is already held on entry, the overlap is
    reported via report_file_processing_locked_and_exit and the process exits.
    """
    # The entire code is wrapped in an ErrorSentry, which catches any errors
    # and sends them to Sentry.
    with make_error_sentry('data') as error_sentry:
        print(error_sentry.sentry_client.is_enabled())

        # Safety check: no code may execute while file processing is locked.
        if FileProcessLock.islocked():
            report_file_processing_locked_and_exit()
            # Reached only when the report function returns instead of raising.
            exit(0)

        FileProcessLock.lock()
        print("starting.")

        expiry = datetime.now() + timedelta(minutes=CELERY_EXPIRY_MINUTES)
        pending_participant_ids = (
            Participant.objects
            .filter(files_to_process__isnull=False)
            .distinct()
            .values_list("id", flat=True)
        )

        # Queue all users' file processing and keep the returned handles so
        # we can detect when every job has finished.
        running = [
            safe_queue_user(
                args=[participant_id],
                max_retries=0,
                expires=expiry,
                task_track_started=True,
                task_publish_retry=False,
                retry=False,
            )
            for participant_id in pending_participant_ids
        ]
        print("tasks:", running)

        # Re-check the outstanding tasks every 5 seconds.
        while running:
            # `state` can change on another thread, so it is read exactly once
            # per task per pass; only tasks still started/waiting are kept.
            running = [
                future for future in running
                if future.state in STARTED_OR_WAITING
            ]
            print("tasks:", running)
            if running:
                sleep(5)

        print("Finished, unlocking.")
        # The unlocking MUST be **inside** the with statement.
        FileProcessLock.unlock()