def assemble_deletable_files(sorted_data):
    """
    Gather the S3 paths of raw and chunked files that are candidates for deletion.

    sorted_data: iterable of (patient_id, expunge_start_date) pairs.  Every date is passed
        through convert_date before it is used.
    Returns a single flat list.  For each participant, in input order, it contains whatever
        assemble_raw_files returned followed by whatever assemble_chunked_files returned.

    A one-line summary is printed for every participant.
    """
    all_paths = []

    for patient_id, raw_start_date in sorted_data:
        participant = Participant.objects.get(patient_id=patient_id)

        # despite the name, the converted value is a datetime object
        expunge_dt = convert_date(raw_start_date)
        # whole seconds since the epoch, scaled up to milliseconds
        expunge_ms = int((expunge_dt - UNIX_EPOCH_START).total_seconds()) * 1000

        # "<study object id>/<patient id>/" and the same thing under the chunks folder
        participant_prefix = "/".join((str(participant.study.object_id), patient_id, ""))
        raw_listing = s3_list_files(participant_prefix, as_generator=True)
        chunk_listing = s3_list_files(
            "/".join((CHUNKS_FOLDER, participant_prefix)), as_generator=True
        )

        # raw files are filtered with the millisecond timestamp, chunked files with the datetime
        raw_files = assemble_raw_files(raw_listing, expunge_ms)
        chunked_files = assemble_chunked_files(chunk_listing, expunge_dt)

        summary = "timestamp: %s, (unixtime: %s): %s files" % (
            expunge_dt,
            expunge_ms / 1000,
            len(raw_files) + len(chunked_files),
        )
        print(patient_id, summary)

        all_paths += raw_files
        all_paths += chunked_files

    return all_paths
def do_upload(file_paths_and_contents, data_type=None, forcibly_overwrite=False):
    """
    Upload survey-answer or audio contents to S3 and queue each uploaded file for processing.

    file_paths_and_contents: dict mapping a slash-separated path with exactly 5 components
        (study id / user id / <ignored> / survey id / <ignored>) to a (contents, timestamp)
        pair.  timestamp must provide .timetuple(); entries where it does not are reported
        and skipped.
    data_type: required, "survey_answers" or "audio".
    forcibly_overwrite: when falsy, paths that already exist on S3 are reported and skipped.

    Raises Exception when data_type is None or when a generated timestamp is not 13 digits
    long, and KeyError when data_type is not one of the known keys.
    """
    if data_type is None:
        raise Exception("DATA TYPE!")

    # data_type -> (folder name used in the S3 path, file extension)
    upload_stream_map = {
        "survey_answers": ("surveyAnswers", "csv"),
        "audio": ("voiceRecording", "mp4"),
    }
    data_stream_string, file_extension = upload_stream_map[data_type]

    for timings_path, (contents, timestamp) in file_paths_and_contents.items():
        study_id_string, user_id, _, survey_id, _ = timings_path.split("/")

        try:
            time_tuple = timestamp.timetuple()
        except AttributeError:
            # timestamp is not a date/datetime-like object
            print("PROBLEM WITH TIMESTAMP FROM: %s" % timings_path)
            continue

        # whole seconds with "000" appended, i.e. a millisecond-resolution timestamp string
        timestamp_string = str(int(mktime(time_tuple))) + "000"
        if len(timestamp_string) != 13:
            raise Exception("LOL! No.")

        study_obj_id = Study(ObjectId(study_id_string))._id
        s3_file_path = "%s/%s/%s/%s/%s.%s" % (
            study_id_string, user_id, data_stream_string, survey_id,
            timestamp_string, file_extension,
        )

        if s3_list_files(s3_file_path):
            print("ALREADY_EXISTS: %s, %s" % (timings_path, s3_file_path))
            # truthiness test: the old `== False` comparison let None through as "overwrite"
            if not forcibly_overwrite:
                continue
        else:
            print("yay!:  %s" % s3_file_path)

        contents = contents.encode("utf8")  # maybe make this unicode-16?
        s3_upload(s3_file_path, contents, study_obj_id, raw_path=True)
        FileToProcess.append_file_for_processing(s3_file_path, study_obj_id, user_id)
def reindex_all_files_to_process():
    """
    Totally removes the FilesToProcess DB, deletes all chunked files on S3, clears the
    ChunksRegistry, and then adds all relevant files on S3 to the files-to-process registry.
    Finally runs process_file_chunks() on the rebuilt queue.

    Takes no arguments and returns nothing; progress is reported with print().
    """
    # NOTE(review): the lock is not released if any step below raises - confirm that leaving
    # it held after a half-finished purge is the intended behavior.
    FileProcessLock.lock()

    print("%s purging FilesToProcess: %s" % (datetime.now(), FilesToProcess.count()))
    FileToProcess.db().drop()
    print("%s purging existing ChunksRegistry %s" % (datetime.now(), ChunksRegistry.count()))
    ChunkRegistry.db().drop()

    pool = ThreadPool(CONCURRENT_NETWORK_OPS * 2)

    # delete every file under the chunks folder
    chunked_paths = s3_list_files(CHUNKS_FOLDER)
    print("%s deleting older chunked data: %s" % (datetime.now(), len(chunked_paths)))
    pool.map(s3_delete, chunked_paths)
    del chunked_paths

    # one S3 listing per study, using the study id as the key prefix
    print("%s pulling new files to process..." % datetime.now())
    files_lists = pool.map(s3_list_files, [str(s._id) for s in Studies()])

    print("putting new files to process...")
    study_total = Studies.count()
    # bound up front so the cleanup below cannot hit an unbound name when there are no studies
    study_files = None
    for index, study_files in enumerate(files_lists):
        print("%s %s of %s, %s files"
              % (datetime.now(), index + 1, study_total, len(study_files)))
        for file_path in study_files:
            if file_path[-4:] in PROCESSABLE_FILE_EXTENSIONS:
                # paths look like "<study id>/<user id>/..."
                FileToProcess.append_file_for_processing(
                    file_path,
                    ObjectId(file_path.split("/", 1)[0]),
                    file_path.split("/", 2)[1],
                )

    # drop the large listings before processing starts
    del files_lists, study_files
    pool.close()
    pool.terminate()

    print("%s processing data." % datetime.now())
    FileProcessLock.unlock()
    process_file_chunks()
def count_study_chunks():
    """
    Count the files under "CHUNKED_DATA" on S3, grouped by study.

    Returns a dict mapping study name -> number of chunked files whose path starts with that
    study's prefix.  If two studies share a name, only one of their counts is kept.
    """
    # "CHUNKED_DATA/" is 13 characters; the remaining 25 are presumably a 24-character
    # study id plus its trailing slash - TODO confirm.
    prefix_length = 38

    all_chunk_paths = s3_list_files("CHUNKED_DATA")

    unique_prefixes = set()
    for chunk_path in all_chunk_paths:
        unique_prefixes.add(chunk_path[:prefix_length])

    # the id is the path component just before the prefix's final slash
    ids_by_prefix = {}
    for prefix in unique_prefixes:
        ids_by_prefix[prefix] = ObjectId(prefix.split("/")[-2])

    names_by_prefix = {}
    for prefix, study_id in ids_by_prefix.items():
        names_by_prefix[prefix] = Study(_id=study_id).name

    counts_by_name = {}
    for prefix in unique_prefixes:
        matching = sum(1 for chunk_path in all_chunk_paths
                       if chunk_path[:prefix_length] == prefix)
        counts_by_name[names_by_prefix[prefix]] = matching

    return counts_by_name
def check_for_bad_chunks():
    """
    Walk every ChunkRegistry entry and count those that point at a missing S3 file.

    An entry is "bad" when its data_type is in CHUNKABLE_FILES and its chunk_path does not
    appear in the listing of "CHUNKED_DATA".  Only the count is printed; nothing is returned.
    """
    # a set makes the per-entry membership test cheap
    existing_paths = set(s3_list_files("CHUNKED_DATA"))

    missing = [
        registry_entry
        for registry_entry in ChunkRegistry.objects.all()
        if registry_entry.data_type in CHUNKABLE_FILES
        and registry_entry.chunk_path not in existing_paths
    ]

    print("bad chunks:", len(missing))
def reindex_all_files_to_process():
    """
    Wipe and rebuild the file-processing state (currently disabled).

    When enabled, the steps are: clear the FileToProcess table, clear the ChunkRegistry table,
    delete every chunked file on S3, re-register all relevant S3 files as FileToProcess
    entries, and finally re-chunk them with process_file_chunks().

    The function always raises on its first statement, so nothing below the guard runs.
    """
    raise Exception(
        "This code has not been tested since converting database backends, that means 2018"
    )

    # ---- unreachable until the guard above is removed ----

    # Purge the existing FileToProcess and ChunkRegistry rows while holding the lock
    FileProcessLock.lock()

    now = datetime.now()
    pending_total = FileToProcess.objects.count()
    print('{!s} purging FileToProcess: {:d}'.format(now, pending_total))
    FileToProcess.objects.all().delete()

    now = datetime.now()
    registry_total = ChunkRegistry.objects.count()
    print('{!s} purging ChunkRegistry: {:d}'.format(now, registry_total))
    ChunkRegistry.objects.all().delete()

    pool = ThreadPool(CONCURRENT_NETWORK_OPS * 2)

    # Remove every previously chunked data file from S3
    stale_chunks = s3_list_files(CHUNKS_FOLDER)
    print('{!s} deleting older chunked data: {:d}'.format(datetime.now(), len(stale_chunks)))
    pool.map(s3_delete, stale_chunks)
    del stale_chunks

    # List the S3 files of every study, one listing per study object id
    print('{!s} pulling new files to process...'.format(datetime.now()))
    study_object_ids = Study.objects.values_list('object_id', flat=True)
    files_lists = pool.map(s3_list_files, study_object_ids)

    # Register each processable file
    print("putting new files to process...")
    for position, study_files in enumerate(files_lists):
        progress = '{!s} {:d} of {:d}, {:d} files'.format(
            datetime.now(), position + 1, Study.objects.count(), len(study_files))
        print(progress)

        for file_path in study_files:
            if file_path[-4:] not in PROCESSABLE_FILE_EXTENSIONS:
                continue
            # paths look like "<study object id>/<patient id>/..."
            patient_id = file_path.split('/', 2)[1]
            matching_pks = Participant.objects.filter(patient_id=patient_id)
            participant_pk = matching_pks.values_list('pk', flat=True).get()
            FileToProcess.append_file_for_processing(
                file_path, file_path.split("/", 1)[0], participant_id=participant_pk)

    # Release the large listings, shut the pool down and give the lock back
    del files_lists, study_files
    pool.close()
    pool.terminate()
    FileProcessLock.unlock()

    # Re-chunk everything that was just queued
    print("{!s} processing data.".format(datetime.now()))
    process_file_chunks()
def grab_file_names(study_id, survey_id, user_id, number_points):
    """
    Return the sorted S3 paths of the last `number_points` survey-answer files.

    study_id, survey_id, user_id: identifiers used to build the S3 prefix
        "<study_id>/<user_id>/surveyAnswers/<survey_id>".  bytes values are decoded.
    number_points: how many files to return; may be an int, or a str/bytes holding an integer.
        Zero or a negative number yields an empty list without querying S3.

    The slice takes the tail of the listing, so "most recent" presumes that s3_list_files
    returns paths in ascending order - TODO confirm.
    """
    def _as_text(value):
        # Decode instead of coercing: str(b"abc") would produce "b'abc'", with the leading b
        # and the single quotes inside the resulting string.
        return value.decode() if isinstance(value, bytes) else value

    study_id = _as_text(study_id)
    survey_id = _as_text(survey_id)
    user_id = _as_text(user_id)

    # A decoded value is text, which cannot be used as a slice bound; make it an integer.
    number_points = int(_as_text(number_points))
    if number_points <= 0:
        # all_files[-0:] would be the WHOLE list, not an empty one
        return []

    all_files = s3_list_files("%s/%s/surveyAnswers/%s" % (study_id, user_id, survey_id))
    return sorted(all_files[-number_points:])
def count_study_chunks():
    """
    Count the chunked files on S3 per study.

    Returns a dict mapping study name -> number of files under "CHUNKED_DATA" whose path
    starts with that study's prefix.  The prefix -> name mapping is printed along the way.
    If two studies share a name, only one of their counts is kept.
    """
    from collections import Counter

    chunked_file_paths = s3_list_files("CHUNKED_DATA")

    # The file paths start with CHUNKED_DATA/[24-digit object ID]; together with the slash
    # after the id that is 38 characters.  Counting in a single pass avoids rescanning the
    # whole listing once per file.
    files_per_prefix = Counter(f[:38] for f in chunked_file_paths)

    # the object id is the path component just before the prefix's final slash
    study_prefix_to_name = {
        study_prefix: Study.objects.get(object_id=study_prefix.split("/")[-2]).name
        for study_prefix in files_per_prefix
    }
    print(study_prefix_to_name)

    return {
        study_prefix_to_name[study_prefix]: file_count
        for study_prefix, file_count in files_per_prefix.items()
    }
def reprocess_originals_from_chunk_path(cls, chunk_path):
    """
    Queue the original uploaded files behind one chunked file for reprocessing.

    chunk_path: "<CHUNKS_FOLDER>/<study object id>/<patient id>/<data stream>/<timestamp>.csv"

    Every file of the matching data stream whose timestamp lies within one hour after the
    chunk's timestamp is queued via cls.append_file_for_processing, plus the last file of the
    hour before it.  Files that cls already has an entry for are skipped.

    Raises Exception if chunk_path does not have exactly 5 components, is not inside
    CHUNKS_FOLDER, or if no matching files are found.
    """
    path_components = chunk_path.split("/")
    if len(path_components) != 5:
        raise Exception(
            "chunked file paths contain exactly 5 components separated by a slash."
        )

    chunk_files_text, study_obj_id, username, data_stream, timestamp = path_components
    if chunk_files_text != CHUNKS_FOLDER:
        raise Exception(
            "This is not a chunked file, it is not in the chunked data folder."
        )

    participant = Participant.objects.get(patient_id=username)

    # data stream names are truncated in chunk paths; map back to the full name
    full_data_stream = REVERSE_UPLOAD_FILE_TYPE_MAPPING[data_stream]
    is_identifiers = full_data_stream == IDENTIFIERS

    # identifiers paths do not end in a slash, the stream name is followed by an underscore
    splitter_end_char = '_' if is_identifiers else '/'
    file_prefix = "/".join((study_obj_id, username, full_data_stream)) + splitter_end_char
    print("searching:", file_prefix)

    # Remove the extension as a suffix.  str.strip(".csv") strips any of the characters
    # ".", "c", "s", "v" from BOTH ends, which only worked because timestamps contain none.
    timestamp_text = timestamp[:-len(".csv")] if timestamp.endswith(".csv") else timestamp

    # find all files with data from the appropriate time.
    dt_start = datetime.strptime(timestamp_text, API_TIME_FORMAT)
    dt_prev = dt_start - timedelta(hours=1)
    dt_end = dt_start + timedelta(hours=1)

    prior_hour_last_file = None
    file_paths_to_reprocess = []
    for s3_file_path in s3_list_files(file_prefix, as_generator=False):
        # the file name minus its 4-character extension is a numeric timestamp; it is used
        # as-is for identifiers files and divided by 1000 for every other stream
        file_timestamp = float(s3_file_path.rsplit(splitter_end_char)[-1][:-4])
        if not is_identifiers:
            file_timestamp /= 1000
        # NOTE(review): fromtimestamp() yields local time - confirm it matches the timezone
        # of the chunk timestamps.
        file_dt = datetime.fromtimestamp(file_timestamp)

        # we need the last file from the prior hour as it may have relevant data.  File
        # paths come back in ascending order, so it is the file right before the rest of
        # the data; just cache it.
        if dt_prev <= file_dt < dt_start:
            prior_hour_last_file = s3_file_path

        # and then every file within the relevant hour
        if dt_start <= file_dt <= dt_end:
            print("found:", s3_file_path)
            file_paths_to_reprocess.append(s3_file_path)

    # a "should be an unnecessary" safety check
    if prior_hour_last_file and prior_hour_last_file not in file_paths_to_reprocess:
        print("found:", prior_hour_last_file)
        file_paths_to_reprocess.append(prior_hour_last_file)

    # any prior-hour file has been appended by now, so an empty list means nothing matched
    if not file_paths_to_reprocess:
        raise Exception(
            f"did not find any matching files: '{chunk_path}' using prefix '{file_prefix}'"
        )

    for fp in file_paths_to_reprocess:
        if cls.objects.filter(s3_file_path=fp).exists():
            print(f"{fp} is already queued for processing")
            continue
        print(f"Adding {fp} as a file to reprocess.")
        cls.append_file_for_processing(fp, study_obj_id, participant=participant)
def grab_file_names(study_id, survey_id, user_id, number_points):
    """
    Return the sorted S3 paths of the last `number_points` survey-answer files.

    study_id, survey_id, user_id: identifiers used to build the S3 prefix
        "<study_id>/<user_id>/surveyAnswers/<survey_id>"; each is converted with str().
    number_points: how many files to return.  Zero or a negative number yields an empty
        list without querying S3.

    The slice takes the tail of the listing, so "most recent" presumes that s3_list_files
    returns paths in ascending order - TODO confirm.
    """
    if number_points <= 0:
        # all_files[-0:] would be the WHOLE list, not an empty one
        return []

    prefix = "%s/%s/surveyAnswers/%s" % (str(study_id), str(user_id), str(survey_id))
    all_files = s3_list_files(prefix)
    return sorted(all_files[-number_points:])
if __name__ == "__main__":
    # Load the __init__.py that sits next to this script before importing project modules.
    from os.path import abspath as _abspath
    import imp as _imp
    _script_folder = _abspath(__file__).rsplit('/', 1)[0]
    _current_folder_init = _script_folder + "/__init__.py"
    _imp.load_source("__init__", _current_folder_init)

    from libs.s3 import s3_list_files
    from db.data_access_models import FileToProcess, FilesToProcess
    from bson import ObjectId

    study_id_obj = ObjectId("5873fe38644ad7557b168e43")
    study_id_str = str(study_id_obj)

    # remove every queued entry belonging to this one hard-coded user
    for purgeable in FilesToProcess(user_id='prx7ap5x'):
        purgeable.remove()

    # queue csv files found among the first 501 paths listed for the study
    for i, path in enumerate(s3_list_files(study_id_str, as_generator=True)):
        if i > 500:
            break
        if not path.endswith('csv'):
            continue  # only csv files are of interest

        # paths look like "<study id>/<user id>/..."; drop the extension before splitting
        stem_parts = path[:-4].split('/')
        user_id = stem_parts[1]
        path_sans_study = path.split("/", 1)[1]

        if FileToProcess(s3_file_path=path):
            print("%s already in FilesToProcess." % path)
            continue

        FileToProcess.append_file_for_processing(path_sans_study, study_id_obj, user_id)