def main():
    """ One-off script: for every participant with pending files-to-process, print the
    participant, its first 250 FileToProcess objects, and the S3 contents of each one
    (as a retrieval smoke test). Assumes data_type 'gps' for the printed dict. """
    participants = Participant.objects.filter(files_to_process__isnull=False).distinct()
    for participant in participants:
        print(participant.as_native_python())
        print(participant.files_to_process.all()[0:250])
        # BUG FIX: this was `print len(...)` — a Python 2 print statement that is a
        # SyntaxError under Python 3, while every other print in this script uses print().
        print(len(participant.files_to_process.all()))
        for ftp_as_object in participant.files_to_process.all()[0:250]:
            ftp = ftp_as_object.as_dict()
            print(ftp)
            data_type = 'gps'
            ret = {
                'ftp': ftp,
                "data_type": data_type,
                'exception': None,
                "file_contents": "",
                "traceback": None
            }
            print(ftp['s3_file_path'] + "\ngetting data...")
            ret['file_contents'] = s3_retrieve(ftp['s3_file_path'], ftp["study"].object_id, raw_path=True)
            print("finished getting data")
            print(ret['file_contents'])
def batch_retrieve_for_processing(ftp_as_object):
    """ Used for mapping an s3_retrieve function. """
    # Flatten the FileToProcess into a plain dict so we can use __getattr__
    file_info = ftp_as_object.as_dict()
    kind = file_path_to_data_type(file_info['s3_file_path'])
    chunkable = kind in CHUNKABLE_FILES
    # Result dict consumed by the parent function.
    result = {
        'ftp': file_info,
        "data_type": kind,
        'exception': None,
        "file_contents": "",
        "traceback": None,
        'chunkable': chunkable,
    }
    if not chunkable:
        # We don't do anything with unchunkable data.
        result['file_contents'] = ""
        return result
    # Try to retrieve the file contents. If any errors are raised, store them
    # to be raised by the parent function.
    try:
        print(file_info['s3_file_path'] + "\ngetting data...")
        result['file_contents'] = s3_retrieve(
            file_info['s3_file_path'], file_info["study"].object_id, raw_path=True
        )
    except Exception as e:
        result['traceback'] = format_exc(e)
        result['exception'] = e
    return result
def get_survey_results(study_id, user_id, survey_id, number_points=7):
    """ Compiles 2 weeks (14 points) of config from s3 for a given patient into config
    points for displaying on the device.  Result is a list of lists, inner list[0] is
    the title/question text, inner list[1] is a list of y coordinates. """
    # Fetch the relevant answer files from s3 and parse each csv into a list of dicts.
    file_names = grab_file_names(study_id, survey_id, user_id, number_points)
    # note: we need to remove the prepended study id from file name strings,
    # that string is always 24 characters + the slash.
    parsed_surveys = [csv_to_dict(s3_retrieve(name[25:], study_id)) for name in file_names]
    question_data = compile_question_data(parsed_surveys)
    answers = pull_answers(parsed_surveys, question_data)
    # answers may be identical to question_data at this point.
    # Flatten into [question_text, y_coordinates] pairs that javascript can handle,
    # sorted for deterministic ordering.
    flattened = [
        [question_text, corresponding_answers]
        for question in sorted(answers.values())
        for question_text, corresponding_answers in question.items()
    ]
    return jsonify_survey_results(flattened)
def batch_retrieve_for_processing(ftp_as_object: FileToProcess) -> dict:
    """ Used for mapping an s3_retrieve function. """
    # Flatten the FileToProcess into a plain dict so we can use __getattr__
    file_info = ftp_as_object.as_dict()
    kind = file_path_to_data_type(file_info['s3_file_path'])
    # Result dict consumed by the parent function; chunkability is decided up front.
    result = {
        'ftp': file_info,
        "data_type": kind,
        'exception': None,
        "file_contents": "",
        "traceback": None,
        'chunkable': kind in CHUNKABLE_FILES,
    }
    # Try to retrieve the file contents. If any errors are raised, store them to be
    # raised by the parent function.
    try:
        print(file_info['s3_file_path'] + ", getting data...")
        result['file_contents'] = s3_retrieve(
            file_info['s3_file_path'], file_info["study"].object_id.encode(), raw_path=True
        )
    except Exception as e:
        traceback.print_exc()
        result['traceback'] = sys.exc_info()
        result['exception'] = e
    return result
def batch_retrieve_for_processing(ftp):
    """ Used for mapping an s3_retrieve function.
    Returns a dict containing the ftp, its data type, the downloaded file contents
    (for chunkable data types only), and any exception/traceback raised during the
    download so the parent function can re-raise it. """
    data_type = file_path_to_data_type(ftp['s3_file_path'])
    ret = {
        'ftp': ftp,
        "data_type": data_type,
        'exception': None,
        "file_contents": "",
        "traceback": None
    }
    if data_type in CHUNKABLE_FILES:
        ret['chunkable'] = True
        try:  # handle s3 errors
            # print ftp['s3_file_path'], "\ngetting data..."
            ret['file_contents'] = s3_retrieve(ftp['s3_file_path'], ftp["study_id"], raw_path=True)
            # print "finished getting data"
        except Exception as e:
            ret['traceback'] = format_exc(e)
            ret['exception'] = e
    else:
        # We don't do anything with unchunkable data.
        ret['chunkable'] = False
        # BUG FIX: this assigned `ret['data'] = ""` — a key nothing else uses; the dict
        # above and the sibling implementation of this function use 'file_contents'.
        ret['file_contents'] = ""
    return ret
def batch_retrieve_s3(chunk):
    """ Data is returned in the form (chunk_object, file_data). """
    # Translate the study primary key into the object_id used in raw s3 paths.
    study_object_id = Study.objects.get(id=chunk["study_id"]).object_id
    file_data = s3_retrieve(chunk["chunk_path"], study_object_id=study_object_id, raw_path=True)
    return chunk, file_data
def test_s3_upload(self):
    """ Round-trips a small payload through s3_upload/s3_retrieve for a fresh study. """
    study = Study(
        object_id='0vsvxgyx5skpI0ndOSAk1Duf',
        encryption_key='aabbccddefggiijjkklmnoppqqrrsstt',
        name='TEST_STUDY_FOR_TESTS',
    )
    study.save()
    test_data = "THIS IS TEST DATA"
    # Upload then immediately retrieve; the retrieved bytes must match exactly.
    s3_upload("test_file_for_tests.txt", test_data, study.object_id)
    retrieved = s3_retrieve("test_file_for_tests.txt", study.object_id)
    self.assertEqual(retrieved, test_data)
def get_client_private_key(patient_id, study_id):
    """Grabs a user's private key file from s3.

    Raises whatever s3_retrieve raises when the key is missing, after logging
    the attempted key paths for debugging."""
    key_pair_paths = construct_s3_key_paths(study_id, patient_id)
    try:
        key = s3_retrieve(key_pair_paths['private'], study_id, raw_path=True)
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; narrow it to Exception and re-raise as before.
        print('Could not find key {0} in {1}'.format('private', key_pair_paths))
        raise
    return encryption.import_RSA_key(key)
def batch_retrieve_pipeline_s3(pipeline_upload):
    """ Data is returned in the form (chunk_object, file_data). """
    contents = s3_retrieve(pipeline_upload["s3_path"], pipeline_upload["study_id"], raw_path=True)
    return pipeline_upload, contents
def download_file_contents(self) -> bytes or None:
    """ Handles network errors and updates state accordingly """
    # Attempt the S3 download. On failure, record the error details on this object
    # (for the parent function to inspect) and re-raise wrapped in SomeException.
    ftp = self.file_to_process
    try:
        self.file_contents = s3_retrieve(ftp.s3_file_path, ftp.study.object_id, raw_path=True)
    except Exception as e:
        traceback.print_exc()
        self.traceback = sys.exc_info()
        self.exception = e
        raise SomeException(e)
def main():
    """ One-off script: re-download every gps upload recorded for one participant on
    2018-07-07 and print the contents of each file. """
    # Hard-coded participant path fragment and study id for this investigation.
    tracked_uploads = UploadTracking.objects.filter(
        file_path__contains="jfcztjpm/gps", timestamp__contains="2018-07-07"
    )
    study_id = "trZYX7jrpn52YFdkqkmlfJqj"
    for ftp_as_object in tracked_uploads:
        ftp = ftp_as_object.as_dict()
        print(ftp)
        data_type = 'gps'
        ret = {
            'ftp': ftp,
            "data_type": data_type,
            'exception': None,
            "file_contents": "",
            "traceback": None
        }
        full_path = study_id + "/" + ftp['file_path']
        print(full_path + "\ngetting data...")
        ret['file_contents'] = s3_retrieve(full_path, study_id, raw_path=True)
        print("finished getting data")
        print(ret['file_contents'])
def check_and_update_number_of_observations(chunk):
    """ Recomputes and saves number_of_observations for one ChunkRegistry row.
    A voice recording always counts as exactly one observation; any other data type
    is a csv chunk on s3 whose data rows are counted. """
    if chunk.data_type == VOICE_RECORDING:
        chunk.number_of_observations = 1
        chunk.save()
    else:
        # Download the raw csv chunk from s3 to count its rows.
        file_contents = s3_retrieve(chunk.chunk_path,
                                    study_object_id=chunk.study.object_id,
                                    raw_path=True)
        # we want to make sure that there are no extraneous newline characters at the
        # end of the line. we want the line to end in exactly one newline character
        file_contents = file_contents.rstrip('\n') + '\n'
        # we subtract one to exclude the header line
        chunk.number_of_observations = file_contents.count('\n') - 1
        chunk.save()
    print('Updated chunk {0} with {1} observations'.format(
        chunk, chunk.number_of_observations))
    return
def batch_retrieve_pipeline_s3(pipeline_upload):
    """ Data is returned in the form (chunk_object, file_data). """
    # Resolve the study's object_id (used in s3 paths) from its database primary key.
    study_object_id = Study.objects.get(id=pipeline_upload.study_id).object_id
    return pipeline_upload, s3_retrieve(pipeline_upload.s3_path, study_object_id, raw_path=True)
def get_client_public_key_string(patient_id, study_id):
    """Grabs a user's public key string from s3."""
    paths = construct_s3_key_paths(study_id, patient_id)
    raw_key = s3_retrieve(paths['public'], study_id, raw_path=True)
    # Reformat the X509 key for consumption by the Java client.
    return encryption.prepare_X509_key_for_java(raw_key)
def upload_binified_data(binified_data, error_handler, survey_id_dict):
    """ Takes in binified csv data and handles uploading/downloading+updating
        older data to/from S3 for each chunk.
        Returns a set of concatenations that have succeeded and can be removed.
        Returns the number of failed FTPS so that we don't retry them.
        Raises any errors on the passed in ErrorHandler."""
    failed_ftps = set([])
    ftps_to_retire = set([])
    # Tuples of (chunk-or-chunk_params, chunk_path, compressed contents, study object id)
    # accumulated here and uploaded in one threadpool pass at the end.
    upload_these = []
    for data_bin, (data_rows_list, ftp_list) in binified_data.items():
        with error_handler:
            try:
                study_object_id, user_id, data_type, time_bin, original_header = data_bin
                # data_rows_list may be a generator; here it is evaluated
                rows = data_rows_list
                updated_header = convert_unix_to_human_readable_timestamps(original_header, rows)
                chunk_path = construct_s3_chunk_path(study_object_id, user_id, data_type, time_bin)
                if ChunkRegistry.objects.filter(chunk_path=chunk_path).exists():
                    # An existing chunk covers this time bin: merge the new rows into it.
                    chunk = ChunkRegistry.objects.get(chunk_path=chunk_path)
                    try:
                        s3_file_data = s3_retrieve(chunk_path, study_object_id, raw_path=True)
                    except ReadTimeoutError as e:
                        # The following check was correct for boto 2, still need to hit with boto3 test.
                        if "The specified key does not exist." == str(e):
                            # This error can only occur if the processing gets actually interrupted and
                            # data files fail to upload after DB entries are created.
                            # Encountered this condition 11pm feb 7 2016, cause unknown, there was
                            # no python stacktrace. Best guess is mongo blew up.
                            # If this happened, delete the ChunkRegistry and push this file upload
                            # to the next cycle
                            chunk.remove()  # this line of code is ancient and almost definitely wrong.
                            raise ChunkFailedToExist(
                                "chunk %s does not actually point to a file, deleting DB entry, should run correctly on next index."
                                % chunk_path)
                        raise  # Raise original error if not 404 s3 error
                    old_header, old_rows = csv_to_list(s3_file_data)
                    if old_header != updated_header:
                        # To handle the case where a file was on an hour boundary and placed in
                        # two separate chunks we need to raise an error in order to retire this
                        # file. If this happens AND ONE of the files DOES NOT have a header
                        # mismatch this may (will?) cause data duplication in the chunked file
                        # whenever the file processing occurs run.
                        raise HeaderMismatchException('%s\nvs.\n%s\nin\n%s'
                                                      % (old_header, updated_header, chunk_path))
                    old_rows = list(old_rows)
                    old_rows.extend(rows)
                    del rows
                    ensure_sorted_by_timestamp(old_rows)
                    new_contents = construct_csv_string(updated_header, old_rows)
                    del old_rows
                    upload_these.append((chunk, chunk_path, compress(new_contents), study_object_id))
                    del new_contents
                else:
                    # No existing chunk for this bin: build a fresh one.
                    ensure_sorted_by_timestamp(rows)
                    new_contents = construct_csv_string(updated_header, rows)
                    if data_type in SURVEY_DATA_FILES:
                        # We need to keep a mapping of files to survey ids, that is handled here.
                        survey_id_hash = study_object_id, user_id, data_type, original_header
                        survey_id = Survey.objects.filter(
                            object_id=survey_id_dict[survey_id_hash]).values_list("pk", flat=True).get()
                    else:
                        survey_id = None
                    # this object will eventually get **kwarg'd into ChunkRegistry.register_chunked_data
                    chunk_params = {
                        "study_id": Study.objects.filter(object_id=study_object_id).values_list("pk", flat=True).get(),
                        "participant_id": Participant.objects.filter(patient_id=user_id).values_list("pk", flat=True).get(),
                        "data_type": data_type,
                        "chunk_path": chunk_path,
                        "time_bin": time_bin,
                        "survey_id": survey_id
                    }
                    upload_these.append((chunk_params, chunk_path, compress(new_contents), study_object_id))
            except Exception as e:
                # Here we catch any exceptions that may have arisen, as well as the ones that we raised
                # ourselves (e.g. HeaderMismatchException). Whichever FTP we were processing when the
                # exception was raised gets added to the set of failed FTPs.
                failed_ftps.update(ftp_list)
                print(e)
                print("FAILED TO UPDATE: study_id:%s, user_id:%s, data_type:%s, time_bin:%s, header:%s "
                      % (study_object_id, user_id, data_type, time_bin, updated_header))
                raise
            else:
                # If no exception was raised, the FTP has completed processing. Add it to the set of
                # retireable (i.e. completed) FTPs.
                ftps_to_retire.update(ftp_list)
    # Upload all accumulated chunks concurrently; re-raise the first recorded error, if any.
    pool = ThreadPool(CONCURRENT_NETWORK_OPS)
    errors = pool.map(batch_upload, upload_these, chunksize=1)
    for err_ret in errors:
        if err_ret['exception']:
            print(err_ret['traceback'])
            raise err_ret['exception']
    pool.close()
    pool.terminate()
    # The things in ftps to retire that are not in failed ftps.
    # len(failed_ftps) will become the number of files to skip in the next iteration.
    return ftps_to_retire.difference(failed_ftps), len(failed_ftps)
def get_client_public_key(patient_id, study_id):
    """Grabs a user's public key file from s3."""
    paths = construct_s3_key_paths(study_id, patient_id)
    raw_key = s3_retrieve(paths['public'], study_id, raw_path=True)
    return encryption.import_RSA_key(raw_key)
def batch_retrieve_s3(chunk):
    """ Data is returned in the form (chunk_object, file_data). """
    file_data = s3_retrieve(chunk["chunk_path"], chunk["study_id"], raw_path=True)
    return chunk, file_data
def upload_binified_data(binified_data, error_handler, survey_id_dict):
    """ Takes in binified csv data and handles uploading/downloading+updating
        older data to/from S3 for each chunk.
        Returns a set of concatenations that have succeeded and can be removed.
        Returns the number of failed FTPS so that we don't retry them.
        Raises any errors on the passed in ErrorHandler.

        NOTE(review): this is the legacy Python 2 variant (iteritems, e.message,
        str.encode("zip"), boto 2's S3ResponseError) — do not run under Python 3. """
    failed_ftps = set([])
    ftps_to_retire = set([])
    # Tuples of (chunk-or-chunk_params, chunk_path, zip-compressed contents, study id)
    # accumulated here and uploaded in one threadpool pass at the end.
    upload_these = []
    for data_bin, (data_rows_deque, ftp_deque) in binified_data.iteritems():
        # print 3
        with error_handler:
            try:
                # print 4
                study_id, user_id, data_type, time_bin, original_header = data_bin
                # print 5
                # data_rows_deque may be a generator; here it is evaluated
                rows = list(data_rows_deque)
                updated_header = convert_unix_to_human_readable_timestamps(
                    original_header, rows)
                # print 6
                chunk_path = construct_s3_chunk_path(study_id, user_id, data_type, time_bin)
                # print 7
                old_chunk_exists = ChunkRegistry.objects.filter(
                    chunk_path=chunk_path).exists()
                if old_chunk_exists:
                    # An existing chunk covers this time bin: merge the new rows into it.
                    chunk = ChunkRegistry.objects.get(chunk_path=chunk_path)
                    try:
                        # print 8
                        # print chunk_path
                        s3_file_data = s3_retrieve(chunk_path, study_id, raw_path=True)
                        # print "finished s3 retrieve"
                    except S3ResponseError as e:
                        # print 9
                        # The following check is correct for boto version 2.38.0
                        if "The specified key does not exist." == e.message:
                            # This error can only occur if the processing gets actually interrupted and
                            # data files fail to upload after DB entries are created.
                            # Encountered this condition 11pm feb 7 2016, cause unknown, there was
                            # no python stacktrace. Best guess is mongo blew up.
                            # If this happened, delete the ChunkRegistry and push this file upload
                            # to the next cycle
                            chunk.remove()
                            raise ChunkFailedToExist(
                                "chunk %s does not actually point to a file, deleting DB entry, should run correctly on next index."
                                % chunk_path)
                        raise  # Raise original error if not 404 s3 error
                    # print 10
                    old_header, old_rows = csv_to_list(s3_file_data)
                    if old_header != updated_header:
                        # To handle the case where a file was on an hour boundary and placed in
                        # two separate chunks we need to raise an error in order to retire this
                        # file. If this happens AND ONE of the files DOES NOT have a header
                        # mismatch this may (will?) cause data duplication in the chunked file
                        # whenever the file processing occurs run.
                        raise HeaderMismatchException(
                            '%s\nvs.\n%s\nin\n%s' % (old_header, updated_header, chunk_path))
                    # print 11
                    old_rows = [_ for _ in old_rows]
                    # print "11a"
                    # This is O(1), which is why we use a deque (double-ended queue)
                    old_rows.extend(rows)
                    # print "11b"
                    del rows
                    # print 12
                    ensure_sorted_by_timestamp(old_rows)
                    # print 13
                    if data_type == SURVEY_TIMINGS:
                        # print "13a"
                        new_contents = construct_utf_safe_csv_string(
                            updated_header, old_rows)
                    else:
                        # print "13b"
                        new_contents = construct_csv_string(
                            updated_header, old_rows)
                    del old_rows
                    # print 14
                    upload_these.append((chunk, chunk_path, new_contents.encode("zip"), study_id))
                    del new_contents
                else:
                    # No existing chunk for this bin: build a fresh one.
                    # print "7a"
                    ensure_sorted_by_timestamp(rows)
                    # print "7b"
                    if data_type == SURVEY_TIMINGS:
                        # print "7ba"
                        new_contents = construct_utf_safe_csv_string(
                            updated_header, rows)
                    else:
                        # print "7bc"
                        new_contents = construct_csv_string(
                            updated_header, rows)
                    # print "7c"
                    if data_type in SURVEY_DATA_FILES:
                        # We need to keep a mapping of files to survey ids, that is handled here.
                        # print "7da"
                        survey_id_hash = study_id, user_id, data_type, original_header
                        survey_id = survey_id_dict[survey_id_hash]
                        # print survey_id_hash
                    else:
                        # print "7db"
                        survey_id = None
                    # print "7e"
                    chunk_params = {
                        "study_id": study_id,
                        "user_id": user_id,
                        "data_type": data_type,
                        "chunk_path": chunk_path,
                        "time_bin": time_bin,
                        "survey_id": survey_id
                    }
                    upload_these.append((chunk_params, chunk_path, new_contents.encode("zip"), study_id))
            except Exception as e:
                # Here we catch any exceptions that may have arisen, as well as the ones that we raised
                # ourselves (e.g. HeaderMismatchException). Whichever FTP we were processing when the
                # exception was raised gets added to the set of failed FTPs.
                failed_ftps.update(ftp_deque)
                print(e)
                print(
                    "failed to update: study_id:%s, user_id:%s, data_type:%s, time_bin:%s, header:%s "
                    % (study_id, user_id, data_type, time_bin, updated_header))
                raise
            else:
                # If no exception was raised, the FTP has completed processing. Add it to the set of
                # retireable (i.e. completed) FTPs.
                ftps_to_retire.update(ftp_deque)
    # Upload all accumulated chunks concurrently; re-raise the first recorded error, if any.
    pool = ThreadPool(CONCURRENT_NETWORK_OPS)
    errors = pool.map(batch_upload, upload_these, chunksize=1)
    for err_ret in errors:
        if err_ret['exception']:
            print(err_ret['traceback'])
            raise err_ret['exception']
    pool.close()
    pool.terminate()
    # The things in ftps to retire that are not in failed ftps.
    # len(failed_ftps) will become the number of files to skip in the next iteration.
    return ftps_to_retire.difference(failed_ftps), len(failed_ftps)
def batch_retrieve(parameters):
    """ Maps a parameter tuple onto s3_retrieve; returns (decoded contents, s3 path). """
    # need to handle parameters, ensure unicode
    contents = s3_retrieve(*parameters, raw_path=True).decode("utf8")
    return contents, parameters[0]
def s3_retrieve(self):
    """ Downloads this chunk's file contents from s3 using its study's object_id. """
    chunk_path, study_object_id = self.chunk_path, self.study.object_id
    return s3_retrieve(chunk_path, study_object_id)