Example #1
def main():
    participants = Participant.objects.filter(
        files_to_process__isnull=False).distinct()
    for participant in participants:
        print(participant.as_native_python())
        print(participant.files_to_process.all()[0:250])
        print(len(participant.files_to_process.all()))
        for each in participant.files_to_process.all()[0:250]:
            ftp_as_object = each
            ftp = ftp_as_object.as_dict()
            print(ftp)
            data_type = 'gps'
            ret = {
                'ftp': ftp,
                "data_type": data_type,
                'exception': None,
                "file_contents": "",
                "traceback": None
            }
            print(ftp['s3_file_path'] + "\ngetting data...")
            ret['file_contents'] = s3_retrieve(ftp['s3_file_path'],
                                               ftp["study"].object_id,
                                               raw_path=True)
            print("finished getting data")
            print(ret['file_contents'])
Example #2
def batch_retrieve_for_processing(ftp_as_object):
    """ Used for mapping an s3_retrieve function. """
    # Convert the ftp object to a dict so we can use __getattr__
    ftp = ftp_as_object.as_dict()

    data_type = file_path_to_data_type(ftp['s3_file_path'])

    # Create a dictionary to populate and return
    ret = {
        'ftp': ftp,
        "data_type": data_type,
        'exception': None,
        "file_contents": "",
        "traceback": None
    }
    if data_type in CHUNKABLE_FILES:
        ret['chunkable'] = True
        # Try to retrieve the file contents. If any errors are raised, store them to be raised by the parent function
        try:
            print(ftp['s3_file_path'] + "\ngetting data...")
            ret['file_contents'] = s3_retrieve(ftp['s3_file_path'],
                                               ftp["study"].object_id,
                                               raw_path=True)
        except Exception as e:
            ret['traceback'] = format_exc()
            ret['exception'] = e
    else:
        # We don't do anything with unchunkable data.
        ret['chunkable'] = False
        ret['file_contents'] = ""
    return ret
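The docstring above says this function exists to be mapped; the call site is not among these examples, but the ThreadPool(CONCURRENT_NETWORK_OPS) / pool.map pattern in Example #15 suggests how it is driven. A minimal sketch, assuming an iterable of FileToProcess objects and a small concurrency value chosen here purely for illustration:

from multiprocessing.pool import ThreadPool

def retrieve_all(files_to_process, concurrency=4):
    """ Sketch only: fan batch_retrieve_for_processing out over a thread pool
    and re-raise any exception a worker captured in its return dict. """
    pool = ThreadPool(concurrency)
    results = pool.map(batch_retrieve_for_processing, files_to_process, chunksize=1)
    pool.close()
    pool.terminate()
    for ret in results:
        if ret['exception']:
            print(ret['traceback'])
            raise ret['exception']
    return results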
Example #3
def get_survey_results(study_id, user_id, survey_id, number_points=7):
    """ Compiles 2 weeks (14 points) of config from s3 for a given patient into config points for
    displaying on the device.
    Result is a list of lists, inner list[0] is the title/question text, inner list[1] is a
    list of y coordinates. """
    # Get files from s3 for user answers, convert each csv_file to a list of dicts,
    # pull the questions and corresponding answers.
    files = grab_file_names(study_id, survey_id, user_id, number_points)
    # note: we need to remove the prepended study id from file name strings,
    # that string is always 24 characters + the slash.
    surveys = [
        csv_to_dict(s3_retrieve(file_name[25:], study_id))
        for file_name in files
    ]
    all_questions = compile_question_data(surveys)
    all_answers = pull_answers(surveys, all_questions)
    # all answers may be identical to all questions at this point.
    # turn the data into a list of lists that javascript can actually handle.
    questions_answers = sorted(
        all_answers.values())  # pulls out question text and answers.
    result = []
    for question in questions_answers:  # maps question answers to data points
        for question_text, corresponding_answers in question.items():
            result.append([question_text, corresponding_answers])
    return jsonify_survey_results(result)
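The file_name[25:] slice above relies on the arithmetic in the comment: a 24-character study object id plus one slash is 25 characters of prefix. A tiny check using the 24-character object id from Example #7; the rest of the path is made up for illustration:

file_name = "0vsvxgyx5skpI0ndOSAk1Duf/patient1/surveyAnswers/abc123/2016-01-01 00_00_00.csv"
assert len(file_name.split("/")[0]) == 24  # the prepended study object id
print(file_name[25:])  # -> patient1/surveyAnswers/abc123/2016-01-01 00_00_00.csv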
Example #4
def batch_retrieve_for_processing(ftp_as_object: FileToProcess) -> dict:
    """ Used for mapping an s3_retrieve function. """
    # Convert the ftp object to a dict so we can use __getattr__
    ftp = ftp_as_object.as_dict()

    data_type = file_path_to_data_type(ftp['s3_file_path'])

    # Create a dictionary to populate and return
    ret = {
        'ftp': ftp,
        "data_type": data_type,
        'exception': None,
        "file_contents": "",
        "traceback": None,
        'chunkable': data_type in CHUNKABLE_FILES,
    }

    # Try to retrieve the file contents. If any errors are raised, store them to be raised by the
    # parent function
    try:
        print(ftp['s3_file_path'] + ", getting data...")
        ret['file_contents'] = s3_retrieve(ftp['s3_file_path'],
                                           ftp["study"].object_id.encode(),
                                           raw_path=True)
    except Exception as e:
        traceback.print_exc()
        ret['traceback'] = sys.exc_info()
        ret['exception'] = e
    return ret
Example #5
def batch_retrieve_for_processing(ftp):
    """ Used for mapping an s3_retrieve function. """
    data_type = file_path_to_data_type(ftp['s3_file_path'])
    ret = {
        'ftp': ftp,
        "data_type": data_type,
        'exception': None,
        "file_contents": "",
        "traceback": None
    }
    if data_type in CHUNKABLE_FILES:
        ret['chunkable'] = True
        try:  #handle s3 errors
            # print ftp['s3_file_path'], "\ngetting data..."
            ret['file_contents'] = s3_retrieve(ftp['s3_file_path'],
                                               ftp["study_id"],
                                               raw_path=True)
            # print "finished getting data"
        except Exception as e:
            ret['traceback'] = format_exc()
            ret['exception'] = e
    else:
        #We don't do anything with unchunkable data.
        ret['chunkable'] = False
        ret['data'] = ""
    return ret
Example #6
def batch_retrieve_s3(chunk):
    """ Data is returned in the form (chunk_object, file_data). """
    return chunk, s3_retrieve(
        chunk["chunk_path"],
        study_object_id=Study.objects.get(id=chunk["study_id"]).object_id,
        raw_path=True
    )
Example #7
def test_s3_upload(self):
    study = Study(object_id='0vsvxgyx5skpI0ndOSAk1Duf',
                  encryption_key='aabbccddefggiijjkklmnoppqqrrsstt',
                  name='TEST_STUDY_FOR_TESTS')
    study.save()
    test_data = "THIS IS TEST DATA"
    s3_upload("test_file_for_tests.txt", test_data, study.object_id)
    s3_data = s3_retrieve("test_file_for_tests.txt", study.object_id)
    self.assertEqual(s3_data, test_data)
Example #8
def get_client_private_key(patient_id, study_id):
    """Grabs a user's private key file from s3."""
    key_pair_paths = construct_s3_key_paths(study_id, patient_id)
    try:
        key = s3_retrieve(key_pair_paths['private'], study_id, raw_path=True)
    except:
        print('Could not find key {0} in {1}'.format('private',
                                                     key_pair_paths))
        raise

    return encryption.import_RSA_key(key)
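construct_s3_key_paths itself is not shown on this page; the callers here and in Examples #14 and #16 only rely on it returning a dict with 'private' and 'public' entries that are raw S3 paths (hence raw_path=True at the call sites). A hypothetical stand-in that makes that shape explicit; the actual path layout is an assumption:

def construct_s3_key_paths_sketch(study_id, patient_id):
    # Hypothetical illustration only; the real helper is not included in these examples.
    prefix = "{0}/keys/{1}".format(study_id, patient_id)  # assumed layout
    return {
        'private': prefix + "/private_key",
        'public': prefix + "/public_key",
    }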
Example #9
def batch_retrieve_pipeline_s3(pipeline_upload):
    """ Data is returned in the form (chunk_object, file_data). """
    return pipeline_upload, s3_retrieve(pipeline_upload["s3_path"],
                                        pipeline_upload["study_id"],
                                        raw_path=True)


# class dummy_threadpool():
#     def imap_unordered(self, *args, **kwargs):  # the existence of that self variable is key
#         # we actually want to cut off any threadpool args, which is conveniently easy because map does not use kwargs!
#         return map(*args)
#     def terminate(self): pass
#     def close(self): pass
Example #10
def download_file_contents(self) -> bytes or None:
    """ Handles network errors and updates state accordingly """
    # Try to retrieve the file contents. If any errors are raised, store them to be raised by the
    # parent function
    try:
        self.file_contents = s3_retrieve(
            self.file_to_process.s3_file_path,
            self.file_to_process.study.object_id,
            raw_path=True
        )
    except Exception as e:
        traceback.print_exc()
        self.traceback = sys.exc_info()
        self.exception = e
        raise SomeException(e)
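The method above records the failure on the instance before re-raising, so a caller can inspect the stored traceback and exception afterwards. A hedged sketch of that usage; "processor" stands for whatever object exposes download_file_contents(), and SomeException is the type raised in the snippet:

def report_download(processor):
    # Sketch only: consume the state that download_file_contents() leaves behind.
    try:
        processor.download_file_contents()
    except SomeException:
        print(processor.traceback)   # the sys.exc_info() tuple stored by the method
        print(processor.exception)   # the original exception object
        return None
    return processor.file_contents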
Example #11
def main():
    files = UploadTracking.objects.filter(file_path__contains="jfcztjpm/gps",
                                          timestamp__contains="2018-07-07")
    study_id = "trZYX7jrpn52YFdkqkmlfJqj"
    for each in files:
        ftp_as_object = each
        ftp = ftp_as_object.as_dict()
        print(ftp)
        data_type = 'gps'
        ret = {
            'ftp': ftp,
            "data_type": data_type,
            'exception': None,
            "file_contents": "",
            "traceback": None
        }
        print(study_id + "/" + ftp['file_path'] + "\ngetting data...")
        ret['file_contents'] = s3_retrieve(study_id + "/" + ftp['file_path'],
                                           study_id,
                                           raw_path=True)
        print("finished getting data")
        print(ret['file_contents'])
Example #12
def check_and_update_number_of_observations(chunk):

    if chunk.data_type == VOICE_RECORDING:
        chunk.number_of_observations = 1
        chunk.save()
    else:
        file_contents = s3_retrieve(chunk.chunk_path,
                                    study_object_id=chunk.study.object_id,
                                    raw_path=True)

        # we want to make sure that there are no extraneous newline characters at the
        # end of the line. we want the line to end in exactly one newline character
        file_contents = file_contents.rstrip('\n') + '\n'

        # we subtract one to exclude the header line
        chunk.number_of_observations = file_contents.count('\n') - 1
        chunk.save()

    print('Updated chunk {0} with {1} observations'.format(
        chunk, chunk.number_of_observations))

    return
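The rstrip/count arithmetic above is easy to sanity-check on a small CSV string: normalising to exactly one trailing newline means every data row contributes exactly one '\n', and subtracting one excludes the header. Illustrative data only:

file_contents = "timestamp,accuracy,latitude,longitude\n1,2,3,4\n5,6,7,8\n\n\n"
file_contents = file_contents.rstrip('\n') + '\n'
print(file_contents.count('\n') - 1)  # 2 observations, header line excluded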
Example #13
def batch_retrieve_pipeline_s3(pipeline_upload):
    """ Data is returned in the form (chunk_object, file_data). """
    study = Study.objects.get(id=pipeline_upload.study_id)
    return pipeline_upload, s3_retrieve(pipeline_upload.s3_path,
                                        study.object_id,
                                        raw_path=True)
Example #14
def get_client_public_key_string(patient_id, study_id):
    """Grabs a user's public key string from s3."""
    key_pair_paths = construct_s3_key_paths(study_id, patient_id)
    key_string = s3_retrieve(key_pair_paths['public'], study_id, raw_path=True)
    return encryption.prepare_X509_key_for_java(key_string)
Example #15
def upload_binified_data(binified_data, error_handler, survey_id_dict):
    """ Takes in binified csv data and handles uploading/downloading+updating
        older data to/from S3 for each chunk.
        Returns a set of concatenations that have succeeded and can be removed.
        Returns the number of failed FTPS so that we don't retry them.
        Raises any errors on the passed in ErrorHandler."""
    failed_ftps = set([])
    ftps_to_retire = set([])
    upload_these = []
    for data_bin, (data_rows_list, ftp_list) in binified_data.items():
        with error_handler:
            try:
                study_object_id, user_id, data_type, time_bin, original_header = data_bin
                # data_rows_list may be a generator; evaluate it into a list here
                rows = list(data_rows_list)
                updated_header = convert_unix_to_human_readable_timestamps(original_header, rows)
                chunk_path = construct_s3_chunk_path(study_object_id, user_id, data_type, time_bin)

                if ChunkRegistry.objects.filter(chunk_path=chunk_path).exists():
                    chunk = ChunkRegistry.objects.get(chunk_path=chunk_path)
                    try:
                        s3_file_data = s3_retrieve(chunk_path, study_object_id, raw_path=True)
                    except ReadTimeoutError as e:
                        # The following check was correct for boto 2; it still needs to be verified against boto3.
                        if "The specified key does not exist." == str(e):
                            # This error can only occur if the processing gets actually interrupted and
                            # data files fail to upload after DB entries are created.
                            # Encountered this condition 11pm feb 7 2016, cause unknown, there was
                            # no python stacktrace.  Best guess is mongo blew up.
                            # If this happened, delete the ChunkRegistry and push this file upload to the next cycle
                            chunk.remove()  # this line of code is ancient and almost definitely wrong.
                            raise ChunkFailedToExist("chunk %s does not actually point to a file, deleting DB entry, should run correctly on next index." % chunk_path)
                        raise  # Raise original error if not 404 s3 error

                    old_header, old_rows = csv_to_list(s3_file_data)

                    if old_header != updated_header:
                        # To handle the case where a file landed on an hour boundary and was placed
                        # in two separate chunks, we need to raise an error in order to retire this
                        # file. If this happens AND one of the files does NOT have a header mismatch,
                        # this may (will?) cause data duplication in the chunked file the next time
                        # file processing runs.
                        raise HeaderMismatchException('%s\nvs.\n%s\nin\n%s' %
                                                      (old_header, updated_header, chunk_path) )
                    old_rows = list(old_rows)
                    old_rows.extend(rows)
                    del rows
                    ensure_sorted_by_timestamp(old_rows)
                    new_contents = construct_csv_string(updated_header, old_rows)
                    del old_rows

                    upload_these.append((chunk, chunk_path, compress(new_contents), study_object_id))
                    del new_contents
                else:
                    ensure_sorted_by_timestamp(rows)
                    new_contents = construct_csv_string(updated_header, rows)
                    if data_type in SURVEY_DATA_FILES:
                        # We need to keep a mapping of files to survey ids, that is handled here.
                        survey_id_hash = study_object_id, user_id, data_type, original_header
                        survey_id = Survey.objects.filter(
                            object_id=survey_id_dict[survey_id_hash]).values_list("pk", flat=True).get()
                    else:
                        survey_id = None

                    # this object will eventually get **kwarg'd into ChunkRegistry.register_chunked_data
                    chunk_params = {
                        "study_id": Study.objects.filter(object_id=study_object_id).values_list("pk", flat=True).get(),
                        "participant_id": Participant.objects.filter(patient_id=user_id).values_list("pk", flat=True).get(),
                        "data_type": data_type,
                        "chunk_path": chunk_path,
                        "time_bin": time_bin,
                        "survey_id": survey_id
                    }

                    upload_these.append((chunk_params, chunk_path, compress(new_contents), study_object_id))
            except Exception as e:
                # Here we catch any exceptions that may have arisen, as well as the ones that we raised
                # ourselves (e.g. HeaderMismatchException). Whichever FTP we were processing when the
                # exception was raised gets added to the set of failed FTPs.
                failed_ftps.update(ftp_list)
                print(e)
                print("FAILED TO UPDATE: study_id:%s, user_id:%s, data_type:%s, time_bin:%s, header:%s "
                      % (study_object_id, user_id, data_type, time_bin, updated_header))
                raise

            else:
                # If no exception was raised, the FTP has completed processing. Add it to the set of
                # retireable (i.e. completed) FTPs.
                ftps_to_retire.update(ftp_list)

    pool = ThreadPool(CONCURRENT_NETWORK_OPS)
    errors = pool.map(batch_upload, upload_these, chunksize=1)
    for err_ret in errors:
        if err_ret['exception']:
            print(err_ret['traceback'])
            raise err_ret['exception']

    pool.close()
    pool.terminate()
    # The things in ftps to retire that are not in failed ftps.
    # len(failed_ftps) will become the number of files to skip in the next iteration.
    return ftps_to_retire.difference(failed_ftps), len(failed_ftps)
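batch_upload is not included among these examples; the only contract the error loop above depends on is that each call returns a dict with 'exception' and 'traceback' keys. A minimal sketch of a wrapper that satisfies that contract, using the s3_upload signature from Example #7 and the tuple layout of upload_these; everything else, including whether a raw-path flag is needed, is an assumption:

from traceback import format_exc

def batch_upload_sketch(upload):
    # Sketch only: mirrors the error-dict contract checked by upload_binified_data.
    ret = {'exception': None, 'traceback': None}
    try:
        chunk_or_params, chunk_path, compressed_contents, study_object_id = upload
        s3_upload(chunk_path, compressed_contents, study_object_id)
        # Creating or updating the ChunkRegistry entry from chunk_or_params is
        # omitted here; the real batch_upload also handles that step.
    except Exception as e:
        ret['traceback'] = format_exc()
        ret['exception'] = e
    return ret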
Example #16
def get_client_public_key(patient_id, study_id):
    """Grabs a user's public key file from s3."""
    key_pair_paths = construct_s3_key_paths(study_id, patient_id)
    key = s3_retrieve(key_pair_paths['public'], study_id, raw_path=True)
    return encryption.import_RSA_key(key)
Example #17
def batch_retrieve_s3(chunk):
    """ Data is returned in the form (chunk_object, file_data). """
    return chunk, s3_retrieve(chunk["chunk_path"],
                              chunk["study_id"],
                              raw_path=True)
Example #18
def upload_binified_data(binified_data, error_handler, survey_id_dict):
    """ Takes in binified csv data and handles uploading/downloading+updating
        older data to/from S3 for each chunk.
        Returns a set of concatenations that have succeeded and can be removed.
        Returns the number of failed FTPS so that we don't retry them.
        Raises any errors on the passed in ErrorHandler."""
    failed_ftps = set([])
    ftps_to_retire = set([])
    upload_these = []
    for data_bin, (data_rows_deque, ftp_deque) in binified_data.iteritems():
        # print 3
        with error_handler:
            try:
                # print 4
                study_id, user_id, data_type, time_bin, original_header = data_bin
                # print 5
                # data_rows_deque may be a generator; here it is evaluated
                rows = list(data_rows_deque)
                updated_header = convert_unix_to_human_readable_timestamps(
                    original_header, rows)
                # print 6
                chunk_path = construct_s3_chunk_path(study_id, user_id,
                                                     data_type, time_bin)
                # print 7
                old_chunk_exists = ChunkRegistry.objects.filter(
                    chunk_path=chunk_path).exists()
                if old_chunk_exists:
                    chunk = ChunkRegistry.objects.get(chunk_path=chunk_path)
                    try:
                        # print 8
                        # print chunk_path
                        s3_file_data = s3_retrieve(chunk_path,
                                                   study_id,
                                                   raw_path=True)
                        # print "finished s3 retrieve"
                    except S3ResponseError as e:
                        # print 9
                        # The following check is correct for boto version 2.38.0
                        if "The specified key does not exist." == e.message:
                            # This error can only occur if the processing gets actually interrupted and
                            # data files fail to upload after DB entries are created.
                            # Encountered this condition 11pm feb 7 2016, cause unknown, there was
                            # no python stacktrace.  Best guess is mongo blew up.
                            # If this happened, delete the ChunkRegistry and push this file upload to the next cycle
                            chunk.remove()
                            raise ChunkFailedToExist(
                                "chunk %s does not actually point to a file, deleting DB entry, should run correctly on next index."
                                % chunk_path)
                        raise  # Raise original error if not 404 s3 error
                    # print 10
                    old_header, old_rows = csv_to_list(s3_file_data)
                    if old_header != updated_header:
                        # To handle the case where a file landed on an hour boundary and was placed
                        # in two separate chunks, we need to raise an error in order to retire this
                        # file. If this happens AND one of the files does NOT have a header mismatch,
                        # this may (will?) cause data duplication in the chunked file the next time
                        # file processing runs.
                        raise HeaderMismatchException(
                            '%s\nvs.\n%s\nin\n%s' %
                            (old_header, updated_header, chunk_path))
                    # print 11
                    old_rows = [_ for _ in old_rows]
                    # print "11a"
                    # This is O(1), which is why we use a deque (double-ended queue)
                    old_rows.extend(rows)
                    # print "11b"
                    del rows
                    # print 12
                    ensure_sorted_by_timestamp(old_rows)
                    # print 13

                    if data_type == SURVEY_TIMINGS:
                        # print "13a"
                        new_contents = construct_utf_safe_csv_string(
                            updated_header, old_rows)
                    else:
                        # print "13b"
                        new_contents = construct_csv_string(
                            updated_header, old_rows)
                    del old_rows
                    # print 14
                    upload_these.append((chunk, chunk_path,
                                         new_contents.encode("zip"), study_id))
                    del new_contents
                else:
                    # print "7a"
                    ensure_sorted_by_timestamp(rows)
                    # print "7b"
                    if data_type == SURVEY_TIMINGS:
                        # print "7ba"
                        new_contents = construct_utf_safe_csv_string(
                            updated_header, rows)
                    else:
                        # print "7bc"
                        new_contents = construct_csv_string(
                            updated_header, rows)
                    # print "7c"
                    if data_type in SURVEY_DATA_FILES:
                        # We need to keep a mapping of files to survey ids, that is handled here.
                        # print "7da"
                        survey_id_hash = study_id, user_id, data_type, original_header
                        survey_id = survey_id_dict[survey_id_hash]
                        # print survey_id_hash
                    else:
                        # print "7db"
                        survey_id = None
                    # print "7e"
                    chunk_params = {
                        "study_id": study_id,
                        "user_id": user_id,
                        "data_type": data_type,
                        "chunk_path": chunk_path,
                        "time_bin": time_bin,
                        "survey_id": survey_id
                    }
                    upload_these.append((chunk_params, chunk_path,
                                         new_contents.encode("zip"), study_id))
            except Exception as e:
                # Here we catch any exceptions that may have arisen, as well as the ones that we raised
                # ourselves (e.g. HeaderMismatchException). Whichever FTP we were processing when the
                # exception was raised gets added to the set of failed FTPs.
                failed_ftps.update(ftp_deque)
                print(e)
                print(
                    "failed to update: study_id:%s, user_id:%s, data_type:%s, time_bin:%s, header:%s "
                    % (study_id, user_id, data_type, time_bin, updated_header))
                raise
            else:
                # If no exception was raised, the FTP has completed processing. Add it to the set of
                # retireable (i.e. completed) FTPs.
                ftps_to_retire.update(ftp_deque)

    pool = ThreadPool(CONCURRENT_NETWORK_OPS)
    errors = pool.map(batch_upload, upload_these, chunksize=1)
    for err_ret in errors:
        if err_ret['exception']:
            print(err_ret['traceback'])
            raise err_ret['exception']

    pool.close()
    pool.terminate()
    # The things in ftps to retire that are not in failed ftps.
    # len(failed_ftps) will become the number of files to skip in the next iteration.
    return ftps_to_retire.difference(failed_ftps), len(failed_ftps)
Example #19
def batch_retrieve(parameters):  # need to handle parameters, ensure unicode
    return s3_retrieve(*parameters, raw_path=True).decode("utf8"), parameters[0]
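batch_retrieve above takes its s3_retrieve arguments as a single tuple, which is exactly the shape that imap_unordered feeds a worker (the commented-out dummy_threadpool stub under Example #9 hints at the same interface). A hedged usage sketch with placeholder paths and the object id from Example #7:

from multiprocessing.pool import ThreadPool

work = [
    ("some/raw/s3/path_1", "0vsvxgyx5skpI0ndOSAk1Duf"),  # placeholder values
    ("some/raw/s3/path_2", "0vsvxgyx5skpI0ndOSAk1Duf"),
]
pool = ThreadPool(2)
for file_text, s3_path in pool.imap_unordered(batch_retrieve, work):
    print(s3_path, len(file_text))
pool.close()
pool.terminate()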
Example #20
def s3_retrieve(self):
    return s3_retrieve(self.chunk_path, self.study.object_id)