Code Example #1
def assemble_deletable_files(sorted_data):
    deletable_file_paths = []

    for patient_id, expunge_start_date in sorted_data:
        participant = Participant.objects.get(patient_id=patient_id)

        # technically it is a datetime object
        expunge_start_date = convert_date(expunge_start_date)
        expunge_start_unix_timestamp = int(
            (expunge_start_date - UNIX_EPOCH_START).total_seconds()) * 1000

        prefix = str(participant.study.object_id) + "/" + patient_id + "/"
        s3_files = s3_list_files(prefix, as_generator=True)

        chunks_prefix = CHUNKS_FOLDER + "/" + prefix
        s3_chunks_files = s3_list_files(chunks_prefix, as_generator=True)

        raw_files = assemble_raw_files(s3_files, expunge_start_unix_timestamp)
        chunked_files = assemble_chunked_files(s3_chunks_files,
                                               expunge_start_date)

        print(
            patient_id, "timestamp: %s, (unixtime: %s): %s files" %
            (expunge_start_date, expunge_start_unix_timestamp / 1000,
             len(raw_files) + len(chunked_files)))

        deletable_file_paths.extend(raw_files)
        deletable_file_paths.extend(chunked_files)

    return deletable_file_paths
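
A minimal, self-contained sketch of the epoch-millisecond conversion used above, assuming UNIX_EPOCH_START is a naive datetime at 1970-01-01 00:00:00 (which is what the bare subtraction in assemble_deletable_files implies):

# Sketch of the conversion in assemble_deletable_files: truncate to whole seconds
# with int(), then scale to milliseconds.
from datetime import datetime

UNIX_EPOCH_START = datetime(1970, 1, 1)  # assumed value of the project constant

def to_unix_millis(dt):
    return int((dt - UNIX_EPOCH_START).total_seconds()) * 1000

print(to_unix_millis(datetime(2020, 1, 2, 3, 4, 5)))  # 1577934245000
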
Code Example #2
def do_upload(file_paths_and_contents, data_type=None, forcibly_overwrite=False):
    if data_type == None: raise Exception("DATA TYPE!")
    upload_stream_map = { "survey_answers":("surveyAnswers", "csv"),
                          "audio":("voiceRecording", "mp4") }
    data_stream_string, file_extension = upload_stream_map[data_type]

    for timings_path, contents_and_timestamp in file_paths_and_contents.items():
        contents, timestamp = contents_and_timestamp
        study_id_string, user_id, _, survey_id, _ = timings_path.split("/")
        try:
            timestamp_string = str( int( mktime( timestamp.timetuple( ) ) ) ) + "000"
        except AttributeError:
            print "PROBLEM WITH TIMESTAMP FROM: %s" % timings_path
            continue
        if len(timestamp_string) != 13:
            raise Exception("LOL! No.")

        study_obj_id = Study(ObjectId(study_id_string))._id

        s3_file_path = "%s/%s/%s/%s/%s.%s" % (study_id_string,
                                              user_id,
                                              data_stream_string,
                                              survey_id,
                                              timestamp_string,
                                              file_extension)
        if len(s3_list_files(s3_file_path)) != 0:
            print "ALREADY_EXISTS: %s, %s" % (timings_path, s3_file_path)
            if forcibly_overwrite == False:
                continue
        else: print "yay!: ", s3_file_path
        contents = contents.encode("utf8") #maybe make this unicode-16?

        s3_upload(s3_file_path, contents, study_obj_id, raw_path=True)
        FileToProcess.append_file_for_processing( s3_file_path, study_obj_id, user_id )
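
The 13-character check above holds because mktime() yields a 10-digit epoch-seconds value (until the year 2286) and the hard-coded "000" suffix stands in for milliseconds; a small Python 3 illustration using only the standard library:

# Why len(timestamp_string) == 13: ten digits of epoch seconds plus a literal "000".
from time import mktime
from datetime import datetime

timestamp = datetime(2020, 1, 2, 3, 4, 5)
timestamp_string = str(int(mktime(timestamp.timetuple()))) + "000"
print(timestamp_string, len(timestamp_string))  # e.g. 1577934245000 13 (seconds depend on the local timezone)
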
Code Example #3
def reindex_all_files_to_process():
    """ Totally removes the FilesToProcess DB, deletes all chunked files on s3,
    clears the chunksregistry, and then adds all relevant files on s3 to the
    files to process registry. """
    FileProcessLock.lock()
    print str(datetime.now()), "purging FilesToProcess:", FilesToProcess.count()
    FileToProcess.db().drop()
    print str(datetime.now()), "purging existing ChunksRegistry", ChunksRegistry.count()
    ChunkRegistry.db().drop()

    pool = ThreadPool(CONCURRENT_NETWORK_OPS * 2 )

    print str(datetime.now()), "deleting older chunked data:",
    CHUNKED_DATA = s3_list_files(CHUNKS_FOLDER)
    print len(CHUNKED_DATA)
    pool.map(s3_delete, CHUNKED_DATA)
    del CHUNKED_DATA

    print str(datetime.now()), "pulling new files to process..."
    files_lists = pool.map(s3_list_files, [str(s._id) for s in Studies()] )
    print "putting new files to process..."
    for i,l in enumerate(files_lists):
        print str(datetime.now()), i+1, "of", str(Studies.count()) + ",", len(l), "files"
        for fp in l:
            if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS:
                FileToProcess.append_file_for_processing(fp, ObjectId(fp.split("/", 1)[0]), fp.split("/", 2)[1])
    del files_lists, l
    pool.close()
    pool.terminate()
    print str(datetime.now()), "processing data."
    FileProcessLock.unlock()
    process_file_chunks()
Code Example #4
def count_study_chunks():
    chunked_data = s3_list_files("CHUNKED_DATA")
    study_prefixes = { f[:38] for f in chunked_data }
    study_prefix_to_id = { study_prefix: ObjectId(study_prefix.split("/")[-2]) for study_prefix in study_prefixes }
    study_prefix_to_name= { study_prefix:Study(_id=study_id).name for study_prefix, study_id in study_prefix_to_id.items() }
    study_count = { study_prefix_to_name[study_prefix]: len([f for f in chunked_data if f[:38] == study_prefix]) for study_prefix in study_prefixes }
    return study_count
Code Example #5
def check_for_bad_chunks():
    """ This function runs through all chunkable data and checks for invalid file pointers
    to s3. """
    chunked_file_paths = set(s3_list_files("CHUNKED_DATA"))
    bad_chunks = []
    for entry in ChunkRegistry.objects.all():
        if entry.data_type in CHUNKABLE_FILES and entry.chunk_path not in chunked_file_paths:
            bad_chunks.append(entry)
    print("bad chunks:", len(bad_chunks))
Code Example #6
def reindex_all_files_to_process():
    """
    Totally clears the FilesToProcess DB, deletes all chunked files on S3,
    clears the ChunksRegistry DB, adds all relevant files on S3 to the
    FilesToProcess registry and then re-chunks them.
    """
    raise Exception(
        "This code has not been tested since converting database backends, that means 2018"
    )
    # Delete all preexisting FTP and ChunkRegistry objects
    FileProcessLock.lock()
    print('{!s} purging FileToProcess: {:d}'.format(
        datetime.now(), FileToProcess.objects.count()))
    FileToProcess.objects.all().delete()
    print('{!s} purging ChunkRegistry: {:d}'.format(
        datetime.now(), ChunkRegistry.objects.count()))
    ChunkRegistry.objects.all().delete()

    pool = ThreadPool(CONCURRENT_NETWORK_OPS * 2)

    # Delete all preexisting chunked data files
    CHUNKED_DATA = s3_list_files(CHUNKS_FOLDER)
    print('{!s} deleting older chunked data: {:d}'.format(
        datetime.now(), len(CHUNKED_DATA)))
    pool.map(s3_delete, CHUNKED_DATA)
    del CHUNKED_DATA

    # Get a list of all S3 files to replace in the database
    print('{!s} pulling new files to process...'.format(datetime.now()))
    files_lists = pool.map(s3_list_files,
                           Study.objects.values_list('object_id', flat=True))

    # For each such file, create an FTP object
    print("putting new files to process...")
    for i, l in enumerate(files_lists):
        print('{!s} {:d} of {:d}, {:d} files'.format(datetime.now(), i + 1,
                                                     Study.objects.count(),
                                                     len(l)))
        for fp in l:
            if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS:
                patient_id = fp.split('/', 2)[1]
                participant_pk = Participant.objects.filter(
                    patient_id=patient_id).values_list('pk', flat=True).get()
                FileToProcess.append_file_for_processing(
                    fp, fp.split("/", 1)[0], participant_id=participant_pk)

    # Clean up by deleting large variables, closing the thread pool and unlocking the file process lock
    del files_lists, l
    pool.close()
    pool.terminate()
    FileProcessLock.unlock()

    # Rechunk the newly created FTPs
    print("{!s} processing data.".format(datetime.now()))
    process_file_chunks()
Code Example #7
def grab_file_names(study_id, survey_id, user_id, number_points):
    """ Takes a list, returns a list of those most recent files."""
    # this is correct - we want to decode these values when they arrive as bytes, not coerce
    # them with str(), which would convert them to strings with a preceding b and in single quotes.

    study_id = study_id if not isinstance(study_id,
                                          bytes) else study_id.decode()
    survey_id = survey_id if not isinstance(survey_id,
                                            bytes) else survey_id.decode()
    user_id = user_id if not isinstance(user_id, bytes) else user_id.decode()
    number_points = number_points if not isinstance(
        number_points, bytes) else number_points.decode()

    all_files = s3_list_files("%s/%s/surveyAnswers/%s" %
                              (str(study_id), str(user_id), str(survey_id)))
    return sorted(all_files[-number_points:])
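
The decode-rather-than-str pattern above exists because calling str() on a bytes object in Python 3 embeds its repr, which would corrupt the S3 prefix; a quick demonstration:

# str() on bytes leaks the b'...' repr into the string; .decode() yields the plain text.
study_id = b"5873fe38644ad7557b168e43"
print(str(study_id))      # b'5873fe38644ad7557b168e43'
print(study_id.decode())  # 5873fe38644ad7557b168e43
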
Code Example #8
def count_study_chunks():
    chunked_file_paths = s3_list_files("CHUNKED_DATA")
    # The file paths start with CHUNKED_DATA/[24-digit object ID]
    study_prefixes = [f[:38] for f in chunked_file_paths]
    study_prefix_to_id = {
        study_prefix: study_prefix.split("/")[-2]
        for study_prefix in study_prefixes
    }
    study_prefix_to_name = {
        study_prefix: Study.objects.get(object_id=study_object_id).name
        for study_prefix, study_object_id in study_prefix_to_id.iteritems()
    }
    print(study_prefix_to_name)
    study_count = {
        study_prefix_to_name[study_prefix]:
        len([f for f in chunked_file_paths if f[:38] == study_prefix])
        for study_prefix in study_prefixes
    }
    return study_count
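
The 38-character slice works because "CHUNKED_DATA/" is 13 characters and a Mongo object id is 24 hex characters, followed by another slash; a quick check against a made-up path in the assumed CHUNKED_DATA/&lt;study object id&gt;/&lt;patient id&gt;/&lt;stream&gt;/&lt;timestamp&gt;.csv layout:

# Illustrative path; the patient id, stream name, and timestamp are invented.
path = "CHUNKED_DATA/5873fe38644ad7557b168e43/someuser/gps/2017-01-01 00_00_00.csv"
assert len("CHUNKED_DATA/") + 24 + len("/") == 38
study_prefix = path[:38]
print(study_prefix)                 # CHUNKED_DATA/5873fe38644ad7557b168e43/
print(study_prefix.split("/")[-2])  # 5873fe38644ad7557b168e43
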
Code Example #9
    def reprocess_originals_from_chunk_path(cls, chunk_path):
        path_components = chunk_path.split("/")
        if len(path_components) != 5:
            raise Exception(
                "chunked file paths contain exactly 5 components separated by a slash."
            )

        chunk_files_text, study_obj_id, username, data_stream, timestamp = path_components

        if not chunk_files_text == CHUNKS_FOLDER:
            raise Exception(
                "This is not a chunked file, it is not in the chunked data folder."
            )

        participant = Participant.objects.get(patient_id=username)

        # data stream names are truncated
        full_data_stream = REVERSE_UPLOAD_FILE_TYPE_MAPPING[data_stream]

        # oh good, identifiers doesn't end in a slash.
        splitter_end_char = '_' if full_data_stream == IDENTIFIERS else '/'
        file_prefix = "/".join((
            study_obj_id,
            username,
            full_data_stream,
        )) + splitter_end_char
        print("searching:", file_prefix)

        # find all files with data from the appropriate time.
        dt_start = datetime.strptime(timestamp.strip(".csv"), API_TIME_FORMAT)
        dt_prev = dt_start - timedelta(hours=1)
        dt_end = dt_start + timedelta(hours=1)
        prior_hour_last_file = None
        file_paths_to_reprocess = []
        for s3_file_path in s3_list_files(file_prefix, as_generator=False):
            # convert timestamp....
            if full_data_stream == IDENTIFIERS:
                file_timestamp = float(
                    s3_file_path.rsplit(splitter_end_char)[-1][:-4])
            else:
                file_timestamp = float(
                    s3_file_path.rsplit(splitter_end_char)[-1][:-4]) / 1000
            file_dt = datetime.fromtimestamp(file_timestamp)
            # we need to get the last file from the prior hour as it may have relevant data;
            # fortunately the file paths are returned in ascending order, so it is the file
            # right before the rest of the data.  just cache it
            if dt_prev <= file_dt < dt_start:
                prior_hour_last_file = s3_file_path

            # and then every file within the relevant hour
            if dt_start <= file_dt <= dt_end:
                print("found:", s3_file_path)
                file_paths_to_reprocess.append(s3_file_path)

        # a "should be an unnecessary" safety check
        if prior_hour_last_file and prior_hour_last_file not in file_paths_to_reprocess:
            print("found:", prior_hour_last_file)
            file_paths_to_reprocess.append(prior_hour_last_file)

        if not prior_hour_last_file and not file_paths_to_reprocess:
            raise Exception(
                f"did not find any matching files: '{chunk_path}' using prefix '{file_prefix}'"
            )

        for fp in file_paths_to_reprocess:
            if cls.objects.filter(s3_file_path=fp).exists():
                print(f"{fp} is already queued for processing")
                continue
            else:
                print(f"Adding {fp} as a file to reprocess.")
                cls.append_file_for_processing(fp,
                                               study_obj_id,
                                               participant=participant)
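
A standalone sketch of the time-window selection above: keep every file whose timestamp falls within an hour of the chunk's start, plus the last file from the preceding hour, whose tail may still hold relevant rows. The file list and timestamps here are invented; the real s3_list_files returns bare paths in ascending order.

from datetime import datetime, timedelta

dt_start = datetime(2017, 1, 1, 10, 0, 0)
dt_prev, dt_end = dt_start - timedelta(hours=1), dt_start + timedelta(hours=1)

# (path, timestamp) pairs, already in ascending time order
files = [
    ("f1", datetime(2017, 1, 1, 8, 50)),
    ("f2", datetime(2017, 1, 1, 9, 40)),
    ("f3", datetime(2017, 1, 1, 10, 20)),
    ("f4", datetime(2017, 1, 1, 11, 30)),
]

prior_hour_last_file, file_paths_to_reprocess = None, []
for path, file_dt in files:
    if dt_prev <= file_dt < dt_start:
        prior_hour_last_file = path  # keeps being overwritten, so it ends as the latest match
    if dt_start <= file_dt <= dt_end:
        file_paths_to_reprocess.append(path)

print(prior_hour_last_file, file_paths_to_reprocess)  # f2 ['f3']
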
Code Example #10
def grab_file_names(study_id, survey_id, user_id, number_points):
    """ Takes a list, returns a list of those most recent files."""
    all_files = s3_list_files("%s/%s/surveyAnswers/%s" %(str(study_id), str(user_id), str(survey_id)))
    return sorted( all_files[ -number_points: ] )
Code Example #11
if __name__ == "__main__":
    from os.path import abspath as _abspath
    import imp as _imp
    _current_folder_init = _abspath(__file__).rsplit('/', 1)[0] + "/__init__.py"
    _imp.load_source("__init__", _current_folder_init)

from libs.s3 import s3_list_files
from db.data_access_models import FileToProcess, FilesToProcess
from bson import ObjectId

study_id_obj = ObjectId("5873fe38644ad7557b168e43")
study_id_str = str(study_id_obj)

for purgeable in FilesToProcess(user_id='prx7ap5x'):
    purgeable.remove()

for i, path in enumerate(s3_list_files(study_id_str, as_generator=True)):
    if i > 500:
        break
    if path[-3:] != 'csv':
        continue # skip if not a csv file...
    user_id = path[:-4].split('/')[1]
    path_sans_study = path.split("/", 1)[1]
    if FileToProcess(s3_file_path=path):
        print "%s already in FilesToProcess." % path
        continue
    FileToProcess.append_file_for_processing(path_sans_study, study_id_obj, user_id)
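
For reference, a worked example of the path slicing in the loop above; the path is invented but follows the &lt;study object id&gt;/&lt;patient id&gt;/surveyAnswers/&lt;survey id&gt;/&lt;timestamp&gt;.csv layout seen in the other examples:

path = "5873fe38644ad7557b168e43/prx7ap5x/surveyAnswers/abc123/1484000000000.csv"
user_id = path[:-4].split('/')[1]        # prx7ap5x
path_sans_study = path.split("/", 1)[1]  # prx7ap5x/surveyAnswers/abc123/1484000000000.csv
print(user_id, path_sans_study)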