def reindex_all_files_to_process():
    """ Totally removes the FilesToProcess DB, deletes all chunked files on s3,
    clears the chunksregistry, and then adds all relevent files on s3 to the
    files to process registry. """
    FileProcessLock.lock()
    print str(datetime.now()), "purging FilesToProcess:", FilesToProcess.count()
    FileToProcess.db().drop()
    print str(datetime.now()), "purging existing ChunksRegistry", ChunksRegistry.count()
    ChunkRegistry.db().drop()

    pool = ThreadPool(CONCURRENT_NETWORK_OPS * 2)

    print str(datetime.now()), "deleting older chunked data:",
    CHUNKED_DATA = s3_list_files(CHUNKS_FOLDER)
    print len(CHUNKED_DATA)
    pool.map(s3_delete, CHUNKED_DATA)
    del CHUNKED_DATA

    print str(datetime.now()), "pulling new files to process..."
    files_lists = pool.map(s3_list_files, [str(s._id) for s in Studies()])
    print "putting new files to process..."
    for i, file_list in enumerate(files_lists):
        print str(datetime.now()), i + 1, "of", str(Studies.count()) + ",", len(file_list), "files"
        for fp in file_list:
            if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS:
                FileToProcess.append_file_for_processing(fp, ObjectId(fp.split("/", 1)[0]), fp.split("/", 2)[1])
    del files_lists, file_list
    pool.close()
    pool.terminate()
    print str(datetime.now()), "processing data."
    FileProcessLock.unlock()
    process_file_chunks()
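
# Sketch (not part of the original module): the split() calls above assume S3 keys of
# the form "<study_id>/<user_id>/<data_type>/<timestamp>.<ext>"; that layout is inferred
# from the parsing logic, not stated anywhere in the source.
def _example_parse_s3_key(fp):
    """ Shows how a raw S3 key maps onto a study ObjectId and a user id. """
    study_id_str = fp.split("/", 1)[0]  # everything before the first slash
    user_id = fp.split("/", 2)[1]       # segment between the first and second slash
    return ObjectId(study_id_str), user_id

# e.g. _example_parse_s3_key("5873fe38644ad7557b168e43/prx7ap5x/gps/1490000000000.csv")
# -> (ObjectId("5873fe38644ad7557b168e43"), "prx7ap5x")
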
def completely_purge_study(study_id, actually_delete=False):
    """ Gathers everything attached to a study (surveys, device settings, users, chunk
    registry entries, and files to process). With actually_delete=False this is a dry run
    that only prints counts; with actually_delete=True it removes all of it, irreversibly. """
    if not isinstance(study_id, ObjectId):
        study_id = ObjectId(study_id)
    study = Study(study_id)

    surveys = study["surveys"]
    device_settings = study["device_settings"]
    users = Users(study_id=study_id)
    chunks = ChunksRegistry(study_id=study_id)
    files_to_process = FilesToProcess(study_id=study_id)
    if not actually_delete:
        print "if you actually delete this you will not be able to decrypt anything " \
              "from this study.  Don't do it unless you know what you are doing."
        print study.name
        # print len(study)
        # print len(device_settings)
        print len(surveys)
        print len(users)
        print len(chunks)
        print len(files_to_process)
    else:
        StudyDeviceSettings(device_settings).remove()
        [Survey(s).remove() for s in surveys]
        [User(u).remove() for u in users]
        [ChunkRegistry(c).remove() for c in chunks]
        [FileToProcess(f).remove() for f in files_to_process]
        study.remove()
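
# Usage sketch (assumed, not from the original source): do the dry run first, review the
# printed counts, and only then pass actually_delete=True, which is irreversible.
#
#   completely_purge_study("5873fe38644ad7557b168e43")                        # dry run
#   completely_purge_study("5873fe38644ad7557b168e43", actually_delete=True)  # deletes everything
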
def get_user_list_safely(retries=10):
    """ This error started occurring on occasionally on Mar 22, 2017, we don't know why. """
    try:
        return set(FilesToProcess(field="user_id"))
    except CursorNotFound:
        if retries < 1:
            raise
        print "encountered cursor error, retrying..."
        sleep(0.1)
        return get_user_list_safely(retries=retries - 1)
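
# The retry-on-CursorNotFound pattern above generalizes to any query; a minimal sketch
# (not part of the original module), assuming a zero-argument callable.
def _retry_on_cursor_error(query, retries=10, delay=0.1):
    """ Calls query(), retrying up to `retries` times when CursorNotFound is raised. """
    for attempt in range(retries + 1):
        try:
            return query()
        except CursorNotFound:
            if attempt == retries:
                raise
            print "encountered cursor error, retrying..."
            sleep(delay)

# e.g. _retry_on_cursor_error(lambda: set(FilesToProcess(field="user_id")))
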
def process_file_chunks():
    """ This is the function that is called from cron.  It runs through all new files that have
    been uploaded and 'chunks' them. Handles logic for skipping bad files, raising errors
    appropriately. """
    error_handler = ErrorHandler()
    if FileProcessLock.islocked():
        raise ProcessingOverlapError(
            "Data processing overlapped with a previous data indexing run.")
    FileProcessLock.lock()
    number_bad_files = 0

    user_ids = set(FilesToProcess(field="user_id"))
    print "processing files for the following users: %s" % ",".join(user_ids)
    for user_id in user_ids:
        while True:
            previous_number_bad_files = number_bad_files
            starting_length = FilesToProcess.count(user_id=user_id)

            print str(datetime.now()), "processing %s, %s files remaining" % (
                user_id, starting_length)

            number_bad_files += do_process_user_file_chunks(
                count=FILE_PROCESS_PAGE_SIZE,
                error_handler=error_handler,
                skip_count=number_bad_files,
                user_id=user_id)

            if starting_length == FilesToProcess.count(user_id=user_id):  # zero files processed
                if previous_number_bad_files == number_bad_files:
                    # Cases:
                    #   every file broke, might as well fail here, and would cause infinite loop otherwise.
                    #   no new files.
                    break
                else:
                    continue
    FileProcessLock.unlock()
    error_handler.raise_errors()
    raise EverythingWentFine(DATA_PROCESSING_NO_ERROR_STRING)
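
# process_file_chunks() signals success by raising EverythingWentFine, so a cron entry
# point has to treat that exception as the happy path. Minimal sketch of such a wrapper;
# the name run_from_cron is an assumption, not part of the original module.
def run_from_cron():
    try:
        process_file_chunks()
    except EverythingWentFine:
        pass  # normal completion, nothing to report
    except ProcessingOverlapError:
        pass  # a previous run still holds FileProcessLock; let the next cron tick retry
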
def celery_process_file_chunks(user_id):
    """ This is the function that is called from cron.  It runs through all new files that have
    been uploaded and 'chunks' them. Handles logic for skipping bad files, raising errors
    appropriately. """
    log = LogList()
    number_bad_files = 0
    error_sentry = ErrorSentry(SENTRY_DSN,
                               sentry_client_kwargs={
                                   "tags": {
                                       "user_id": user_id
                                   },
                                   'transport': HTTPTransport
                               })
    log.append("processing files for %s" % user_id)

    while True:
        previous_number_bad_files = number_bad_files
        starting_length = FilesToProcess.count(user_id=user_id)

        log.append(
            str(datetime.now()) + " processing %s, %s files remaining" %
            (user_id, starting_length))
        number_bad_files += do_process_user_file_chunks(
            count=FILE_PROCESS_PAGE_SIZE,
            error_handler=error_sentry,
            skip_count=number_bad_files,
            user_id=user_id)

        if starting_length == FilesToProcess.count(user_id=user_id):  # zero files processed
            if previous_number_bad_files == number_bad_files:
                # Cases:
                #   every file broke, blow up. (would cause infinite loop otherwise)
                #   no new files.
                break
            else:
                continue
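
# Sketch (not from the original source) of how the per-user worker above might be
# registered as a Celery task and fanned out over all users with pending files; the
# celery_app name and broker URL here are assumptions.
from celery import Celery

celery_app = Celery("data_processing", broker="redis://localhost:6379/0")

@celery_app.task
def queue_user(user_id):
    celery_process_file_chunks(user_id)

def dispatch_all_users():
    for user_id in get_user_list_safely():
        queue_user.delay(user_id)
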
        "Go check on reindex operation.",
        source_email="*****@*****.**")


for study_id in studies:
    if isinstance(study_id, (str, unicode)):
        study_id = ObjectId(study_id)
    study = Study(study_id)
    print "============================================================="
    print "============================================================="
    print "============================================================="
    print "starting on %s, study id: %s" % (study.name, str(study_id))
    print "============================================================="
    print "============================================================="
    print "============================================================="

    try:
        reindex_study(study_id)
    except Exception:
        # will raise an error if things fail on the second attempt
        process_file_chunks()

    if FilesToProcess.count() != 0:
        do_email(study)
        raise Exception("stopped on " + str(study_id))

email_system_administrators("OMG IT FINISHED AND EVERYTHING IS DONE.",
                            "Go git checkout .; touch wsgi.py",
                            source_email="*****@*****.**")
if __name__ == "__main__":
    from os.path import abspath as _abspath
    import imp as _imp
    _current_folder_init = _abspath(__file__).rsplit('/', 1)[0]+ "/__init__.py"
    _imp.load_source("__init__", _current_folder_init)

from libs.s3 import s3_list_files
from db.data_access_models import FileToProcess, FilesToProcess
from bson import ObjectId

study_id_obj = ObjectId("5873fe38644ad7557b168e43")
study_id_str = str(study_id_obj)

for purgeable in FilesToProcess(user_id='prx7ap5x'):
    purgeable.remove()

for i, path in enumerate(s3_list_files(study_id_str, as_generator=True)):
    if i > 500:
        break
    if path[-3:] != 'csv':
        continue # skip if not a csv file...
    user_id = path[:-4].split('/')[1]
    path_sans_study = path.split("/", 1)[1]
    if FileToProcess(s3_file_path=path):
        print "%s already in FilesToProcess." % path
        continue
    FileToProcess.append_file_for_processing(path_sans_study, study_id_obj, user_id)
def do_process_user_file_chunks(count, error_handler, skip_count, user_id):
    """
    Run through the files to process, pull their data, put it into s3 bins. Run the file through
    the appropriate logic path based on file type.

    If a file is empty put its ftp object to the empty_files_list, we can't delete objects
    in-place while iterating over the db.

    All files except for the audio recording files are in the form of CSVs, most of those files
    can be separated by "time bin" (separated into one-hour chunks) and concatenated and sorted
    trivially. A few files, call log, identifier file, and wifi log, require some triage
    beforehand.  The debug log cannot be correctly sorted by time for all elements, because it
    was not actually expected to be used by researchers, but is apparently quite useful.

    Any errors are themselves concatenated using the passed in error handler.
    """
    # This is how you declare a defaultdict containing a tuple of two deques.
    all_binified_data = defaultdict(lambda: (deque(), deque()))
    ftps_to_remove = set()
    pool = ThreadPool(CONCURRENT_NETWORK_OPS)
    survey_id_dict = {}

    for data in pool.map(batch_retrieve_for_processing,
                         FilesToProcess(page_size=count + skip_count,
                                        user_id=user_id)[skip_count:],
                         chunksize=1):
        with error_handler:
            # raise errors encountered in the threaded S3 operations through the error_handler
            if data['exception']:
                print "\n" + data['ftp']['s3_file_path']
                print data['traceback']
                raise data['exception']

            if data['chunkable']:
                # print "1a"
                newly_binified_data, survey_id_hash = process_csv_data(data)
                # print data, "\n1b"
                if data['data_type'] in SURVEY_DATA_FILES:
                    # print survey_id_hash
                    survey_id_dict[survey_id_hash] = \
                        resolve_survey_id_from_file_name(data['ftp']["s3_file_path"])
                if newly_binified_data:
                    # print "1c"
                    append_binified_csvs(all_binified_data,
                                         newly_binified_data, data['ftp'])
                else:  # delete empty files from FilesToProcess
                    # print "1d"
                    ftps_to_remove.add(data['ftp']._id)
                continue

            else:  #if not data['chunkable']
                # print "2a"
                timestamp = clean_java_timecode(
                    data['ftp']["s3_file_path"].rsplit("/", 1)[-1][:-4])
                # print "2a"
                ChunkRegistry.add_new_chunk(data['ftp']["study_id"],
                                            data['ftp']["user_id"],
                                            data['data_type'],
                                            data['ftp']["s3_file_path"],
                                            timestamp)
                # print "2b"
                ftps_to_remove.add(data['ftp']._id)

    pool.close()
    pool.terminate()
    # print 3
    more_ftps_to_remove, number_bad_files = upload_binified_data(
        all_binified_data, error_handler, survey_id_dict)
    # print "X"
    ftps_to_remove.update(more_ftps_to_remove)
    for ftp_id in ftps_to_remove:
        FileToProcess(ftp_id).remove()
    # print "Y"
    gc.collect()
    # print "Z"
    return number_bad_files
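
# The docstring above mentions that chunkable CSVs are grouped into one-hour "time bins".
# A minimal sketch of that idea, assuming Java-style millisecond timestamps; this helper
# is illustrative and is not the project's actual binning code.
def _example_hour_bin(java_timecode_ms):
    """ Maps a millisecond epoch timestamp onto the start of its one-hour chunk. """
    seconds = int(java_timecode_ms) / 1000  # Java timecodes are in milliseconds
    return seconds - (seconds % 3600)       # truncate to the top of the hour

# e.g. _example_hour_bin(1490000000000) and _example_hour_bin(1490000399000) land in the
# same bin, so their rows would be concatenated and sorted into a single chunk.
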