# Imports assumed by this excerpt; the project-specific names (FileProcessLock,
# FilesToProcess, ChunkRegistry, the s3_* helpers and the constants) come from
# the surrounding codebase and are not repeated here.
from datetime import datetime
from multiprocessing.pool import ThreadPool
from bson import ObjectId


def reindex_all_files_to_process():
    """ Totally removes the FilesToProcess DB, deletes all chunked files on s3,
    clears the chunksregistry, and then adds all relevent files on s3 to the
    files to process registry. """
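    # Hold the file-processing lock so the periodic processing job cannot run
    # while the registries are being rebuilt.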
    FileProcessLock.lock()
    print str(datetime.now()), "purging FilesToProcess:", FilesToProcess.count()
    FileToProcess.db().drop()
    print str(datetime.now()), "purging existing ChunksRegistry", ChunksRegistry.count()
    ChunkRegistry.db().drop()

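    # Thread pool for the S3 network calls (bulk deletes and per-study listings).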
    pool = ThreadPool(CONCURRENT_NETWORK_OPS * 2)

    print str(datetime.now()), "deleting older chunked data:",
    CHUNKED_DATA = s3_list_files(CHUNKS_FOLDER)
    print len(CHUNKED_DATA)
    pool.map(s3_delete, CHUNKED_DATA)
    del CHUNKED_DATA

    print str(datetime.now()), "pulling new files to process..."
    files_lists = pool.map(s3_list_files, [str(s._id) for s in Studies()] )
    print "putting new files to process..."
    for i,l in enumerate(files_lists):
        print str(datetime.now()), i+1, "of", str(Studies.count()) + ",", len(l), "files"
        for fp in l:
            if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS:
                # Keys are listed per study (see the s3_list_files call above), so the
                # first path segment is the study's ObjectId and the second segment is
                # assumed to be the user/patient id.
                FileToProcess.append_file_for_processing(
                    fp, ObjectId(fp.split("/", 1)[0]), fp.split("/", 2)[1])
    del files_lists, l
    pool.close()
    pool.terminate()
    print str(datetime.now()), "processing data."
    FileProcessLock.unlock()
    process_file_chunks()
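

# A minimal invocation sketch. Assumptions: this module is run by hand as a
# one-off maintenance script, and the confirmation prompt below is hypothetical,
# not part of the original code; dropping FilesToProcess and ChunksRegistry is
# destructive, so the call is worth guarding.
def confirm_and_reindex():
    answer = raw_input("This drops FilesToProcess and ChunksRegistry and rebuilds them from S3. "
                       "Type 'reindex' to proceed: ")
    if answer == "reindex":
        reindex_all_files_to_process()
    else:
        print "aborted, nothing was changed."

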
    """ Compare contents of all chunks matching this path, blow up if any values are different.
        (They should all be identical.) """
    collisions = {key: set() for key in keys}
    for key in keys:
        for chunk in ChunksRegistry(chunk_path=chunk_path):
            collisions[key].add(chunk[key])

    for key, collision in collisions.iteritems():
        if len(collision) > 1:
            print collisions
            raise Exception(
                "encountered bad duplicate chunk requiring manual purge.")


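# Scan the whole ChunksRegistry and record every chunk_path that appears more
# than once; duplicate_chunk_path_severity() raises if the duplicate entries
# are not identical.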
duplicate_chunk_paths = set()
t1 = datetime.now()
for chunk in ChunksRegistry.iterator():
    if ChunksRegistry.count(chunk_path=chunk['chunk_path']) > 1:
        if chunk['chunk_path'] in duplicate_chunk_paths:
            print chunk['chunk_path']
        duplicate_chunk_paths.add(chunk['chunk_path'])
        duplicate_chunk_path_severity(chunk['chunk_path'])

t2 = datetime.now()
print "discovered %s duplicate chunks in %s seconds" % (
    len(duplicate_chunk_paths), (t2 - t1).total_seconds())

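# Delete surplus registry entries until exactly one chunk remains per path.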
for path in duplicate_chunk_paths:
    while ChunksRegistry.count(chunk_path=path) > 1:
        ChunksRegistry(chunk_path=path)[0].remove()
        print "purging", path, ChunksRegistry.count(chunk_path=path)
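    # Presumably the tail of run_all_migrations(), which the __main__ block below invokes.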
    print "migrate_upload_trackers..."
    migrate_upload_trackers()


if __name__ == '__main__':
    study_referents = {}
    study_id_dict = {}
    user_id_dict = {}
    survey_id_dict = {}
    
    orphaned_surveys = {}
    
    d_study_admin_list = []  # A list of study-researcher pairs
    d_study_survey_dict = {}  # A mapping of surveys to their associated studies
    d_study_settings_dict = {}  # A mapping of device settings to their associated studies

    CHUNK_SIZE = 10000
    
    # error_handler = ErrorHandler()
    error_handler = null_error_handler()
    
    print(MStudySet.count(), MSurveySet.count(), MSettingsSet.count(),
          MAdminSet.count(), MUserSet.count(), MChunkSet.count(), MUploadSet.count())
    with error_handler:
        run_all_migrations()
    print(DStudy.objects.count(), DSurvey.objects.count(), DSettings.objects.count(),
          DAdmin.objects.count(), DUser.objects.count(), DChunk.objects.count(), DUpload.objects.count())
    print("end:", datetime.now())
    
    error_handler.raise_errors()