def reindex_specific_data_type(data_type):
    """ Deletes all chunked data of the given data type from S3 and the
    ChunksRegistry, then adds raw files on S3 back to the files to process
    registry and runs the file processing code. """
    FileProcessLock.lock()
    print "starting..."
    # raises an error if data_type is not a valid data stream (the return value is otherwise unused)
    file_name_key = data_stream_to_s3_file_name_string(data_type)
    relevant_chunks = ChunksRegistry(data_type=data_type)
    relevant_indexed_files = [chunk["chunk_path"] for chunk in relevant_chunks]
    print "purging old data..."
    for chunk in relevant_chunks: chunk.remove()

    pool = ThreadPool(20)
    pool.map(s3_delete, relevant_indexed_files)

    print "pulling files to process..."
    files_lists = pool.map(s3_list_files, [str(s._id) for s in Studies()] )
    for i,l in enumerate(files_lists):
        print str(datetime.now()), i+1, "of", str(Studies.count()) + ",", len(l), "files"
        for fp in l:
            if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS:
                FileToProcess.append_file_for_processing(fp, ObjectId(fp.split("/", 1)[0]), fp.split("/", 2)[1])
    del files_lists, l
    pool.close()
    pool.terminate()
    print str(datetime.now()), "processing data..."
    FileProcessLock.unlock()
    process_file_chunks()
    print "Done."
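
# Hedged sketch: the reindex functions in this file split raw S3 keys twice to pull
# out a study ObjectId and a user id, which assumes keys shaped like
# "<study_object_id>/<user_id>/<rest_of_path>". That layout is inferred from this
# script rather than confirmed elsewhere; the helper below just makes the assumption
# explicit and is not part of the original module.
def _parse_raw_s3_key(file_path):
    """ Returns (study_object_id_string, user_id_string) for a raw upload key. """
    study_object_id_string = file_path.split("/", 1)[0]
    user_id_string = file_path.split("/", 2)[1]
    return study_object_id_string, user_id_string
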
def reindex_all_files_to_process():
    """ Totally removes the FilesToProcess DB, deletes all chunked files on s3,
    clears the ChunksRegistry, and then adds all relevant files on s3 to the
    files to process registry. """
    FileProcessLock.lock()
    print str(datetime.now()), "purging FilesToProcess:", FilesToProcess.count()
    FileToProcess.db().drop()
    print str(datetime.now()), "purging existing ChunksRegistry", ChunksRegistry.count()
    ChunkRegistry.db().drop()

    pool = ThreadPool(CONCURRENT_NETWORK_OPS * 2)

    print str(datetime.now()), "deleting existing chunked data:",
    CHUNKED_DATA = s3_list_files(CHUNKS_FOLDER)
    print len(CHUNKED_DATA)
    pool.map(s3_delete, CHUNKED_DATA)
    del CHUNKED_DATA

    print str(datetime.now()), "pulling new files to process..."
    files_lists = pool.map(s3_list_files, [str(s._id) for s in Studies()] )
    print "putting new files to process..."
    for i,l in enumerate(files_lists):
        print str(datetime.now()), i+1, "of", str(Studies.count()) + ",", len(l), "files"
        for fp in l:
            if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS:
                FileToProcess.append_file_for_processing(fp, ObjectId(fp.split("/", 1)[0]), fp.split("/", 2)[1])
    del files_lists, l
    pool.close()
    pool.terminate()
    print str(datetime.now()), "processing data."
    FileProcessLock.unlock()
    process_file_chunks()
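
# Hedged sketch: both reindex functions call FileProcessLock.lock() up front and only
# unlock on the success path, so an exception mid-run leaves the lock held. A wrapper
# like the one below (hypothetical, not part of the original module, and assuming
# FileProcessLock.unlock() is safe to call unconditionally) would guarantee the lock
# is released:
def run_reindex_safely(reindex_function, *args):
    try:
        reindex_function(*args)
    finally:
        FileProcessLock.unlock()
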
Example #3
    print "migrate_upload_trackers..."
    migrate_upload_trackers()


if __name__ == '__main__':
    # lookup dicts used while migrating Mongo documents to their Django equivalents
    study_referents = {}
    study_id_dict = {}
    user_id_dict = {}
    survey_id_dict = {}

    orphaned_surveys = {}

    d_study_admin_list = []  # A list of study-researcher pairs
    d_study_survey_dict = {}  # A mapping of surveys to their associated studies
    d_study_settings_dict = {}  # A mapping of device settings to their associated studies

    CHUNK_SIZE = 10000

    # error_handler = ErrorHandler()
    error_handler = null_error_handler()
    # collection counts on the Mongo side, before running the migrations
    print(MStudySet.count(), MSurveySet.count(), MSettingsSet.count(),
          MAdminSet.count(), MUserSet.count(), MChunkSet.count(), MUploadSet.count())
    with error_handler:
        run_all_migrations()
    # the corresponding counts on the Django side, after the migrations
    print(DStudy.objects.count(), DSurvey.objects.count(), DSettings.objects.count(),
          DAdmin.objects.count(), DUser.objects.count(), DChunk.objects.count(), DUpload.objects.count())
    print("end:", datetime.now())

    error_handler.raise_errors()
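
# Hedged follow-up sketch: the two count printouts in the __main__ block can be turned
# into an explicit sanity check after run_all_migrations(). The pairing of Mongo
# collections to Django models below is read off those print statements and should be
# treated as an assumption; check_migration_counts() could be called right after
# run_all_migrations().
def check_migration_counts():
    pairs = [
        (MStudySet, DStudy), (MSurveySet, DSurvey), (MSettingsSet, DSettings),
        (MAdminSet, DAdmin), (MUserSet, DUser), (MChunkSet, DChunk),
        (MUploadSet, DUpload),
    ]
    for mongo_collection, django_model in pairs:
        if mongo_collection.count() != django_model.objects.count():
            print("count mismatch:", mongo_collection, django_model)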