def reindex_specific_data_type(data_type):
    FileProcessLock.lock()
    print "starting..."
    # this call validates the data type: it raises an error if the data type is not a recognized data stream
    file_name_key = data_stream_to_s3_file_name_string(data_type)
    relevant_chunks = ChunksRegistry(data_type=data_type)
    relevant_indexed_files = [ chunk["chunk_path"] for chunk in relevant_chunks ]
    print "purging old data..."
    for chunk in relevant_chunks: chunk.remove()

    pool = ThreadPool(20)
    pool.map(s3_delete, relevant_indexed_files)

    print "pulling files to process..."
    files_lists = pool.map(s3_list_files, [str(s._id) for s in Studies()] )
    for i,l in enumerate(files_lists):
        print str(datetime.now()), i+1, "of", str(Studies.count()) + ",", len(l), "files"
        for fp in l:
            if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS:
                FileToProcess.append_file_for_processing(fp, ObjectId(fp.split("/", 1)[0]), fp.split("/", 2)[1])
    del files_lists, l
    pool.close()
    pool.terminate()
    print str(datetime.now()), "processing data..."
    FileProcessLock.unlock()
    process_file_chunks()
    print "Done."
def reindex_all_files_to_process():
    """ Totally removes the FilesToProcess DB, deletes all chunked files on s3,
    clears the chunksregistry, and then adds all relevent files on s3 to the
    files to process registry. """
    FileProcessLock.lock()
    print str(datetime.now()), "purging FilesToProcess:", FilesToProcess.count()
    FileToProcess.db().drop()
    print str(datetime.now()), "purging existing ChunksRegistry", ChunksRegistry.count()
    ChunkRegistry.db().drop()

    pool = ThreadPool(CONCURRENT_NETWORK_OPS * 2 )

    print str(datetime.now()), "deleting older chunked data:",
    CHUNKED_DATA = s3_list_files(CHUNKS_FOLDER)
    print len(CHUNKED_DATA)
    pool.map(s3_delete, CHUNKED_DATA)
    del CHUNKED_DATA

    print str(datetime.now()), "pulling new files to process..."
    files_lists = pool.map(s3_list_files, [str(s._id) for s in Studies()] )
    print "putting new files to process..."
    for i,l in enumerate(files_lists):
        print str(datetime.now()), i+1, "of", str(Studies.count()) + ",", len(l), "files"
        for fp in l:
            if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS:
                FileToProcess.append_file_for_processing(fp, ObjectId(fp.split("/", 1)[0]), fp.split("/", 2)[1])
    del files_lists, l
    pool.close()
    pool.terminate()
    print str(datetime.now()), "processing data."
    FileProcessLock.unlock()
    process_file_chunks()
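
# Both reindex functions above repeat the same raw-key parsing when queueing files
# for processing.  A minimal sketch of that step, assuming raw s3 keys are laid out
# as <study_object_id>/<patient_id>/<data_stream>/<timestamp>.<extension>; the helper
# name is illustrative and not part of the original module.
def queue_raw_file_for_processing(file_path):
    if file_path[-4:] not in PROCESSABLE_FILE_EXTENSIONS:
        return
    study_object_id = ObjectId(file_path.split("/", 1)[0])  # first path segment is the study's ObjectId
    patient_id = file_path.split("/", 2)[1]                 # second path segment is the patient id
    FileToProcess.append_file_for_processing(file_path, study_object_id, patient_id)
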
def completely_purge_study(study_id, actually_delete=False):
    if not isinstance(study_id, ObjectId):
        study_id = ObjectId(study_id)
    study = Study(study_id)

    surveys = study["surveys"]
    device_settings = study["device_settings"]
    users = Users(study_id=study_id)
    chunks = ChunksRegistry(study_id=study_id)
    files_to_process = FilesToProcess(study_id=study_id)
    if not actually_delete:
        print "if you actually delete this you will not be able to decrypt anything " \
              "from this study.  Don't do it unless you know what you are doing."
        print study.name
        # print len(study)
        # print len(device_settings)
        print len(surveys)
        print len(users)
        print len(chunks)
        print len(files_to_process)
    else:
        StudyDeviceSettings(device_settings).remove()
        for s in surveys: Survey(s).remove()
        for u in users: User(u).remove()
        for c in chunks: ChunkRegistry(c).remove()
        for f in files_to_process: FileToProcess(f).remove()
        study.remove()
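
# Illustrative usage of completely_purge_study (names here are placeholders, not
# values from the original module): the default call is a dry run that only prints
# counts; actually_delete=True is irreversible and removes the ability to decrypt
# any data from the study.
def example_purge_dry_run(some_study_id):
    completely_purge_study(some_study_id)  # dry run: prints counts only
    # only if you are certain, re-run destructively:
    # completely_purge_study(some_study_id, actually_delete=True)
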
def check_for_bad_chunks():
    """ This function runs through all chunkable data and checks for invalid file pointers
    to s3. """
    chunked_data = set(s3_list_files(CHUNKS_FOLDER))
    bad_chunks = []
    for entry in ChunksRegistry():
        if entry.data_type in CHUNKABLE_FILES and entry.chunk_path not in chunked_data:
            bad_chunks.append(entry)
    print "bad chunks:", len(bad_chunks)
def duplicate_chunk_path_severity(chunk_path):
    """ Compare contents of all chunks matching this path, blow up if any values are different.
        (They should all be identical.) """
    chunks = ChunksRegistry(chunk_path=chunk_path)
    # compare every field present on the duplicate chunk documents
    keys = chunks[0].keys()
    collisions = {key: set() for key in keys}
    for key in keys:
        for chunk in chunks:
            collisions[key].add(chunk[key])

    for key, collision in collisions.iteritems():
        if len(collision) > 1:
            print collisions
            raise Exception(
                "encountered bad duplicate chunk requiring manual purge.")
def handle_database_query(study_id, query, registry=None):
    """ Runs the database query as a generator/iterator. """
    chunks = ChunksRegistry.get_chunks_time_range(study_id, **query)
    # no registry, just return one by one.
    if not registry:
        return chunks

    # yes registry, we need to filter and then yield
    else:

        def do_filtered_query():
            for chunk in chunks:
                if (chunk['chunk_path'] in registry and
                        registry[chunk['chunk_path']] == chunk["chunk_hash"]):
                    continue
                else:
                    yield chunk

        return do_filtered_query()
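
# A minimal usage sketch of handle_database_query (the function and argument names
# below are illustrative, not part of the original module): the registry is assumed
# to be a {chunk_path: chunk_hash} mapping from a previous export, so only new or
# changed chunks come back.
def example_changed_chunks(study_id, query, previously_seen_chunks):
    registry = {c["chunk_path"]: c["chunk_hash"] for c in previously_seen_chunks}
    return handle_database_query(study_id, query, registry=registry)
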
    """ Compare contents of all chunks matching this path, blow up if any values are different.
        (They should all be identical.) """
    collisions = {key: set() for key in keys}
    for key in keys:
        for chunk in ChunksRegistry(chunk_path=chunk_path):
            collisions[key].add(chunk[key])

    for key, collision in collisions.iteritems():
        if len(collision) > 1:
            print collisions
            raise Exception(
                "encountered bad duplicate chunk requiring manual purge.")


duplicate_chunk_paths = set()
t1 = datetime.now()
for chunk in ChunksRegistry.iterator():
    if ChunksRegistry.count(chunk_path=chunk['chunk_path']) > 1:
        if chunk['chunk_path'] in duplicate_chunk_paths:
            print chunk['chunk_path']
        duplicate_chunk_paths.add(chunk['chunk_path'])
        duplicate_chunk_path_severity(chunk['chunk_path'])

t2 = datetime.now()
print "discovered %s duplicate chunks in %s seconds" % (
    len(duplicate_chunk_paths), (t2 - t1).total_seconds())

# for each duplicated path, remove extra registry entries until exactly one copy remains
for path in duplicate_chunk_paths:
    while ChunksRegistry.count(chunk_path=path) > 1:
        ChunksRegistry(chunk_path=path)[0].remove()
        print "purging", path, ChunksRegistry.count(chunk_path=path)
def migrate_chunk_registries():
    # Iterate over every mongo chunk registry entry and bulk create the corresponding Django ChunkRegistry rows.
    
    d_chunk_list = []
    num_registries_handled = 0
    num_bulk_creates = 0
    for m_chunk in MChunkSet.iterator():
        
        with error_handler:
            try:
                d_study_info = study_id_dict[m_chunk.study_id]
            except KeyError:
                msg = 'Study {} referenced in chunk but does not exist, creating it.'.format(m_chunk['study_id'])
                print(msg)
                # create_dummy_study is assumed to register the new study in study_id_dict,
                # so the lookup is retried after creating the dummy study.
                create_dummy_study(m_chunk.study_id)
                d_study_info = study_id_dict[m_chunk.study_id]
                # raise NoSuchDatabaseObject(msg)
            try:
                d_user_info = user_id_dict[m_chunk.user_id]
            except KeyError:
                msg = 'User {} referenced in chunk but does not exist.'.format(m_chunk['user_id'])
                print(msg)
                continue
                # raise NoSuchDatabaseObject(msg)
            
            # some chunks have survey_ids that are string representations of objectids, fix.
            # (and sometimes this can be an empty string, handle that too.)
            if m_chunk.survey_id and isinstance(m_chunk.survey_id, (str, unicode)):
                m_chunk.survey_id = ObjectId(m_chunk.survey_id)
            
            if not m_chunk.survey_id:
                d_survey_pk = None
            elif m_chunk.survey_id in survey_id_dict:
                d_survey_pk = survey_id_dict[m_chunk.survey_id]['pk']
            else:
                print('Survey {} referenced in chunk but does not exist, creating it.'.format(m_chunk.survey_id))
                new_survey = create_dummy_survey(m_chunk.survey_id, m_chunk.study_id)
                d_survey_pk = new_survey.pk

            d_chunk = DChunk(
                is_chunkable=m_chunk.is_chunkable,
                chunk_path=m_chunk.chunk_path,
                chunk_hash=m_chunk.chunk_hash or '',
                data_type=m_chunk.data_type,
                time_bin=m_chunk.time_bin,
                study_id=d_study_info['pk'],
                participant_id=d_user_info['pk'],
                survey_id=d_survey_pk,
                deleted=d_study_info['deleted'],
            )
            
            # d_chunk.full_clean()  # Don't bother full cleaning, it is slow and unnecessary here.
            d_chunk_list.append(d_chunk)
            
            num_registries_handled += 1
            if num_registries_handled % CHUNK_SIZE == 0:
                # Every CHUNK_SIZE registries, bulk create the batch; print progress every 10 batches.
                num_bulk_creates += 1
                if num_bulk_creates % 10 == 0:
                    print(num_bulk_creates * CHUNK_SIZE)
                    
                # there are a lot of unique chunk path issues
                try:
                    DChunk.objects.bulk_create(d_chunk_list)
                except IntegrityError as e:
                    # This shouldn't happen: chunk_path has unique=False at the time of this
                    # migration, and only gets unique=True in a later, separate Django migration.
                    if "UNIQUE" in e.message:
                        for d_chunk in d_chunk_list:
                            if DChunk.objects.filter(chunk_path=d_chunk.chunk_path).exists():
                                try:
                                    print("duplicate path:",)
                                    duplicate_chunk_path_severity(d_chunk.chunk_path)
                                    print("...nevermind.")
                                except Exception as e2:
                                    print(d_chunk.chunk_path)
                                    print(e2.message)
                                    # raise e2
                            else:
                                d_chunk.save()
                    else:
                        raise e
                finally:
                    d_chunk_list = []
    
    DChunk.objects.bulk_create(d_chunk_list)
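
# The IntegrityError fallback above can be read as a single "flush" step.  A minimal
# sketch under the same assumptions (DChunk rows whose chunk_path may already exist in
# the destination database), omitting the duplicate_chunk_path_severity check for
# brevity; the helper name is illustrative and not part of the original migration.
def flush_chunk_batch(batch):
    try:
        DChunk.objects.bulk_create(batch)
    except IntegrityError:
        # fall back to row-by-row saves, skipping paths that already exist
        for d_chunk in batch:
            if not DChunk.objects.filter(chunk_path=d_chunk.chunk_path).exists():
                d_chunk.save()
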
    print "migrate_upload_trackers..."
    migrate_upload_trackers()


if __name__ == '__main__':
    study_referents = {}
    study_id_dict = {}
    user_id_dict = {}
    survey_id_dict = {}
    
    orphaned_surveys = {}
    
    d_study_admin_list = []  # A list of study-researcher pairs
    d_study_survey_dict = {}  # A mapping of surveys to their associated studies
    d_study_settings_dict = {}  # A mapping of device settings to their associated studies

    CHUNK_SIZE = 10000
    
    # error_handler = ErrorHandler()
    error_handler = null_error_handler()
    
    print(MStudySet.count(), MSurveySet.count(), MSettingsSet.count(),
          MAdminSet.count(), MUserSet.count(), MChunkSet.count(), MUploadSet.count())
    with error_handler:
        run_all_migrations()
    print(DStudy.objects.count(), DSurvey.objects.count(), DSettings.objects.count(),
          DAdmin.objects.count(), DUser.objects.count(), DChunk.objects.count(), DUpload.objects.count())
    print("end:", datetime.now())
    
    error_handler.raise_errors()