""" Compare contents of all chunks matching this path, blow up if any values are different.
        (They should all be identical.) """
    collisions = {key: set() for key in keys}
    for key in keys:
        for chunk in ChunksRegistry(chunk_path=chunk_path):
            collisions[key].add(chunk[key])

    for key, collision in collisions.iteritems():
        if len(collision) > 1:
            print collisions
            raise Exception(
                "encountered bad duplicate chunk requiring manual purge.")


# --- Duplicate-chunk detection and purge (Python 2 script section) ---
# NOTE(review): `duplicate_chunk_paths` and `ChunksRegistry` are defined
# earlier in the file (not visible here); presumably duplicate_chunk_paths
# starts out as an empty set -- verify against the preceding section.
t1 = datetime.now()
# Pass 1: scan every chunk; any chunk_path registered more than once is a
# duplicate.  Paths encountered a second time are printed for visibility.
for chunk in ChunksRegistry.iterator():
    if ChunksRegistry.count(chunk_path=chunk['chunk_path']) > 1:
        if chunk['chunk_path'] in duplicate_chunk_paths:
            print chunk['chunk_path']
        duplicate_chunk_paths.add(chunk['chunk_path'])
        # Raises if the duplicate chunks disagree on any compared value,
        # forcing a manual purge instead of the automatic one below.
        duplicate_chunk_path_severity(chunk['chunk_path'])

t2 = datetime.now()
print "discovered %s duplicate chunks in %s seconds" % (
    len(duplicate_chunk_paths), (t2 - t1).total_seconds())

# Pass 2: purge.  Remove one copy at a time until a single chunk remains for
# each duplicated path.  The severity check above established that the copies
# held identical values for the compared keys, so which copies are removed
# should not matter.
for path in duplicate_chunk_paths:
    while ChunksRegistry.count(chunk_path=path) > 1:
        ChunksRegistry(chunk_path=path)[0].remove()
        print "purging", path, ChunksRegistry.count(chunk_path=path)
# Example #2  (scraping artifact from the original source listing)
def migrate_chunk_registries():
    """ Migrate every Mongo chunk registry (MChunkSet) into Django DChunk rows.

    Walks MChunkSet.iterator(), resolves each chunk's study / user / survey to
    the already-migrated Django objects (creating dummy studies and surveys for
    dangling references, skipping chunks whose user is missing entirely), and
    bulk-creates DChunk rows in batches of CHUNK_SIZE. """
    d_chunk_list = []
    num_registries_handled = 0
    num_bulk_creates = 0
    for m_chunk in MChunkSet.iterator():

        with error_handler:
            try:
                d_study_info = study_id_dict[m_chunk.study_id]
            except KeyError:
                msg = 'Study {} referenced in chunk but does not exist, creating it.'.format(m_chunk.study_id)
                print(msg)
                # BUGFIX: the original never reassigned d_study_info here, so
                # the DChunk below was built against the *previous* chunk's
                # study (or raised NameError on the first iteration).  Build
                # the mapping entry from the newly created dummy study and
                # cache it so later chunks for this study resolve normally.
                # (assumes create_dummy_study returns the created study, by
                # analogy with create_dummy_survey below -- TODO confirm)
                new_study = create_dummy_study(m_chunk.study_id)
                d_study_info = {'pk': new_study.pk, 'deleted': False}
                study_id_dict[m_chunk.study_id] = d_study_info
            try:
                d_user_info = user_id_dict[m_chunk.user_id]
            except KeyError:
                # Unlike studies, we cannot fabricate a participant; skip.
                msg = 'User {} referenced in chunk but does not exist.'.format(m_chunk.user_id)
                print(msg)
                continue

            # some chunks have survey_ids that are string representations of
            # objectids, fix.  (and sometimes this can be an empty string,
            # handle that too.)
            if m_chunk.survey_id and isinstance(m_chunk.survey_id, (str, unicode)):
                m_chunk.survey_id = ObjectId(m_chunk.survey_id)

            if not m_chunk.survey_id:
                d_survey_pk = None
            elif m_chunk.survey_id in survey_id_dict:
                d_survey_pk = survey_id_dict[m_chunk.survey_id]['pk']
            else:
                print('Survey {} referenced in chunk but does not exist, creating it.'.format(m_chunk.survey_id))
                new_survey = create_dummy_survey(m_chunk.survey_id, m_chunk.study_id)
                d_survey_pk = new_survey.pk

            d_chunk = DChunk(
                is_chunkable=m_chunk.is_chunkable,
                chunk_path=m_chunk.chunk_path,
                chunk_hash=m_chunk.chunk_hash or '',
                data_type=m_chunk.data_type,
                time_bin=m_chunk.time_bin,
                study_id=d_study_info['pk'],
                participant_id=d_user_info['pk'],
                survey_id=d_survey_pk,
                deleted=d_study_info['deleted'],
            )

            # full_clean() is deliberately skipped: it is slow and these rows
            # are built from already-migrated, validated source data.
            d_chunk_list.append(d_chunk)

            num_registries_handled += 1
            if num_registries_handled % CHUNK_SIZE == 0:
                num_bulk_creates += 1
                # Progress marker every 10 bulk creates.
                if num_bulk_creates % 10 == 0:
                    print(num_bulk_creates * CHUNK_SIZE)

                # chunk_path collisions are common in legacy data; when a
                # batch trips a UNIQUE constraint, fall back to row-by-row
                # inserts so the non-colliding rows still land.
                try:
                    DChunk.objects.bulk_create(d_chunk_list)
                except IntegrityError as e:
                    # NOTE(review): chunk_path has unique=False at migration
                    # time (unique=True is added by a later Django migration),
                    # so this branch is defensive -- TODO confirm.
                    if "UNIQUE" in str(e):
                        for d_chunk in d_chunk_list:
                            if DChunk.objects.filter(chunk_path=d_chunk.chunk_path).exists():
                                try:
                                    print("duplicate path:")
                                    duplicate_chunk_path_severity(d_chunk.chunk_path)
                                    print("...nevermind.")
                                except Exception as e2:
                                    # Severity check failed: report and move on
                                    # rather than aborting the whole migration.
                                    print(d_chunk.chunk_path)
                                    print(str(e2))
                            else:
                                d_chunk.save()
                    else:
                        raise
                finally:
                    # Start a fresh batch whether or not the insert succeeded.
                    d_chunk_list = []

    # Flush the final partial batch.
    DChunk.objects.bulk_create(d_chunk_list)