Example #1
 def clear_missing(self, verbose=False):
     conn = Solr(settings.SOLR_URL)
     start = 0
     to_delete = []
     pb = None
     if verbose: print("Checking for indexed records no longer in database")
     while True:
         if verbose and pb: pb.update(start)
         result = conn.search('*:*',
                              sort='id asc',
                              start=start,
                              rows=500,
                              fields=['id'])
         if not result:
             break
         if verbose and not pb: pb = ProgressBar(result.hits)
         ids = [int(r['id']) for r in result]
         # remove ids that still exist in the database; whatever is left in
         # this batch is stale and can be deleted from the index
         records = Record.objects.filter(id__in=ids).values_list('id',
                                                                 flat=True)
         for r in records:
             ids.remove(r)
         to_delete.extend(ids)
         start += 500
     if verbose and pb: pb.done()
     pb = None
     if verbose and to_delete:
         print("Removing unneeded records from index")
         pb = ProgressBar(len(to_delete))
     # delete the stale documents from the index in batches of 500
     while to_delete:
         if verbose and pb: pb.update(pb.total - len(to_delete))
         conn.delete(q='id:(%s)' % ' '.join(map(str, to_delete[:500])))
         to_delete = to_delete[500:]
     if verbose and pb: pb.done()
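Every example in this listing drives the same three-call progress API: construct ProgressBar(total), call update(done) as work proceeds, and call done() at the end. The helper itself never appears in the listing; a minimal stand-in consistent with these call sites (including the pb.total attribute read above) might look like this:

import sys

class ProgressBar(object):
    # minimal stand-in for the progress helper these examples assume
    def __init__(self, total):
        self.total = max(total, 1)  # guard against empty work sets

    def update(self, done):
        percent = min(100 * done // self.total, 100)
        bar = '#' * (percent // 2)
        sys.stdout.write('\r[%-50s] %3d%%' % (bar, percent))
        sys.stdout.flush()

    def done(self):
        self.update(self.total)
        sys.stdout.write('\n')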
Example #2
    def handle(self, *args, **kwargs):

        data_file = kwargs.get('data_file')
        # materialize the ids as a list; a lazy map() object would be
        # exhausted after the first query that iterates it
        collections = [int(c) for c in kwargs.get('collections') or []]
        separator = kwargs.get('separator')

        fields = list(
            Field.objects
            .filter(fieldvalue__record__collection__in=collections)
            .distinct()
        )

        with open(data_file, 'w', newline='') as csvfile:

            fieldnames = [field.full_name for field in fields]
            fieldnames.extend(['__file__', '__path__'])

            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            records = Record.objects.filter(collection__in=collections)
            pb = ProgressBar(records.count())

            for count, record in enumerate(records):

                values = record.get_fieldvalues()
                media = list(record.media_set.select_related('storage').all())

                # emit as many CSV rows per record as needed to flush
                # repeated field values and additional media files
                while values or media:
                    row = dict()
                    extra_values = list()

                    for value in values:
                        fieldname = value.field.full_name
                        v = value.value
                        if fieldname in row:
                            # repeated field: join with the separator if one
                            # was given, otherwise defer to an overflow row
                            if not separator:
                                extra_values.append(value)
                            else:
                                row[fieldname] += separator + v
                        else:
                            row[fieldname] = v

                    if media:
                        m = media.pop()
                        row['__file__'] = m.url
                        row['__path__'] = m.storage.base

                    writer.writerow(row)
                    values = extra_values

                pb.update(count)

            pb.done()
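When a field repeats and no separator is given, the extra values spill onto follow-up rows, so one record can span several CSV rows; DictWriter fills the columns missing from those overflow rows with its restval default, an empty string. In miniature:

import csv
import sys

writer = csv.DictWriter(sys.stdout, fieldnames=['title', '__file__'])
writer.writeheader()
writer.writerow({'title': 'Sunset', '__file__': 'sunset.jpg'})
writer.writerow({'title': 'Sunset (variant title)'})  # __file__ left blank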
Example #4
    def handle(self, *args, **kwargs):

        updated = 0
        pb = ProgressBar(Media.objects.count())
        for count, media in enumerate(Media.objects.all()):
            # derive the canonical name from the file name in the media URL
            name = slugify(os.path.splitext(os.path.basename(media.url))[0])
            if name != media.name:
                media.name = name
                media.save(force_update_name=True)
                updated += 1

            pb.update(count)

        pb.done()

        print "Updated %d media objects" % updated
Example #5
    def index(self, verbose=False, all=False, collections=None):
        from .models import SolrIndexUpdates
        self._build_group_tree()
        core_fields = dict(
            (f, f.get_equivalent_fields())
            for f in Field.objects.filter(standard__prefix='dc'))
        # add VRA Title to support work titles
        try:
            vra_title = Field.objects.get(name='title', standard__prefix='vra')
            core_fields[vra_title] = vra_title.get_equivalent_fields()
        except Field.DoesNotExist:
            pass
        count = 0
        batch_size = 100
        process_thread = None
        if all:
            query = Record.objects.all()
            if collections:
                query = query.filter(collection__in=collections)
            total_count = query.count()
            to_update = None
            to_delete = None
        else:
            processed_updates = []
            to_update = []
            to_delete = []
            updates = SolrIndexUpdates.objects.all()[:batch_size].values_list(
                'id', 'record', 'delete')
            for id, record, delete in updates:
                processed_updates.append(id)
                if delete:
                    to_delete.append(record)
                else:
                    to_update.append(record)
            total_count = len(to_update)

        if not all and not to_update and not to_delete:
            logger.info("Nothing to update in index, returning early")
            return 0

        conn = Solr(settings.SOLR_URL)
        if to_delete:
            conn.delete(q='id:(%s)' % ' '.join(map(str, to_delete)))

        primary_work_record_manager = PrimaryWorkRecordManager()

        if verbose:
            pb = ProgressBar(total_count)

        def get_method(method):
            module, _, function = method.rpartition('.')
            try:
                __import__(module)
                mod = sys.modules[module]
                return getattr(mod, function)
            except Exception as ex:
                logging.debug(
                    "Could not import custom Solr record indexer %s: %s",
                    method, ex)
Example #6
    def remove(self):

        common = self.check()

        print "Removing unneeded media objects"

        pb = ProgressBar(len(common))
        count = 0

        for id in common:
            m = Media.objects.filter(record__id=id)
            # drop derived copies stored under Windows-style path prefixes
            m.filter(url__startswith='medium\\').delete()
            m.filter(url__startswith='thumb\\').delete()
            count += 1
            pb.update(count)

        pb.done()
Example #7
 def handle(self, *prefix, **options):
     if not prefix:
         print(self.help)
     else:
         # positional arguments arrive as a tuple; use the first one as the
         # prefix to strip (len() on the tuple would count arguments,
         # not characters)
         prefix = prefix[0]
         count = updated = 0
         total = Media.objects.count()
         pb = ProgressBar(total)
         for i in range(0, total, 100):
             for media in Media.objects.all()[i:i+100]:
                 if media.url.startswith(prefix):
                     media.url = media.url[len(prefix):]
                     media.save()
                     updated += 1
                 count += 1
                 pb.update(count)
             reset_queries()
         pb.done()
         print("Updated %d/%d media objects" % (updated, count))
Example #8
    def handle(self, *args, **kwargs):

        updated = 0

        id_fields = standardfield('identifier', equiv=True)
        titles = FieldValue.objects.select_related('record').filter(field__in=id_fields)
        pb = ProgressBar(titles.count())

        for count, title in enumerate(titles):
            name = slugify(title.value)
            if name != title.record.name:
                title.record.name = name
                title.record.save(force_update_name=True)
                updated += 1

            pb.update(count)

        pb.done()

        print "Updated %d record objects" % updated
Example #9
    def handle(self, *args, **kwargs):

        coll = kwargs.get('collection')

        if not coll:
            print("--collection is a required parameter")
            return

        if coll.isdigit():
            collection = Collection.objects.get(id=coll)
        else:
            collection = Collection.objects.get(name=coll)

        admins = User.objects.filter(is_superuser=True)
        if admins:
            admin = admins[0]
        else:
            admin = None

        pb = ProgressBar(collection.records.count())
        for count, record in enumerate(collection.records.all()):

            get_thumbnail_for_record(record, admin)
            get_thumbnail_for_record(record, admin, crop_to_square=True)

            pb.update(count)

        pb.done()
Example #10
 def refresh(self):
     count = 0
     total = Media.objects.count()
     pb = ProgressBar(total)
     for i in range(0, total, 1000):
         for media in Media.objects.all()[i:i+1000]:
             media.identify()
             count += 1
             pb.update(count)
         # with DEBUG on, Django logs every query; clear the log after each
         # batch so long runs don't exhaust memory
         reset_queries()
     pb.done()
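The batching in refresh() exists to keep memory flat: slicing re-queries in chunks and reset_queries() clears Django's debug query log. A rough equivalent using QuerySet.iterator(), which streams rows instead of caching the whole result set, might look like this (a sketch, assuming the same Media model and ProgressBar helper):

from django.db import reset_queries

def refresh(self):
    total = Media.objects.count()
    pb = ProgressBar(total)
    for count, media in enumerate(Media.objects.iterator(), start=1):
        media.identify()
        pb.update(count)
        if count % 1000 == 0:
            reset_queries()  # still needed if DEBUG keeps a query log
    pb.done()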
Example #11
    def handle(self, *args, **kwargs):

        data_file = kwargs.get('data_file')
        # a plain list, not a lazy map(), since it is iterated twice below
        collections = [int(c) for c in kwargs.get('collections') or []]
        separator = kwargs.get('separator')

        fields = list(
            Field.objects
            .filter(fieldvalue__record__collection__in=collections)
            .distinct()
        )

        with open(data_file, 'w', newline='') as csvfile:

            fieldnames = [field.full_name for field in fields]
            fieldnames.extend(['__file__', '__path__'])

            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            records = Record.objects.filter(collection__in=collections)
            pb = ProgressBar(records.count())

            for count, record in enumerate(records):

                values = record.get_fieldvalues()
                media = list(record.media_set.select_related('storage').all())

                while values or media:
                    row = dict()
                    extra_values = list()

                    for value in values:
                        fieldname = value.field.full_name
                        v = value.value
                        if fieldname in row:
                            if not separator:
                                extra_values.append(value)
                            else:
                                row[fieldname] += separator + v
                        else:
                            row[fieldname] = v

                    if media:
                        m = media.pop()
                        row['__file__'] = m.url
                        row['__path__'] = m.storage.base

                    writer.writerow(row)
                    values = extra_values

                pb.update(count)

            pb.done()
Example #14
 def handle(self, *prefix, **options):
     if not prefix:
         print(self.help)
     else:
         # positional arguments arrive as a tuple; use the first one as the
         # prefix to strip (len() on the tuple would count arguments,
         # not characters)
         prefix = prefix[0]
         count = updated = 0
         total = Media.objects.count()
         pb = ProgressBar(total)
         for i in range(0, total, 100):
             for media in Media.objects.all()[i:i + 100]:
                 if media.url.startswith(prefix):
                     media.url = media.url[len(prefix):]
                     media.save()
                     updated += 1
                 count += 1
                 pb.update(count)
             reset_queries()
         pb.done()
         print("Updated %d/%d media objects" % (updated, count))
Example #15
    def handle(self, *args, **kwargs):

        updated = 0

        id_fields = standardfield('identifier', equiv=True)
        titles = FieldValue.objects.select_related('record').filter(
            field__in=id_fields)
        pb = ProgressBar(titles.count())

        for count, title in enumerate(titles):
            name = slugify(title.value)
            if name != title.record.name:
                title.record.name = name
                title.record.save(force_update_name=True)
                updated += 1

            pb.update(count)

        pb.done()

        print("Updated %d record objects" % updated)
Example #16
    def handle(self, from_collection, to_collections, commit, *args,
               **options):

        if not from_collection or not to_collections:
            print("Error: Must specify --from and --to arguments")
            return

        print("Mapping presentation items from collection %s to " \
            "collection(s) %s" % (from_collection, to_collections))

        idfields = standardfield_ids('identifier', equiv=True)

        print("Fetching identifiers")

        query = FieldValue.objects.filter(
            field__in=idfields,
            record__collectionitem__collection=from_collection,
            owner=None,
            context_type=None,
            hidden=False).values_list('value', 'record')

        record_to_id = dict()
        for identifier, record in query:
            record_to_id.setdefault(record, []).append(identifier)

        print("Fetching target records")

        query = FieldValue.objects.filter(
            field__in=idfields,
            record__collectionitem__collection__in=to_collections,
            owner=None,
            context_type=None,
            hidden=False).values_list('value', 'record')

        id_to_record = dict()

        for identifier, record in query:
            id_to_record.setdefault(identifier, []).append(record)

        print("Mapping presentation items")
        remapped = 0
        errors = []

        items = PresentationItem.objects.filter(
            record__collectionitem__collection=from_collection)
        pb = ProgressBar(len(items))

        for count, item in enumerate(items):
            identifiers = record_to_id.get(item.record_id)
            if identifiers:
                for identifier in identifiers:
                    new_records = id_to_record.get(identifier)
                    if new_records:
                        if len(new_records) == 1:
                            remapped += 1
                            if commit:
                                item.record_id = new_records[0]
                                item.save()
                            break
                        else:
                            errors.append(
                                "Multiple matching records with identifier "
                                "'%s' found in collection %s: %s" %
                                (identifier, to_collections,
                                 sorted(new_records)))
                    else:
                        errors.append(
                            "No record with identifier '%s' found in "
                            "collection %s" % (identifier, to_collections))
            else:
                errors.append("No identifier found for record %s" %
                              item.record_id)
            pb.update(count)

        pb.done()

        errors = sorted(set(errors))

        if commit:
            print("Remapped %s items" % remapped)
        else:
            print("Would have remapped %s items - rerun with --commit" % \
                remapped)
        if errors:
            print("%s unique errors occurred:" % len(errors))
            print('\n'.join(errors))
Example #17
    def index(self, verbose=False, all=False):
        from .models import SolrIndexUpdates
        self._build_group_tree()
        conn = Solr(settings.SOLR_URL)
        core_fields = dict(
            (f, f.get_equivalent_fields())
            for f in Field.objects.filter(standard__prefix='dc'))
        count = 0
        batch_size = 500
        process_thread = None
        if all:
            total_count = Record.objects.count()
        else:
            processed_updates = []
            to_update = []
            to_delete = []
            updates = SolrIndexUpdates.objects.all()[:batch_size].values_list(
                'id', 'record', 'delete')
            for id, record, delete in updates:
                processed_updates.append(id)
                if delete:
                    to_delete.append(record)
                else:
                    to_update.append(record)
            if to_delete:
                conn.delete(q='id:(%s)' % ' '.join(map(str, to_delete)))
            total_count = len(to_update)

        if verbose: pb = ProgressBar(total_count)
        while True:
            if verbose: pb.update(count)
            if all:
                record_ids = Record.objects.all()[
                    count:count + batch_size].values_list('id', flat=True)
            else:
                record_ids = Record.objects.filter(id__in=to_update)[
                    count:count + batch_size].values_list('id', flat=True)
            if not record_ids:
                break
            # convert to plain list, because Django's value lists will add a LIMIT clause when used
            # in an __in query, which causes MySQL to break
            record_ids = list(record_ids)
            media_dict = self._preload_related(Media, record_ids)
            fieldvalue_dict = self._preload_related(FieldValue, record_ids, related=2)
            groups_dict = self._preload_related(CollectionItem, record_ids)
            count += len(record_ids)

            # NOTE: record_ids is passed in explicitly; the worker thread runs
            # while the next loop iteration rebinds the outer variable, so a
            # closure over it could read the wrong batch
            def process_data(groups, fieldvalues, media, record_ids):
                def process():
                    docs = []
                    for record in Record.objects.filter(id__in=record_ids):
                        docs.append(self._record_to_solr(
                            record, core_fields, groups.get(record.id, []),
                            fieldvalues.get(record.id, []),
                            media.get(record.id, [])))
                    conn.add(docs)
                return process

            if process_thread:
                process_thread.join()
            process_thread = Thread(target=process_data(
                groups_dict, fieldvalue_dict, media_dict, record_ids))
            process_thread.start()
            reset_queries()

        if process_thread:
            process_thread.join()
        if verbose: pb.done()

        if all:
            SolrIndexUpdates.objects.filter(delete=False).delete()
        else:
            SolrIndexUpdates.objects.filter(id__in=processed_updates).delete()
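The comment in the middle of this example points at a real Django/MySQL interaction: passing a sliced values_list straight into an __in lookup nests it as a subquery with a LIMIT clause, which MySQL rejects. Forcing the ids into a plain list keeps the final query flat; schematically:

# nested subquery with LIMIT -- MySQL raises an error:
ids = Record.objects.all()[:500].values_list('id', flat=True)
stale = Media.objects.filter(record__id__in=ids)

# evaluated up front -- the ids are inlined into a flat IN (...) clause:
ids = list(Record.objects.all()[:500].values_list('id', flat=True))
stale = Media.objects.filter(record__id__in=ids)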
Example #18
                     type = self.type,
                     original_id = self.key(row),
                     content_hash = hash,
                 )
                 self.added += 1
             else:
                 logging.error("No instance created: %s %s" % (self.model_name, self.key(row)))
                 self.errors += 1
     count += 1
     if not (count % 1000): reset_queries()
     if pb: pb.update(count)
 if pb: pb.done()
 reset_queries()
 if self.object_history and self.supports_deletion:
     print "Removing unused objects"
     pb = ProgressBar(len(self.object_history))
     count = 0
     for oid, o in self.object_history.items():
         if self.preserve_memory:
             o = ObjectHistory.objects.get(content_type=self.content_type,
                      m2m_content_type=self.m2m_content_type,
                      type=self.type,
                      original_id=oid)
         # these objects have been deleted since the last migration
         if not self.m2m_model:
             self.model.objects.filter(id=o.object_id).delete()
         else:
             self.m2m_delete(object_id=o.object_id, m2m_object_id=o.m2m_object_id)
         logging.debug('%s %s not in source, deleting' % (self.model_name, o.original_id))
         self.deleted += 1
         o.delete()
Example #19
    def index(self, verbose=False, all=False, collections=None):
        from .models import SolrIndexUpdates
        self._build_group_tree()
        core_fields = dict(
            (f, f.get_equivalent_fields())
            for f in Field.objects.filter(standard__prefix='dc')
        )
        # add VRA Title to support work titles
        try:
            vra_title = Field.objects.get(name='title', standard__prefix='vra')
            core_fields[vra_title] = vra_title.get_equivalent_fields()
        except Field.DoesNotExist:
            pass
        count = 0
        batch_size = 100
        process_thread = None
        if all:
            query = Record.objects.all()
            if collections:
                query = query.filter(collection__in=collections)
            total_count = query.count()
            to_update = None
            to_delete = None
        else:
            processed_updates = []
            to_update = []
            to_delete = []
            updates = SolrIndexUpdates.objects.all()[:batch_size].values_list(
                'id', 'record', 'delete')
            for id, record, delete in updates:
                processed_updates.append(id)
                if delete:
                    to_delete.append(record)
                else:
                    to_update.append(record)
            total_count = len(to_update)

        if not all and not to_update and not to_delete:
            logger.info("Nothing to update in index, returning early")
            return 0

        conn = Solr(settings.SOLR_URL)
        if to_delete:
            conn.delete(q='id:(%s)' % ' '.join(map(str, to_delete)))

        primary_work_record_manager = PrimaryWorkRecordManager()

        if verbose:
            pb = ProgressBar(total_count)

        def get_method(method):
            module, _, function = method.rpartition('.')
            try:
                __import__(module)
                mod = sys.modules[module]
                return getattr(mod, function)
            except Exception as ex:
                logging.debug(
                    "Could not import custom Solr record indexer %s: %s",
                    method, ex)

        def get_custom_doc_processor():
            method = getattr(settings, 'SOLR_RECORD_INDEXER', None)
            if method:
                method = get_method(method)
            return method or (lambda doc, **kwargs: doc)

        def get_custom_doc_pre_processor():
            method = getattr(settings, 'SOLR_RECORD_PRE_INDEXER', None)
            if method:
                method = get_method(method)
            return method or (lambda **kwargs: None)

        custom_doc_processor = get_custom_doc_processor()
        custom_doc_pre_processor = get_custom_doc_pre_processor()

        while True:
            if verbose:
                pb.update(count)
            if all:
                records = Record.objects.all()
                if collections:
                    records = records.filter(collection__in=collections)
            else:
                records = Record.objects.filter(id__in=to_update)
            records = records[count:count + batch_size]
            record_ids = records.values_list('id', flat=True)
            if not record_ids:
                break
            # convert to plain list, because Django's value lists will add a
            # LIMIT clause when used in an __in query, which causes MySQL to
            # break.  (ph): also, made an explicit separate value for this
            record_id_list = list(record_ids)
            media_dict = self._preload_related(Media, record_id_list)
            fieldvalue_dict = self._preload_related(FieldValue, record_id_list,
                                                    fields=('field',))
            groups_dict = self._preload_related(CollectionItem, record_id_list)

            image_to_works = self._preload_image_to_works(record_id_list)
            work_to_images = self._preload_work_to_images(record_id_list)

            implicit_primary_work_records = primary_work_record_manager \
                .get_implicit_primary_work_records(record_id_list)

            count += len(record_id_list)

            # VERY IMPORTANT:  SINCE process_data RUNS IN ANOTHER THREAD, IT
            # CANNOT DIRECTLY ACCESS ANY VARIABLES FROM THE OUTER SCOPE
            # ALWAYS PASS IN ANY NEEDED VARIABLES

            def process_data(groups, fieldvalues, media, record_id_list,
                             image_to_works, work_to_images,
                             implicit_primary_work_records):
                def process():
                    docs = []
                    for record in Record.objects.filter(id__in=record_id_list):
                        g = groups.get(record.id, [])
                        fv = fieldvalues.get(record.id, [])
                        m = media.get(record.id, [])
                        custom_doc_pre_processor(
                            record=record,
                            core_fields=core_fields,
                            groups=g,
                            fieldvalues=fv,
                            media=m,
                        )
                        doc = self._record_to_solr(
                            record, core_fields, g, fv, m,
                            image_to_works,
                            work_to_images,
                            implicit_primary_work_records
                        )
                        doc = custom_doc_processor(
                            doc,
                            record=record,
                            core_fields=core_fields,
                            groups=g,
                            fieldvalues=fv,
                            media=m,
                        )
                        docs.append(doc)
                    conn.add(docs)
                return process

            if process_thread:
                process_thread.join()
            process_thread = Thread(
                target=process_data(groups_dict, fieldvalue_dict,
                                    media_dict, record_id_list,
                                    image_to_works, work_to_images,
                                    implicit_primary_work_records))
            process_thread.start()
            reset_queries()

        if process_thread:
            process_thread.join()
        if verbose:
            pb.done()

        if all:
            # TODO: this will remove objects that have been added
            # in the meantime
            SolrIndexUpdates.objects.filter(delete=False).delete()
        else:
            SolrIndexUpdates.objects.filter(id__in=processed_updates).delete()

        return count
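The ALL-CAPS warning in this version is the heart of the design: the worker closure must receive its data as arguments, because a closure that merely references the loop's variables sees whatever they hold when the thread finally runs, not when the closure was built. A stripped-down illustration:

from threading import Thread

def make_worker(batch):
    def work():
        print('processing', batch)  # bound at make_worker() call time
    return work

threads = []
for batch in ([1, 2], [3, 4]):
    # Thread(target=lambda: print(batch)) would instead read 'batch' when
    # the thread runs, possibly after the loop has moved on
    threads.append(Thread(target=make_worker(batch)))
for t in threads:
    t.start()
    t.join()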
Example #20
    def index(self, verbose=False, all=False):
        from .models import SolrIndexUpdates
        self._build_group_tree()
        core_fields = dict(
            (f, f.get_equivalent_fields())
            for f in Field.objects.filter(standard__prefix='dc'))
        count = 0
        batch_size = 500
        process_thread = None
        if all:
            total_count = Record.objects.count()
            to_update = None
            to_delete = None
        else:
            processed_updates = []
            to_update = []
            to_delete = []
            updates = SolrIndexUpdates.objects.all()[:batch_size].values_list(
                'id', 'record', 'delete')
            for id, record, delete in updates:
                processed_updates.append(id)
                if delete:
                    to_delete.append(record)
                else:
                    to_update.append(record)
            total_count = len(to_update)

        if not all and not to_update and not to_delete:
            logger.info("Nothing to update in index, returning early")
            return 0

        conn = Solr(settings.SOLR_URL)
        if to_delete:
            conn.delete(q='id:(%s)' % ' '.join(map(str, to_delete)))

        if verbose:
            pb = ProgressBar(total_count)
        while True:
            if verbose:
                pb.update(count)
            if all:
                records = Record.objects.all()
            else:
                records = Record.objects.filter(id__in=to_update)
            records = records[count:count + batch_size]
            record_ids = records.values_list('id', flat=True)
            if not record_ids:
                break
            # convert to plain list, because Django's value lists will add a
            # LIMIT clause when used in an __in query, which causes MySQL to
            # break.  (ph): also, made an explicit separate value for this
            record_id_list = list(record_ids)
            media_dict = self._preload_related(Media, record_id_list)
            fieldvalue_dict = self._preload_related(FieldValue,
                                                    record_id_list,
                                                    related=2)
            groups_dict = self._preload_related(CollectionItem, record_id_list)
            count += len(record_id_list)

            def process_data(groups, fieldvalues, media, record_id_list):
                def process():
                    docs = []
                    for record in Record.objects.filter(id__in=record_id_list):
                        docs.append(
                            self._record_to_solr(
                                record, core_fields, groups.get(record.id, []),
                                fieldvalues.get(record.id, []),
                                media.get(record.id, [])))
                    conn.add(docs)

                return process

            if process_thread:
                process_thread.join()
            process_thread = Thread(target=process_data(
                groups_dict, fieldvalue_dict, media_dict, record_id_list))
            process_thread.start()
            reset_queries()

        if process_thread:
            process_thread.join()
        if verbose:
            pb.done()

        if all:
            SolrIndexUpdates.objects.filter(delete=False).delete()
        else:
            SolrIndexUpdates.objects.filter(id__in=processed_updates).delete()

        return count
Example #21
                     original_id=self.key(row),
                     content_hash=hash,
                 )
                 self.added += 1
             else:
                 logging.error("No instance created: %s %s" %
                               (self.model_name, self.key(row)))
                 self.errors += 1
     count += 1
     if not (count % 1000): reset_queries()
     if pb: pb.update(count)
 if pb: pb.done()
 reset_queries()
 if self.object_history and self.supports_deletion:
     print "Removing unused objects"
     pb = ProgressBar(len(self.object_history))
     count = 0
     to_delete = []  # collect objects and delete them in one batch for speed
     for oid, o in self.object_history.items():
         if self.preserve_memory:
             o = ObjectHistory.objects.get(
                 content_type=self.content_type,
                 m2m_content_type=self.m2m_content_type,
                 type=self.type,
                 original_id=oid)
         # these objects have been deleted since the last migration
         logging.debug('%s %s not in source, deleting' %
                       (self.model_name, o.original_id))
         if not self.m2m_model:
             to_delete.append(o)
Example #22
    def handle(self, *args, **kwargs):

        system_field = get_system_field()

        # a plain list, not a lazy map(), so the emptiness check below works
        collections = [int(c) for c in kwargs.get('collections') or []]
        mapping_file = kwargs.get('mapping_file')

        if not collections:
            print("--collection is a required parameter")
            return

        if not mapping_file:
            print("--mapping is a required parameter")
            return

        mappings = dict()
        with open(mapping_file, newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                mappings[row['Identifier']] = (row['Work'], row['Primary'])

        related_field = Field.objects.get(
            standard__prefix='dc',
            name='relation',
        )

        existing_works = FieldValue.objects.filter(
            record__collection__in=collections,
            field=related_field,
            refinement='IsPartOf',
        )

        # Clean out old relations
        print "Deleting old works info"
        existing_works.delete()

        id_fields = standardfield_ids('identifier', equiv=True)

        print "Fetching records"
        identifiers = FieldValue.objects.select_related('record').filter(
            record__collection__in=collections,
            field__in=id_fields,
        )

        pb = ProgressBar(identifiers.count())

        # Insert new relations
        for count, identifier in enumerate(identifiers):

            work, isprimary = mappings.get(identifier.value, (None, False))
            # the CSV stores booleans as text
            isprimary = isprimary == 'True'
            if not work:
                print("Warning: no entry found for identifier '%s'" %
                      identifier.value)
                continue

            FieldValue.objects.create(record=identifier.record,
                                      field=related_field,
                                      refinement='IsPartOf',
                                      value=work,
                                      hidden=True)

            fv = list(
                FieldValue.objects.filter(record=identifier.record,
                                          field=system_field,
                                          label='primary-work-record'))
            if len(fv) > 0:
                if not isprimary:
                    for f in fv:
                        f.delete()
            elif isprimary:
                FieldValue.objects.create(
                    record=identifier.record,
                    field=system_field,
                    label='primary-work-record',
                    value=work,
                    hidden=True,
                )

            pb.update(count)

        pb.done()
Example #23
    def handle(self, *args, **kwargs):

        mapping_file = kwargs.get('mapping_file')
        collections = [int(c) for c in kwargs.get('collections') or []]

        if not mapping_file or not collections:
            print("--collection and --mapping are required parameters")
            return

        works = dict()

        # 'rU' mode is deprecated; newline='' is the csv-friendly equivalent
        with open(mapping_file, newline='') as mappings:
            reader = csv.DictReader(mappings)
            for row in reader:
                identifier = row['ImageFileName']
                work = row['fk_WorkID']
                works.setdefault(work, []).append(identifier)

        # Clean out old relations
        FieldValue.objects.filter(
            record__collection__in=collections,
            field__standard__prefix='dc',
            field__name='relation',
            refinement='IsPartOf',
        ).delete()

        related_field = Field.objects.get(
            standard__prefix='dc',
            name='relation',
        )

        id_fields = standardfield_ids('identifier', equiv=True)

        print "Caching record identifiers"
        identifiers = dict()
        values = FieldValue.objects.select_related('record').filter(
            record__collection__in=collections, field__in=id_fields)
        for fv in values:
            identifiers[fv.value] = fv.record.id

        pb = ProgressBar(len(works))

        # Insert new relations
        for count, work in enumerate(works.values()):
            primary = work[0]
            items = work[1:]
            for item in items:
                options = [item]
                if item.lower().endswith('.jpg'):
                    options.append(item[:-4])
                record = None
                for option in options:
                    record = identifiers.get(option)
                    if record:
                        break
                else:
                    # no identifier variant matched; skip this item
                    continue
                FieldValue.objects.create(record=Record.objects.get(id=record),
                                          field=related_field,
                                          refinement='IsPartOf',
                                          value=primary)

            pb.update(count)

        pb.done()
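The loop above relies on Python's for/else: the else branch runs only when the loop completes without break, which makes it a natural "nothing matched" handler for lookup loops. In miniature:

identifiers = {'photo': 42}

for option in ('photo.jpg', 'photo'):
    record = identifiers.get(option)
    if record:
        break   # found a match; the else branch is skipped
else:
    record = None   # runs only if no break occurred

print(record)  # -> 42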
Example #24
    def run(self, step=None, steps=None):
        def compare_hash(historic, current):
            if self.preserve_memory:
                return historic == int(current, 16)
            else:
                return historic.content_hash == current

        print "\n%sMigrating %s" % ('Step %s of %s: ' %
                                    (step, steps) if step and steps else '',
                                    self.model_name)
        r = re.match('^SELECT (.+) FROM (.+)$', self.query)
        pb = ProgressBar(
            list(self.cursor.execute("SELECT COUNT(*) FROM %s" %
                                     r.groups()[1]))[0][0]) if r else None
        count = 0
        merged_ids = dict()
        for row in self.cursor.execute(self.query):
            hash = self.hash(row)
            h = self.object_history.pop(self.key(row), None)
            create = True
            if h:
                if compare_hash(h, hash) or self.m2m_model:
                    # object unchanged, nothing to do; or we're working on a
                    # many-to-many relation, where nothing needs to happen on
                    # the instance itself
                    logging.debug('%s %s unchanged in source, skipping' %
                                  (self.model_name, self.key(row)))
                    create = False
                    self.unchanged += 1
                elif compare_hash(h, STATIC_CONTENT_HASH):
                    # object may have changed, but we don't have the hash of the previous version
                    # so we can't know.  Just store the new hash in the history to be able
                    # to track future changes
                    if self.preserve_memory:
                        h = ObjectHistory.objects.get(
                            content_type=self.content_type,
                            m2m_content_type=self.m2m_content_type,
                            type=self.type,
                            original_id=self.key(row))
                    h.content_hash = hash
                    h.save()
                    create = False
                    self.nohistory += 1
                else:
                    if self.preserve_memory:
                        h = ObjectHistory.objects.get(
                            content_type=self.content_type,
                            m2m_content_type=self.m2m_content_type,
                            type=self.type,
                            original_id=self.key(row))
                    # object changed, need to update
                    try:
                        instance = self.model.objects.get(id=h.object_id)
                        create = False
                    except ObjectDoesNotExist:
                        instance = None
                    if not instance:
                        # object has been deleted, need to recreate
                        logging.debug(
                            '%s %s changed and not in destination, recreating'
                            % (self.model_name, row.ID))
                        h.delete()
                        self.recreated += 1
                    else:
                        # update existing object
                        logging.debug('%s %s changed, updating' %
                                      (self.model_name, self.key(row)))
                        self.update(instance, row)
                        try:
                            instance.save()
                            self.post_save(instance, row)
                            h.content_hash = hash
                            h.save()
                            self.updated += 1
                        except (IntegrityError, pyodbc.IntegrityError) as ex:
                            logging.error("Integrity error: %s %s" %
                                          (self.model_name, self.key(row)))
                            logging.error(ex)
                            self.errors += 1
            if create:
                # object does not exist, need to create
                logging.debug('%s %s not in destination, creating' %
                              (self.model_name, self.key(row)))
                if not self.m2m_model:
                    try:
                        instance = self.create(row)
                        if instance:
                            instance.save()
                            self.post_save(instance, row)
                            ObjectHistory.objects.create(
                                content_type=self.content_type,
                                object_id=instance.id,
                                type=self.type,
                                original_id=self.key(row),
                                content_hash=hash,
                            )
                            self.added += 1
                        else:
                            logging.error("No instance created: %s %s" %
                                          (self.model_name, self.key(row)))
                            self.errors += 1
                    except (IntegrityError, pyodbc.IntegrityError,
                            ValueError) as ex:
                        logging.error("%s: %s %s" %
                                      (type(ex).__name__, self.model_name,
                                       self.key(row)))
                        logging.error(ex)
                        self.errors += 1
                    except MergeObjectsException as ex:
                        merged_ids[self.key(row)] = ex.instance
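Underneath all the branching, the migration turns on one idea: hash each source row, remember the hash in ObjectHistory, and on the next run compare hashes to choose between skip, update, and create. A minimal sketch of that decision core, with hypothetical stand-ins for the row and the history store:

import hashlib

def row_hash(row):
    # hypothetical stable hash over the row's fields
    return hashlib.md5(repr(sorted(row.items())).encode('utf8')).hexdigest()

def plan_action(row, history):
    old_hash = history.get(row['id'])
    if old_hash is None:
        return 'create'                   # never seen before
    if old_hash == row_hash(row):
        return 'skip'                     # unchanged since the last run
    return 'update'                       # content hash differs

history = {1: row_hash({'id': 1, 'name': 'old'})}
print(plan_action({'id': 1, 'name': 'old'}, history))  # skip
print(plan_action({'id': 1, 'name': 'new'}, history))  # update
print(plan_action({'id': 2, 'name': 'x'}, history))    # create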