def clear_missing(self, verbose=False):
    conn = Solr(settings.SOLR_URL)
    start = 0
    to_delete = []
    pb = None
    if verbose:
        print "Checking for indexed records no longer in database"
    while True:
        if verbose and pb:
            pb.update(start)
        result = conn.search('*:*', sort='id asc', start=start, rows=500,
                             fields=['id'])
        if not result:
            break
        if verbose and not pb:
            pb = ProgressBar(result.hits)
        ids = [int(r['id']) for r in result]
        records = Record.objects.filter(id__in=ids).values_list('id', flat=True)
        for r in records:
            ids.remove(r)
        to_delete.extend(ids)
        start += 500
    if verbose and pb:
        pb.done()
        pb = None
    if verbose and to_delete:
        print "Removing unneeded records from index"
        pb = ProgressBar(len(to_delete))
    while to_delete:
        if verbose and pb:
            pb.update(pb.total - len(to_delete))
        conn.delete(q='id:(%s)' % ' '.join(map(str, to_delete[:500])))
        to_delete = to_delete[500:]
    if verbose and pb:
        pb.done()
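# Hedged sketch (not part of the original source): every snippet in this file
# drives a ProgressBar with the same three calls, ProgressBar(total),
# pb.update(done) and pb.done(), and clear_missing() above also reads
# pb.total. A minimal console implementation compatible with that usage could
# look like this; the 50-character bar width is an arbitrary choice.
import sys


class ProgressBar(object):

    def __init__(self, total):
        self.total = max(int(total), 1)

    def update(self, done):
        # redraw the bar in place for the given number of completed items
        width = 50
        filled = int(width * min(done, self.total) / self.total)
        sys.stdout.write('\r[%s%s] %d/%d' % (
            '=' * filled, ' ' * (width - filled), done, self.total))
        sys.stdout.flush()

    def done(self):
        # draw a full bar and move to the next line
        self.update(self.total)
        sys.stdout.write('\n')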
def handle(self, *args, **kwargs):
    data_file = kwargs.get('data_file')
    collections = map(int, kwargs.get('collections') or list())
    separator = kwargs.get('separator')

    fields = list(
        Field.objects
        .filter(fieldvalue__record__collection__in=collections)
        .distinct()
    )

    with open(data_file, 'w') as csvfile:
        fieldnames = [field.full_name for field in fields]
        fieldnames.extend(['__file__', '__path__'])
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        records = Record.objects.filter(collection__in=collections)
        pb = ProgressBar(records.count())

        for count, record in enumerate(records):
            values = record.get_fieldvalues()
            media = list(record.media_set.select_related('storage').all())
            while values or media:
                row = dict()
                extra_values = list()
                for value in values:
                    fieldname = value.field.full_name
                    v = value.value.encode('utf8')
                    if fieldname in row:
                        if not separator:
                            extra_values.append(value)
                        else:
                            row[fieldname] += separator + v
                    else:
                        row[fieldname] = v
                if media:
                    m = media.pop()
                    row['__file__'] = m.url
                    row['__path__'] = m.storage.base
                writer.writerow(row)
                values = extra_values
            pb.update(count)

        pb.done()
def handle(self, *args, **kwargs):
    updated = 0
    pb = ProgressBar(Media.objects.count())
    for count, media in enumerate(Media.objects.all()):
        name = slugify(os.path.splitext(os.path.basename(media.url))[0])
        if name != media.name:
            media.name = name
            media.save(force_update_name=True)
            updated += 1
        pb.update(count)
    pb.done()
    print "Updated %d media objects" % updated
def index(self, verbose=False, all=False, collections=None):
    from models import SolrIndexUpdates
    self._build_group_tree()
    core_fields = dict(
        (f, f.get_equivalent_fields())
        for f in Field.objects.filter(standard__prefix='dc'))
    # add VRA Title to support work titles
    try:
        vra_title = Field.objects.get(name='title', standard__prefix='vra')
        core_fields[vra_title] = vra_title.get_equivalent_fields()
    except Field.DoesNotExist:
        pass

    count = 0
    batch_size = 100
    process_thread = None

    if all:
        query = Record.objects.all()
        if collections:
            query = query.filter(collection__in=collections)
        total_count = query.count()
        to_update = None
        to_delete = None
    else:
        processed_updates = []
        to_update = []
        to_delete = []
        updates = SolrIndexUpdates.objects.all()[:batch_size].values_list(
            'id', 'record', 'delete')
        for id, record, delete in updates:
            processed_updates.append(id)
            if delete:
                to_delete.append(record)
            else:
                to_update.append(record)
        total_count = len(to_update)

    if not all and not to_update and not to_delete:
        logger.info("Nothing to update in index, returning early")
        return 0

    conn = Solr(settings.SOLR_URL)
    if to_delete:
        conn.delete(q='id:(%s)' % ' '.join(map(str, to_delete)))

    primary_work_record_manager = PrimaryWorkRecordManager()

    if verbose:
        pb = ProgressBar(total_count)

    def get_method(method):
        module, _, function = method.rpartition('.')
        try:
            __import__(module)
            mod = sys.modules[module]
            return getattr(mod, function)
        except Exception, ex:
            logging.debug(
                "Could not import custom Solr record indexer %s: %s",
                method, ex)
def remove(self):
    common = self.check()
    print "Removing unneeded media objects"
    pb = ProgressBar(len(common))
    count = 0
    for id in common:
        m = Media.objects.filter(record__id=id)
        m.filter(url__startswith='medium\\').delete()
        m.filter(url__startswith='thumb\\').delete()
        count += 1
        pb.update(count)
    pb.done()
def handle(self, *prefix, **options):
    if not prefix:
        print self.help
    else:
        # positional arguments arrive as a tuple; assumed fix: use the first
        # argument as the prefix string, so len(prefix) below measures the
        # string rather than the tuple
        prefix = prefix[0]
        count = updated = 0
        total = Media.objects.count()
        pb = ProgressBar(total)
        for i in range(0, total, 100):
            for media in Media.objects.all()[i:i + 100]:
                if media.url.startswith(prefix):
                    media.url = media.url[len(prefix):]
                    media.save()
                    updated += 1
                count += 1
                pb.update(count)
            reset_queries()
        pb.done()
        print "Updated %d/%d media objects" % (updated, count)
def handle(self, *args, **kwargs):
    updated = 0
    id_fields = standardfield('identifier', equiv=True)
    titles = FieldValue.objects.select_related('record').filter(
        field__in=id_fields)
    pb = ProgressBar(titles.count())
    for count, title in enumerate(titles):
        name = slugify(title.value)
        if name != title.record.name:
            title.record.name = name
            title.record.save(force_update_name=True)
            updated += 1
        pb.update(count)
    pb.done()
    print "Updated %d record objects" % updated
def handle(self, *args, **kwargs):
    coll = kwargs.get('collection')
    if not coll:
        print "--collection is a required parameter"
        return
    if coll.isdigit():
        collection = Collection.objects.get(id=coll)
    else:
        collection = Collection.objects.get(name=coll)
    admins = User.objects.filter(is_superuser=True)
    if admins:
        admin = admins[0]
    else:
        admin = None
    pb = ProgressBar(collection.records.count())
    for count, record in enumerate(collection.records.all()):
        get_thumbnail_for_record(record, admin)
        get_thumbnail_for_record(record, admin, crop_to_square=True)
        pb.update(count)
    pb.done()
def refresh(self):
    count = 0
    total = Media.objects.count()
    pb = ProgressBar(total)
    for i in range(0, total, 1000):
        for media in Media.objects.all()[i:i + 1000]:
            media.identify()
            count += 1
            pb.update(count)
        reset_queries()
    pb.done()
def handle(self, *args, **kwargs):
    data_file = kwargs.get('data_file')
    collections = map(int, kwargs.get('collections') or list())
    separator = kwargs.get('separator')

    fields = list(
        Field.objects
        .filter(fieldvalue__record__collection__in=collections)
        .distinct()
    )

    with open(data_file, 'w') as csvfile:
        fieldnames = [field.full_name for field in fields]
        fieldnames.extend(['__file__', '__path__'])
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        records = Record.objects.filter(collection__in=collections)
        pb = ProgressBar(records.count())

        for count, record in enumerate(records):
            values = record.get_fieldvalues()
            media = list(record.media_set.select_related('storage').all())
            while values or media:
                row = dict()
                extra_values = list()
                for value in values:
                    fieldname = value.field.full_name
                    v = value.value
                    if fieldname in row:
                        if not separator:
                            extra_values.append(value)
                        else:
                            row[fieldname] += separator + v
                    else:
                        row[fieldname] = v
                if media:
                    m = media.pop()
                    row['__file__'] = m.url
                    row['__path__'] = m.storage.base
                writer.writerow(row)
                values = extra_values
            pb.update(count)

        pb.done()
def handle(self, *prefix, **options):
    if not prefix:
        print(self.help)
    else:
        # positional arguments arrive as a tuple; assumed fix: use the first
        # argument as the prefix string, so len(prefix) below measures the
        # string rather than the tuple
        prefix = prefix[0]
        count = updated = 0
        total = Media.objects.count()
        pb = ProgressBar(total)
        for i in range(0, total, 100):
            for media in Media.objects.all()[i:i + 100]:
                if media.url.startswith(prefix):
                    media.url = media.url[len(prefix):]
                    media.save()
                    updated += 1
                count += 1
                pb.update(count)
            reset_queries()
        pb.done()
        print("Updated %d/%d media objects" % (updated, count))
def handle(self, *args, **kwargs):
    updated = 0
    id_fields = standardfield('identifier', equiv=True)
    titles = FieldValue.objects.select_related('record').filter(
        field__in=id_fields)
    pb = ProgressBar(titles.count())
    for count, title in enumerate(titles):
        name = slugify(title.value)
        if name != title.record.name:
            title.record.name = name
            title.record.save(force_update_name=True)
            updated += 1
        pb.update(count)
    pb.done()
    print("Updated %d record objects" % updated)
def handle(self, from_collection, to_collections, commit, *args, **options):
    if not from_collection or not to_collections:
        print("Error: Must specify --from and --to arguments")
        return

    print("Mapping presentation items from collection %s to "
          "collection(s) %s" % (from_collection, to_collections))

    idfields = standardfield_ids('identifier', equiv=True)

    print("Fetching identifiers")
    query = FieldValue.objects.filter(
        field__in=idfields,
        record__collectionitem__collection=from_collection,
        owner=None,
        context_type=None,
        hidden=False).values_list('value', 'record')
    record_to_id = dict()
    for identifier, record in query:
        record_to_id.setdefault(record, []).append(identifier)

    print("Fetching target records")
    query = FieldValue.objects.filter(
        field__in=idfields,
        record__collectionitem__collection__in=to_collections,
        owner=None,
        context_type=None,
        hidden=False).values_list('value', 'record')
    id_to_record = dict()
    for identifier, record in query:
        id_to_record.setdefault(identifier, []).append(record)

    print("Mapping presentation items")
    remapped = 0
    errors = []
    items = PresentationItem.objects.filter(
        record__collectionitem__collection=from_collection)
    pb = ProgressBar(len(items))
    for count, item in enumerate(items):
        identifiers = record_to_id.get(item.record_id)
        if identifiers:
            for identifier in identifiers:
                new_records = id_to_record.get(identifier)
                if new_records:
                    if len(new_records) == 1:
                        remapped += 1
                        if commit:
                            item.record_id = new_records[0]
                            item.save()
                        break
                    else:
                        errors.append(
                            "Multiple matching records with identifier "
                            "'%s' found in collection %s: %s" %
                            (identifier, to_collections,
                             sorted(new_records)))
                else:
                    errors.append(
                        "No record with identifier '%s' found in "
                        "collection %s" % (identifier, to_collections))
        else:
            errors.append("No identifier found for record %s" %
                          item.record_id)
        pb.update(count)
    pb.done()

    errors = sorted(set(errors))

    if commit:
        print("Remapped %s items" % remapped)
    else:
        print("Would have remapped %s items - rerun with --commit" %
              remapped)

    if errors:
        print("%s unique errors occurred:" % len(errors))
        print('\n'.join(errors))
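# Hedged sketch (not from the original source): the handler above expects
# from_collection, to_collections, and commit options. In a current Django
# management command these could be declared roughly as below; the exact
# option names, destinations, and types are assumptions based only on how
# the handler uses them.
def add_arguments(self, parser):
    parser.add_argument('--from', dest='from_collection',
                        help='collection to map presentation items from')
    parser.add_argument('--to', dest='to_collections', nargs='+',
                        help='collection(s) to map presentation items to')
    parser.add_argument('--commit', action='store_true', default=False,
                        help='actually save the remapped items')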
def index(self, verbose=False, all=False):
    from models import SolrIndexUpdates
    self._build_group_tree()
    conn = Solr(settings.SOLR_URL)
    core_fields = dict((f, f.get_equivalent_fields())
                       for f in Field.objects.filter(standard__prefix='dc'))
    count = 0
    batch_size = 500
    process_thread = None

    if all:
        total_count = Record.objects.count()
    else:
        processed_updates = []
        to_update = []
        to_delete = []
        updates = SolrIndexUpdates.objects.all()[:batch_size].values_list(
            'id', 'record', 'delete')
        for id, record, delete in updates:
            processed_updates.append(id)
            if delete:
                to_delete.append(record)
            else:
                to_update.append(record)
        if to_delete:
            conn.delete(q='id:(%s)' % ' '.join(map(str, to_delete)))
        total_count = len(to_update)

    if verbose:
        pb = ProgressBar(total_count)

    while True:
        if verbose:
            pb.update(count)
        if all:
            record_ids = Record.objects.all()[
                count:count + batch_size].values_list('id', flat=True)
        else:
            record_ids = Record.objects.filter(id__in=to_update)[
                count:count + batch_size].values_list('id', flat=True)
        if not record_ids:
            break
        # convert to plain list, because Django's value lists will add a
        # LIMIT clause when used in an __in query, which causes MySQL to
        # break
        record_ids = list(record_ids)
        media_dict = self._preload_related(Media, record_ids)
        fieldvalue_dict = self._preload_related(FieldValue, record_ids,
                                                related=2)
        groups_dict = self._preload_related(CollectionItem, record_ids)
        count += len(record_ids)

        def process_data(groups, fieldvalues, media):
            def process():
                docs = []
                for record in Record.objects.filter(id__in=record_ids):
                    docs += [self._record_to_solr(
                        record, core_fields,
                        groups.get(record.id, []),
                        fieldvalues.get(record.id, []),
                        media.get(record.id, []))]
                conn.add(docs)
            return process

        if process_thread:
            process_thread.join()
        process_thread = Thread(target=process_data(
            groups_dict, fieldvalue_dict, media_dict))
        process_thread.start()
        reset_queries()

    if process_thread:
        process_thread.join()
    if verbose:
        pb.done()

    if all:
        SolrIndexUpdates.objects.filter(delete=False).delete()
    else:
        SolrIndexUpdates.objects.filter(id__in=processed_updates).delete()
                    type=self.type,
                    original_id=self.key(row),
                    content_hash=hash,
                )
                self.added += 1
            else:
                logging.error("No instance created: %s %s" %
                              (self.model_name, self.key(row)))
                self.errors += 1

        count += 1
        if not (count % 1000):
            reset_queries()
        if pb:
            pb.update(count)

    if pb:
        pb.done()
    reset_queries()

    if self.object_history and self.supports_deletion:
        print "Removing unused objects"
        pb = ProgressBar(len(self.object_history))
        count = 0
        for oid, o in self.object_history.iteritems():
            if self.preserve_memory:
                o = ObjectHistory.objects.get(
                    content_type=self.content_type,
                    m2m_content_type=self.m2m_content_type,
                    type=self.type,
                    original_id=oid)
            # these objects have been deleted since the last migration
            if not self.m2m_model:
                self.model.objects.filter(id=o.object_id).delete()
            else:
                self.m2m_delete(object_id=o.object_id,
                                m2m_object_id=o.m2m_object_id)
            logging.debug('%s %s not in source, deleting' %
                          (self.model_name, o.original_id))
            self.deleted += 1
            o.delete()
def index(self, verbose=False, all=False, collections=None):
    from .models import SolrIndexUpdates
    self._build_group_tree()
    core_fields = dict(
        (f, f.get_equivalent_fields())
        for f in Field.objects.filter(standard__prefix='dc')
    )
    # add VRA Title to support work titles
    try:
        vra_title = Field.objects.get(name='title', standard__prefix='vra')
        core_fields[vra_title] = vra_title.get_equivalent_fields()
    except Field.DoesNotExist:
        pass

    count = 0
    batch_size = 100
    process_thread = None

    if all:
        query = Record.objects.all()
        if collections:
            query = query.filter(collection__in=collections)
        total_count = query.count()
        to_update = None
        to_delete = None
    else:
        processed_updates = []
        to_update = []
        to_delete = []
        updates = SolrIndexUpdates.objects.all()[:batch_size].values_list(
            'id', 'record', 'delete')
        for id, record, delete in updates:
            processed_updates.append(id)
            if delete:
                to_delete.append(record)
            else:
                to_update.append(record)
        total_count = len(to_update)

    if not all and not to_update and not to_delete:
        logger.info("Nothing to update in index, returning early")
        return 0

    conn = Solr(settings.SOLR_URL)
    if to_delete:
        conn.delete(q='id:(%s)' % ' '.join(map(str, to_delete)))

    primary_work_record_manager = PrimaryWorkRecordManager()

    if verbose:
        pb = ProgressBar(total_count)

    def get_method(method):
        module, _, function = method.rpartition('.')
        try:
            __import__(module)
            mod = sys.modules[module]
            return getattr(mod, function)
        except Exception as ex:
            logging.debug(
                "Could not import custom Solr record indexer %s: %s",
                method, ex)

    def get_custom_doc_processor():
        method = getattr(settings, 'SOLR_RECORD_INDEXER', None)
        if method:
            method = get_method(method)
        return method or (lambda doc, **kwargs: doc)

    def get_custom_doc_pre_processor():
        method = getattr(settings, 'SOLR_RECORD_PRE_INDEXER', None)
        if method:
            method = get_method(method)
        return method or (lambda **kwargs: None)

    custom_doc_processor = get_custom_doc_processor()
    custom_doc_pre_processor = get_custom_doc_pre_processor()

    while True:
        if verbose:
            pb.update(count)
        if all:
            records = Record.objects.all()
            if collections:
                records = records.filter(collection__in=collections)
        else:
            records = Record.objects.filter(id__in=to_update)
        records = records[count:count + batch_size]
        record_ids = records.values_list('id', flat=True)
        if not record_ids:
            break

        # convert to plain list, because Django's value lists will add a
        # LIMIT clause when used in an __in query, which causes MySQL to
        # break. (ph): also, made an explicit separate value for this
        record_id_list = list(record_ids)

        media_dict = self._preload_related(Media, record_id_list)
        fieldvalue_dict = self._preload_related(FieldValue, record_id_list,
                                                fields=('field',))
        groups_dict = self._preload_related(CollectionItem, record_id_list)

        image_to_works = self._preload_image_to_works(record_id_list)
        work_to_images = self._preload_work_to_images(record_id_list)
        implicit_primary_work_records = primary_work_record_manager \
            .get_implicit_primary_work_records(record_id_list)

        count += len(record_id_list)

        # VERY IMPORTANT: SINCE process_data RUNS IN ANOTHER THREAD, IT
        # CANNOT DIRECTLY ACCESS ANY VARIABLES FROM THE OUTER SCOPE
        # ALWAYS PASS IN ANY NEEDED VARIABLES
        def process_data(groups, fieldvalues, media, record_id_list,
                         image_to_works, work_to_images,
                         implicit_primary_work_records):
            def process():
                docs = []
                for record in Record.objects.filter(id__in=record_id_list):
                    g = groups.get(record.id, [])
                    fv = fieldvalues.get(record.id, [])
                    m = media.get(record.id, [])
                    custom_doc_pre_processor(
                        record=record,
                        core_fields=core_fields,
                        groups=g,
                        fieldvalues=fv,
                        media=m,
                    )
                    doc = self._record_to_solr(
                        record, core_fields, g, fv, m, image_to_works,
                        work_to_images, implicit_primary_work_records)
                    doc = custom_doc_processor(
                        doc,
                        record=record,
                        core_fields=core_fields,
                        groups=g,
                        fieldvalues=fv,
                        media=m,
                    )
                    docs.append(doc)
                conn.add(docs)
            return process

        if process_thread:
            process_thread.join()
        process_thread = Thread(
            target=process_data(groups_dict, fieldvalue_dict, media_dict,
                                record_id_list, image_to_works,
                                work_to_images,
                                implicit_primary_work_records))
        process_thread.start()
        reset_queries()

    if process_thread:
        process_thread.join()
    if verbose:
        pb.done()

    if all:
        # TODO: this will remove objects that have been added
        # in the meantime
        SolrIndexUpdates.objects.filter(delete=False).delete()
    else:
        SolrIndexUpdates.objects.filter(id__in=processed_updates).delete()

    return count
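# Hedged sketch (not part of the original source): the indexer above resolves
# dotted paths from the SOLR_RECORD_INDEXER and SOLR_RECORD_PRE_INDEXER
# settings and calls them with the keyword arguments shown in process().
# A custom document processor could therefore look roughly like this; the
# module path and the extra field name are assumptions for illustration only.
#
# settings.py (assumed path):
# SOLR_RECORD_INDEXER = 'myapp.indexing.add_collection_count'

# myapp/indexing.py (hypothetical module)
def add_collection_count(doc, record=None, core_fields=None, groups=None,
                         fieldvalues=None, media=None, **kwargs):
    # receive the prepared Solr document, add one extra field, and return it
    doc['collection_count'] = len(groups or [])
    return doc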
def index(self, verbose=False, all=False):
    from models import SolrIndexUpdates
    self._build_group_tree()
    core_fields = dict(
        (f, f.get_equivalent_fields())
        for f in Field.objects.filter(standard__prefix='dc'))
    count = 0
    batch_size = 500
    process_thread = None

    if all:
        total_count = Record.objects.count()
        to_update = None
        to_delete = None
    else:
        processed_updates = []
        to_update = []
        to_delete = []
        updates = SolrIndexUpdates.objects.all()[:batch_size].values_list(
            'id', 'record', 'delete')
        for id, record, delete in updates:
            processed_updates.append(id)
            if delete:
                to_delete.append(record)
            else:
                to_update.append(record)
        total_count = len(to_update)

    if not all and not to_update and not to_delete:
        logger.info("Nothing to update in index, returning early")
        return 0

    conn = Solr(settings.SOLR_URL)
    if to_delete:
        conn.delete(q='id:(%s)' % ' '.join(map(str, to_delete)))

    if verbose:
        pb = ProgressBar(total_count)

    while True:
        if verbose:
            pb.update(count)
        if all:
            records = Record.objects.all()
        else:
            records = Record.objects.filter(id__in=to_update)
        records = records[count:count + batch_size]
        record_ids = records.values_list('id', flat=True)
        if not record_ids:
            break

        # convert to plain list, because Django's value lists will add a
        # LIMIT clause when used in an __in query, which causes MySQL to
        # break. (ph): also, made an explicit separate value for this
        record_id_list = list(record_ids)

        media_dict = self._preload_related(Media, record_id_list)
        fieldvalue_dict = self._preload_related(FieldValue, record_id_list,
                                                related=2)
        groups_dict = self._preload_related(CollectionItem, record_id_list)
        count += len(record_id_list)

        def process_data(groups, fieldvalues, media, record_id_list):
            def process():
                docs = []
                for record in Record.objects.filter(id__in=record_id_list):
                    docs.append(
                        self._record_to_solr(
                            record, core_fields,
                            groups.get(record.id, []),
                            fieldvalues.get(record.id, []),
                            media.get(record.id, [])))
                conn.add(docs)
            return process

        if process_thread:
            process_thread.join()
        process_thread = Thread(target=process_data(
            groups_dict, fieldvalue_dict, media_dict, record_id_list))
        process_thread.start()
        reset_queries()

    if process_thread:
        process_thread.join()
    if verbose:
        pb.done()

    if all:
        SolrIndexUpdates.objects.filter(delete=False).delete()
    else:
        SolrIndexUpdates.objects.filter(id__in=processed_updates).delete()

    return count
                    original_id=self.key(row),
                    content_hash=hash,
                )
                self.added += 1
            else:
                logging.error("No instance created: %s %s" %
                              (self.model_name, self.key(row)))
                self.errors += 1

        count += 1
        if not (count % 1000):
            reset_queries()
        if pb:
            pb.update(count)

    if pb:
        pb.done()
    reset_queries()

    if self.object_history and self.supports_deletion:
        print "Removing unused objects"
        pb = ProgressBar(len(self.object_history))
        count = 0
        to_delete = []  # Delete many objects at once for better performance
        for oid, o in self.object_history.iteritems():
            if self.preserve_memory:
                o = ObjectHistory.objects.get(
                    content_type=self.content_type,
                    m2m_content_type=self.m2m_content_type,
                    type=self.type,
                    original_id=oid)
            # these objects have been deleted since the last migration
            logging.debug('%s %s not in source, deleting' %
                          (self.model_name, o.original_id))
            if not self.m2m_model:
                to_delete.append(o)
def handle(self, *args, **kwargs):
    system_field = get_system_field()

    collections = map(int, kwargs.get('collections') or list())
    mapping_file = kwargs.get('mapping_file')

    if not collections:
        print "--collection is a required parameter"
        return

    if not mapping_file:
        print "--mapping is a required parameter"
        return

    mappings = dict()
    with open(mapping_file, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            mappings[row['Identifier']] = (row['Work'], row['Primary'])

    related_field = Field.objects.get(
        standard__prefix='dc',
        name='relation',
    )

    existing_works = FieldValue.objects.filter(
        record__collection__in=collections,
        field=related_field,
        refinement='IsPartOf',
    )

    # Clean out old relations
    print "Deleting old works info"
    existing_works.delete()

    id_fields = standardfield_ids('identifier', equiv=True)

    print "Fetching records"
    identifiers = FieldValue.objects.select_related('record').filter(
        record__collection__in=collections,
        field__in=id_fields,
    )

    pb = ProgressBar(identifiers.count())

    # Insert new relations
    for count, identifier in enumerate(identifiers):
        work, isprimary = mappings.get(identifier.value, (None, False))
        isprimary = isprimary == 'True'
        if not work:
            print "Warning: no entry found for identifier '%s'" % \
                identifier.value
            continue

        FieldValue.objects.create(record=identifier.record,
                                  field=related_field,
                                  refinement='IsPartOf',
                                  value=work,
                                  hidden=True)

        fv = list(
            FieldValue.objects.filter(record=identifier.record,
                                      field=system_field,
                                      label='primary-work-record'))

        if len(fv) > 0:
            if not isprimary:
                for f in fv:
                    f.delete()
        elif isprimary:
            FieldValue.objects.create(
                record=identifier.record,
                field=system_field,
                label='primary-work-record',
                value=work,
                hidden=True,
            )

        pb.update(count)

    pb.done()
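# Illustrative only (not from the original source): based on the column names
# read by the handler above, the --mapping CSV is expected to provide
# Identifier, Work, and Primary columns, with Primary set to the literal
# string 'True' for the primary work record. The values below are made up.
#
#   Identifier,Work,Primary
#   img-001,work-042,True
#   img-002,work-042,False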
def handle(self, *args, **kwargs):
    mapping_file = kwargs.get('mapping_file')
    collections = map(int, kwargs.get('collections') or list())

    if not mapping_file or not collections:
        print "--collection and --mapping are required parameters"
        return

    works = dict()
    with open(mapping_file, 'rU') as mappings:
        reader = csv.DictReader(mappings)
        for row in reader:
            identifier = row['ImageFileName']
            work = row['fk_WorkID']
            works.setdefault(work, []).append(identifier)

    # Clean out old relations
    FieldValue.objects.filter(
        record__collection__in=collections,
        field__standard__prefix='dc',
        field__name='relation',
        refinement='IsPartOf',
    ).delete()

    related_field = Field.objects.get(
        standard__prefix='dc',
        name='relation',
    )
    id_fields = standardfield_ids('identifier', equiv=True)

    print "Caching record identifiers"
    identifiers = dict()
    values = FieldValue.objects.select_related('record').filter(
        record__collection__in=collections, field__in=id_fields)
    for fv in values:
        identifiers[fv.value] = fv.record.id

    pb = ProgressBar(len(works))

    # Insert new relations
    for count, work in enumerate(works.itervalues()):
        primary = work[0]
        items = work[1:]
        for item in items:
            options = [item]
            if item.lower().endswith('.jpg'):
                options.append(item[:-4])
            record = None
            for option in options:
                record = identifiers.get(option)
                if record:
                    break
            else:
                continue
            FieldValue.objects.create(record=Record.objects.get(id=record),
                                      field=related_field,
                                      refinement='IsPartOf',
                                      value=primary)
        pb.update(count)

    pb.done()
def run(self, step=None, steps=None):

    def compare_hash(historic, current):
        if self.preserve_memory:
            return historic == int(current, 16)
        else:
            return historic.content_hash == current

    print "\n%sMigrating %s" % (
        'Step %s of %s: ' % (step, steps) if step and steps else '',
        self.model_name)

    r = re.match('^SELECT (.+) FROM (.+)$', self.query)
    pb = ProgressBar(
        list(self.cursor.execute(
            "SELECT COUNT(*) FROM %s" % r.groups()[1]))[0][0]) if r else None

    count = 0
    merged_ids = dict()

    for row in self.cursor.execute(self.query):
        hash = self.hash(row)
        h = self.object_history.pop(self.key(row), None)
        create = True
        if h:
            if compare_hash(h, hash) or self.m2m_model:
                # object unchanged, don't need to do anything
                # or, we're working on a many-to-many relation, don't need
                # to do anything on the instance
                logging.debug('%s %s unchanged in source, skipping' %
                              (self.model_name, self.key(row)))
                create = False
                self.unchanged += 1
            elif compare_hash(h, STATIC_CONTENT_HASH):
                # object may have changed, but we don't have the hash of the
                # previous version so we can't know. Just store the new hash
                # in the history to be able to track future changes
                if self.preserve_memory:
                    h = ObjectHistory.objects.get(
                        content_type=self.content_type,
                        m2m_content_type=self.m2m_content_type,
                        type=self.type,
                        original_id=self.key(row))
                h.content_hash = hash
                h.save()
                create = False
                self.nohistory += 1
            else:
                if self.preserve_memory:
                    h = ObjectHistory.objects.get(
                        content_type=self.content_type,
                        m2m_content_type=self.m2m_content_type,
                        type=self.type,
                        original_id=self.key(row))
                # object changed, need to update
                try:
                    instance = self.model.objects.get(id=h.object_id)
                    create = False
                except ObjectDoesNotExist:
                    instance = None
                if not instance:
                    # object has been deleted, need to recreate
                    logging.debug(
                        '%s %s changed and not in destination, recreating' %
                        (self.model_name, row.ID))
                    h.delete()
                    self.recreated += 1
                else:
                    # update existing object
                    logging.debug('%s %s changed, updating' %
                                  (self.model_name, self.key(row)))
                    self.update(instance, row)
                    try:
                        instance.save()
                        self.post_save(instance, row)
                        h.content_hash = hash
                        h.save()
                        self.updated += 1
                    except (IntegrityError, pyodbc.IntegrityError), ex:
                        logging.error("Integrity error: %s %s" %
                                      (self.model_name, self.key(row)))
                        logging.error(ex)
                        self.errors += 1
        if create:
            # object does not exist, need to create
            logging.debug('%s %s not in destination, creating' %
                          (self.model_name, self.key(row)))
            if not self.m2m_model:
                try:
                    instance = self.create(row)
                    if instance:
                        instance.save()
                        self.post_save(instance, row)
                        ObjectHistory.objects.create(
                            content_type=self.content_type,
                            object_id=instance.id,
                            type=self.type,
                            original_id=self.key(row),
                            content_hash=hash,
                        )
                        self.added += 1
                    else:
                        logging.error("No instance created: %s %s" %
                                      (self.model_name, self.key(row)))
                        self.errors += 1
                except (IntegrityError, pyodbc.IntegrityError,
                        ValueError), ex:
                    logging.error("%s: %s %s" % (type(ex).__name__,
                                                 self.model_name,
                                                 self.key(row)))
                    logging.error(ex)
                    self.errors += 1
                except MergeObjectsException, ex:
                    merged_ids[self.key(row)] = ex.instance
def handle(self, *args, **kwargs):
    system_field = get_system_field()

    collections = map(int, kwargs.get('collections') or list())
    mapping_file = kwargs.get('mapping_file')

    if not collections:
        print "--collection is a required parameter"
        return

    if not mapping_file:
        print "--mapping is a required parameter"
        return

    mappings = dict()
    with open(mapping_file, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            mappings[row['Identifier']] = (row['Work'], row['Primary'])

    related_field = Field.objects.get(
        standard__prefix='dc',
        name='relation',
    )

    existing_works = FieldValue.objects.filter(
        record__collection__in=collections,
        field=related_field,
        refinement='IsPartOf',
    )

    # Clean out old relations
    print "Deleting old works info"
    existing_works.delete()

    id_fields = standardfield_ids('identifier', equiv=True)

    print "Fetching records"
    identifiers = FieldValue.objects.select_related('record').filter(
        record__collection__in=collections,
        field__in=id_fields,
    )

    pb = ProgressBar(identifiers.count())

    # Insert new relations
    for count, identifier in enumerate(identifiers):
        work, isprimary = mappings.get(identifier.value, (None, False))
        isprimary = isprimary == 'True'
        if not work:
            print "Warning: no entry found for identifier '%s'" % \
                identifier.value
            continue

        FieldValue.objects.create(
            record=identifier.record,
            field=related_field,
            refinement='IsPartOf',
            value=work,
            hidden=True
        )

        fv = list(FieldValue.objects.filter(
            record=identifier.record,
            field=system_field,
            label='primary-work-record'
        ))

        if len(fv) > 0:
            if not isprimary:
                for f in fv:
                    f.delete()
        elif isprimary:
            FieldValue.objects.create(
                record=identifier.record,
                field=system_field,
                label='primary-work-record',
                value=work,
                hidden=True,
            )

        pb.update(count)

    pb.done()