def fix_sort_date(import_date: datetime.datetime):
    """
    Tries to guess the correct sort date for all papers and files that were created
    no later than import_date by looking at a) the legal date, b) the date of the
    earliest consultation or c) falling back to fallback_date
    """
    logger.info("Fixing the sort date of the papers")

    # Use the date of the earliest consultation
    earliest_consultation = (
        Consultation.objects.filter(paper=OuterRef("pk"), meeting__isnull=False)
        .order_by("meeting__start")
        .values("meeting__start")[:1]
    )
    papers_with_consultation = (
        Paper.objects.filter(
            Q(sort_date=fallback_date) | ~Q(sort_date=F("legal_date"))
        )
        .annotate(earliest_consultation=Subquery(earliest_consultation))
        .filter(earliest_consultation__isnull=False)
        # We filter on these to only update those necessary in elasticsearch
        .filter(
            ~Q(sort_date=F("earliest_consultation"))
            & ~Q(display_date=F("earliest_consultation"))
        )
    )
    num = papers_with_consultation.update(
        sort_date=F("earliest_consultation"),
        display_date=F("earliest_consultation"),
    )
    if settings.ELASTICSEARCH_ENABLED:
        search_bulk_index(Paper, papers_with_consultation)
    logger.info(f"{num} sort dates were fixed by the earliest consultation")

    logger.info("Fixing the sort date of the files")
    num = File.objects.filter(
        created__lte=import_date, legal_date__isnull=False
    ).update(sort_date=F("legal_date"), modified=F("legal_date"))
    logger.info(f"{num} files were changed")

    # Files without a legal date inherit the sort date of their earliest paper
    earliest_paper = (
        Paper.objects.filter(files__pk=OuterRef("pk"))
        .order_by("sort_date")
        .values("sort_date")[:1]
    )
    file_with_paper = (
        File.objects.filter(legal_date__isnull=True)
        .annotate(earliest_paper=Subquery(earliest_paper))
        .filter(earliest_paper__isnull=False)
        # We filter on these to only update those necessary in elasticsearch
        .filter(~Q(sort_date=F("earliest_paper")))
    )
    num = file_with_paper.update(sort_date=F("earliest_paper"))
    if settings.ELASTICSEARCH_ENABLED:
        search_bulk_index(File, file_with_paper)
    logger.info(f"{num} files updated")
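# A minimal sketch, not part of the importer, of the Subquery/OuterRef + F()
# pattern that fix_sort_date relies on: annotate each Paper with the start of
# its earliest scheduled meeting and copy that value into sort_date as one
# bulk UPDATE instead of a per-row loop. It assumes the Paper and Consultation
# models already imported by this module; the helper name is illustrative.
def _example_sort_date_from_earliest_consultation() -> int:
    earliest = (
        Consultation.objects.filter(paper=OuterRef("pk"), meeting__isnull=False)
        .order_by("meeting__start")
        .values("meeting__start")[:1]
    )
    # Papers without any consultation are excluded by the isnull filter and
    # keep their current sort_date; the rest are updated in a single statement.
    return (
        Paper.objects.annotate(earliest=Subquery(earliest))
        .filter(earliest__isnull=False)
        .update(sort_date=F("earliest"))
    )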
def incremental_import(
    current_model: Type[django.db.models.Model],
    json_objects: Iterable[Dict[str, Any]],
    soft_delete: bool = True,
):
    """Compares the objects in the database with the json data for a given model
    and creates, updates and (soft-)deletes the appropriate records."""
    json_map = dict()
    for json_dict in json_objects:
        key = tuple(json_dict[j] for j in unique_field_dict[current_model])
        json_map[key] = json_dict

    # Remove manually deleted files
    if current_model == models.File:
        # noinspection PyUnresolvedReferences
        manually_deleted = current_model.objects_with_deleted.filter(
            manually_deleted=True
        ).values_list("oparl_id", flat=True)
        for i in manually_deleted:
            if (i,) in json_map:
                del json_map[(i,)]

    # Handle undeleted objects, e.g. papers that disappeared and reappeared
    if issubclass(current_model, DefaultFields):
        deleted = current_model.objects_with_deleted.filter(
            deleted=True, oparl_id__isnull=False
        ).values_list("oparl_id", flat=True)
        oparl_ids = [i.get("oparl_id") for i in json_objects]
        to_undelete = set(deleted) & set(oparl_ids)
        if to_undelete:
            logger.info(f"{current_model.__name__}: Undeleting {len(to_undelete)}")
            current_model.objects_with_deleted.filter(
                oparl_id__in=to_undelete
            ).update(deleted=False)

    db_ids, db_map = get_from_db(current_model)

    common = set(json_map.keys()) & set(db_map.keys())
    to_be_created = set(json_map.keys()) - common
    to_be_deleted = set(db_map.keys()) - common
    to_be_updated = []
    for existing in common:
        if json_map[existing] != db_map[existing]:
            to_be_updated.append((json_map[existing], db_ids[existing]))

    # We need to delete first and then create to avoid conflicts,
    # e.g. when the start of a meeting with an oparl_id changed
    deletion_ids = [db_ids[i1] for i1 in to_be_deleted]
    logger.info(
        f"{current_model.__name__}: "
        f"Deleting {len(to_be_deleted)}, "
        f"Creating {len(to_be_created)} and "
        f"Updating {len(to_be_updated)}"
    )

    # Since we don't get the bulk created object ids back from django (yet?),
    # we just do this by timestamp - indexing more than necessary isn't wrong anyway
    before_bulk_create = timezone.now()

    if soft_delete:
        deleted_rows = current_model.objects.filter(id__in=deletion_ids).update(
            deleted=True, modified=timezone.now()
        )
    else:
        current_model.objects.filter(id__in=deletion_ids).delete()
        deleted_rows = 0
        # TODO: Delete files

    to_be_created = [current_model(**json_map[i1]) for i1 in to_be_created]
    current_model.objects.bulk_create(to_be_created, batch_size=100)

    # Bulk create doesn't update the search index, so we do this manually
    if settings.ELASTICSEARCH_ENABLED and current_model in registry.get_models():
        # Changed/Created
        qs = current_model.objects.filter(modified__gte=before_bulk_create)
        qs_count = qs.count()
        assert qs_count >= len(to_be_created), (
            f"Only {qs_count} {current_model.__name__} were found for indexing, "
            f"while at least {len(to_be_created)} were expected"
        )
        logger.info(f"Indexing {qs_count} new {current_model.__name__} objects")
        search_bulk_index(current_model, qs)

        # Deleted
        qs = current_model.objects_with_deleted.filter(
            deleted=True, modified__gte=before_bulk_create
        )
        qs_count = qs.count()
        assert qs_count >= deleted_rows, (
            f"Only {qs_count} {current_model.__name__} were found for deletion, "
            f"while at least {deleted_rows} were expected"
        )
        logger.info(
            f"Deleting {qs_count} {current_model.__name__} from elasticsearch"
        )
        search_bulk_index(current_model, qs, action="delete")

    with transaction.atomic():
        for json_object, pk in tqdm(
            to_be_updated,
            disable=not to_be_updated,
            desc=f"Update or create for {current_model.__name__}",
        ):
            current_model.objects_with_deleted.update_or_create(
                pk=pk, defaults=json_object
            )
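# Framework-free sketch of the diff step at the heart of incremental_import:
# keys that only appear in the incoming JSON are created, keys that only exist
# in the database are (soft-)deleted, and keys present on both sides are
# updated only when their payload actually changed. The keys stand in for the
# unique_field_dict tuples used above (typically just the oparl_id). The helper
# below is illustrative only and is not called by the importer.
def _example_diff_records(
    db_map: Dict[Any, Dict[str, Any]], json_map: Dict[Any, Dict[str, Any]]
):
    common = set(json_map) & set(db_map)
    to_be_created = set(json_map) - common
    to_be_deleted = set(db_map) - common
    # Unchanged records are skipped so only real changes hit the database.
    to_be_updated = [key for key in common if json_map[key] != db_map[key]]
    return to_be_created, to_be_deleted, to_be_updated


# Example: ("a",) exists only in the database (delete), ("c",) only in the
# JSON (create), and ("b",) is identical on both sides (nothing to update).
assert _example_diff_records(
    {("a",): {"name": "old"}, ("b",): {"name": "keep"}},
    {("b",): {"name": "keep"}, ("c",): {"name": "new"}},
) == ({("c",)}, {("a",)}, [])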