Example #1
# Imports needed by the functions below (added here; the original listing omits them).
# Project-specific names used in the code (Consultation, Paper, File, models,
# DefaultFields, fallback_date, unique_field_dict, get_from_db, search_bulk_index,
# logger) are assumed to be defined elsewhere in this module.
import datetime
from typing import Any, Dict, Iterable, Type

import django.db.models
from django.conf import settings
from django.db import transaction
from django.db.models import F, OuterRef, Q, Subquery
from django.utils import timezone
from django_elasticsearch_dsl.registries import registry
from tqdm import tqdm

def fix_sort_date(import_date: datetime.datetime):
    """
    Tries to guess the correct sort date for all papers and files that were created no later
    than import_date by looking at
      a) the legal date,
      b) the date of the earliest consultation or
      c) falling back to fallback_date
    """
    logger.info("Fixing the sort date of the papers")
    # Use the date of the earliest consultation
    earliest_consultation = (Consultation.objects.filter(
        paper=OuterRef("pk"), meeting__isnull=False).order_by(
            "meeting__start").values("meeting__start")[:1])
    papers_with_consultation = (
        Paper.objects.filter(
            Q(sort_date=fallback_date)
            | ~Q(sort_date=F("legal_date"))).annotate(
                earliest_consultation=Subquery(earliest_consultation)).filter(
                    earliest_consultation__isnull=False)
        # We filter on these to only update those necessary in elasticsearch
        .filter(~Q(sort_date=F("earliest_consultation"))
                & ~Q(display_date=F("earliest_consultation"))))
    num = papers_with_consultation.update(
        sort_date=F("earliest_consultation"),
        display_date=F("earliest_consultation"))
    if settings.ELASTICSEARCH_ENABLED:
        search_bulk_index(Paper, papers_with_consultation)
    logger.info(f"{num} sort dates were fix by the earliest consultation")

    logger.info("Fixing the sort date of the files")
    num = File.objects.filter(created__lte=import_date,
                              legal_date__isnull=False).update(
                                  sort_date=F("legal_date"),
                                  modified=F("legal_date"))
    logger.info(f"{num} files were changed")

    earliest_paper = (Paper.objects.filter(
        files__pk=OuterRef("pk")).order_by("sort_date").values("sort_date")[:1]
                      )
    file_with_paper = (
        File.objects.filter(legal_date__isnull=True).annotate(
            earliest_paper=Subquery(earliest_paper)).filter(
                earliest_paper__isnull=False)
        # We filter on these to only update those necessary in elasticsearch
        .filter(~Q(sort_date=F("earliest_paper"))))
    num = file_with_paper.update(sort_date=F("earliest_paper"))
    if settings.ELASTICSEARCH_ENABLED:
        search_bulk_index(File, file_with_paper)
    logger.info(f"{num} files were updated")


def incremental_import(
    current_model: Type[django.db.models.Model],
    json_objects: Iterable[Dict[str, Any]],
    soft_delete: bool = True,
):
    """Compared the objects in the database with the json data for a given objects and
    creates, updates and (soft-)deletes the appropriate records."""

    # Materialize the iterable; it is consumed again below for the undelete check
    json_objects = list(json_objects)
    json_map = dict()
    for json_dict in json_objects:
        key = tuple(json_dict[j] for j in unique_field_dict[current_model])
        json_map[key] = json_dict

    # Remove manually deleted files
    if current_model == models.File:
        # noinspection PyUnresolvedReferences
        manually_deleted = current_model.objects_with_deleted.filter(
            manually_deleted=True).values_list("oparl_id", flat=True)
        for i in manually_deleted:
            if (i, ) in json_map:
                del json_map[(i, )]

    # Handle undeleted objects, e.g. papers that disappeared and reappeared
    if issubclass(current_model, DefaultFields):
        deleted = current_model.objects_with_deleted.filter(
            deleted=True, oparl_id__isnull=False).values_list("oparl_id",
                                                              flat=True)
        oparl_ids = [i.get("oparl_id") for i in json_objects]
        to_undelete = set(deleted) & set(oparl_ids)
        if to_undelete:
            logger.info(
                f"{current_model.__name__}: Undeleting {len(to_undelete)}")
            current_model.objects_with_deleted.filter(
                oparl_id__in=to_undelete).update(deleted=False)

    db_ids, db_map = get_from_db(current_model)

    common = set(json_map.keys()) & set(db_map.keys())
    to_be_created = set(json_map.keys()) - common
    to_be_deleted = set(db_map.keys()) - common
    to_be_updated = []
    for existing in common:
        if json_map[existing] != db_map[existing]:
            to_be_updated.append((json_map[existing], db_ids[existing]))

    # We need to delete first and then create to avoid conflicts (e.g. a unique
    # constraint violation when the start of a meeting with an oparl_id changed)
    deletion_ids = [db_ids[i1] for i1 in to_be_deleted]
    logger.info(f"{current_model.__name__}: "
                f"Deleting {len(to_be_deleted)}, "
                f"Creating {len(to_be_created)} and "
                f"Updating {len(to_be_updated)}")
    # Since we don't get the bulk-created object ids back from django (yet?),
    # we just do this by timestamp - indexing more than necessary isn't wrong anyway
    before_bulk_create = timezone.now()

    if soft_delete:
        deleted_rows = current_model.objects.filter(
            id__in=deletion_ids).update(deleted=True, modified=timezone.now())
    else:
        current_model.objects.filter(id__in=deletion_ids).delete()
        deleted_rows = 0
    # TODO: Delete files

    to_be_created = [current_model(**json_map[i1]) for i1 in to_be_created]
    current_model.objects.bulk_create(to_be_created, batch_size=100)

    # Bulk create doesn't update the search index, so we do this manually
    if (settings.ELASTICSEARCH_ENABLED
            and current_model in registry.get_models()):
        # Changed/Created
        qs = current_model.objects.filter(modified__gte=before_bulk_create)
        qs_count = qs.count()
        assert qs_count >= len(
            to_be_created
        ), f"Only {qs_count} {current_model.__name__} were found for indexing, while at least {len(to_be_created)} were expected"
        logger.info(
            f"Indexing {qs_count} {current_model.__name__} new objects")
        search_bulk_index(current_model, qs)
        # Deleted
        qs = current_model.objects_with_deleted.filter(
            deleted=True, modified__gte=before_bulk_create)
        qs_count = qs.count()
        assert (
            qs_count >= deleted_rows
        ), f"Only {qs_count} {current_model.__name__} were found for deletion, while at least {deleted_rows} were expected"
        logger.info(
            f"Deleting {qs_count} {current_model.__name__} from elasticsearch")
        search_bulk_index(current_model, qs, action="delete")

    with transaction.atomic():
        for json_object, pk in tqdm(
                to_be_updated,
                disable=not to_be_updated,
                desc=f"Update or create for {current_model.__name__}",
        ):
            current_model.objects_with_deleted.update_or_create(
                pk=pk, defaults=json_object)
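

# Illustrative, framework-free sketch (not part of the original module) of the
# diff step used by incremental_import above: rows are keyed by their unique
# fields and partitioned into create/update/delete sets before the database is
# touched.
def _diff_by_unique_key(json_map: dict, db_map: dict):
    common = set(json_map) & set(db_map)
    to_create = set(json_map) - common
    to_delete = set(db_map) - common
    # Only rows whose content actually changed need an update
    to_update = [key for key in common if json_map[key] != db_map[key]]
    return to_create, to_update, to_delete

# Example with hypothetical data:
#   _diff_by_unique_key(
#       {("paper-1",): {"name": "New title"}, ("paper-3",): {"name": "Added"}},
#       {("paper-1",): {"name": "Old title"}, ("paper-2",): {"name": "Removed"}},
#   )
#   -> ({("paper-3",)}, [("paper-1",)], {("paper-2",)})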