Beispiel #1
0
def register_journals(ds, **kwargs):
    """Register the journals flagged with a ``get`` change.

    Pulls the ``tasks`` XCom published by ``read_changes_task``, selects the
    entries returned by ``filter_changes(tasks, "journals", "get")`` and, for
    each one, fetches the journal, builds it with ``JournalFactory`` and
    saves it.

    Two XComs are pushed for downstream tasks:

    * ``known_issues``: ``{journal_id: [item, item, ...]}``, the ``items``
      list of each fetched journal (an empty list when the key is absent).
    * ``journals_aop``: ``{aop_id: journal_id}``, filled only for journals
      whose payload carries a truthy ``aop`` value.

    :param ds: not used by this function.
    :param kwargs: must contain ``ti``, the object used to pull/push XComs.
    :return: the ``tasks`` value pulled from ``read_changes_task``.
    """
    mongo_connect()
    task_instance = kwargs["ti"]
    tasks = task_instance.xcom_pull(key="tasks", task_ids="read_changes_task")

    # journal id -> "items" of that journal (its issues, per the data source)
    known_issues = {}
    # aop id -> id of the journal that owns it
    journals_aop = {}

    for change in filter_changes(tasks, "journals", "get"):
        journal_id = get_id(change.get("id"))
        payload = fetch_journal(journal_id)

        JournalFactory(payload).save()

        known_issues[journal_id] = payload.get("items", [])

        aop_id = payload.get("aop")
        if aop_id:
            journals_aop[aop_id] = journal_id

    task_instance.xcom_push(key="known_issues", value=known_issues)
    task_instance.xcom_push(key="journals_aop", value=journals_aop)

    return tasks
Beispiel #2
0
def delete_issues(ds, **kwargs):
    """Unpublish the issues flagged with a ``delete`` change.

    Pulls the ``tasks`` XCom published by ``read_changes_task`` and, for each
    entry returned by ``filter_changes(tasks, "bundles", "delete")``, loads
    the matching ``models.Issue``, sets ``is_public`` to ``False`` and saves
    it. The record itself is not removed.

    An issue that is not found is logged and skipped, mirroring what
    ``delete_documents`` does for articles, so that a single unknown id does
    not abort the processing of the remaining changes.

    :param ds: not used by this function.
    :param kwargs: must contain ``ti``, the object used to pull XComs.
    :return: the ``tasks`` value pulled from ``read_changes_task``.
    """
    mongo_connect()
    tasks = kwargs["ti"].xcom_pull(key="tasks", task_ids="read_changes_task")

    for change in filter_changes(tasks, "bundles", "delete"):
        issue_id = get_id(change.get("id"))

        try:
            issue = models.Issue.objects.get(_id=issue_id)
        except models.Issue.DoesNotExist:
            logging.info(
                "Could not delete issue '%s' "
                "it does not exist in Website database",
                issue_id,
            )
            continue

        issue.is_public = False
        issue.save()

    return tasks
Beispiel #3
0
def register_last_issues(ds, **kwargs):
    """Store on every journal a summary of its most recent issue.

    For each ``models.Journal`` the issues are ordered by ``-year`` and
    ``-order`` and the first one is taken. The attributes it exposes are
    copied into a ``models.LastIssue`` which is assigned to
    ``journal.last_issue`` before the journal is saved.

    Any ``AttributeError`` raised while handling a journal is logged as
    "no issues registered" and the loop moves on to the next journal.

    :param ds: not used by this function.
    :param kwargs: not used by this function.
    """
    mongo_connect()

    # Copied as-is from the issue whenever the attribute exists.
    # ``sections`` is handled apart because it falls back to an empty list.
    copied_fields = (
        "volume",
        "iid",
        "number",
        "start_month",
        "end_month",
        "label",
        "year",
        "type",
        "suppl_text",
    )

    for journal in models.Journal.objects.all():
        try:
            logging.info("Id do journal: %s" % journal._id)

            newest = (
                models.Issue.objects.filter(journal=journal._id)
                .order_by("-year", "-order")
                .first()
            )
            # Presumably ``first()`` gives None when the journal has no
            # issues; the attribute access below then raises the
            # AttributeError handled at the end of this block.
            newest = newest.select_related()

            summary = {
                "sections": newest.sections if hasattr(newest, "sections") else []
            }
            summary.update(
                (field, getattr(newest, field))
                for field in copied_fields
                if hasattr(newest, field)
            )

            journal.last_issue = models.LastIssue(**summary)
            journal.save()
        except AttributeError:
            logging.info("No issues are registered to models.Journal: %s " % journal)
Beispiel #4
0
def delete_documents(ds, **kwargs):
    """Unpublish the documents flagged with a ``delete`` change.

    Pulls the ``tasks`` XCom published by ``read_changes_task`` and, for each
    entry returned by ``filter_changes(tasks, "documents", "delete")``, loads
    the matching ``models.Article``, sets ``is_public`` to ``False`` and
    saves it. Articles that cannot be found are logged and skipped.

    :param ds: not used by this function.
    :param kwargs: must contain ``ti``, the object used to pull XComs.
    :return: the ``tasks`` value pulled from ``read_changes_task``.
    """
    mongo_connect()
    tasks = kwargs["ti"].xcom_pull(key="tasks", task_ids="read_changes_task")

    for change in filter_changes(tasks, "documents", "delete"):
        document_id = get_id(change.get("id"))

        try:
            article = models.Article.objects.get(_id=document_id)
            article.is_public = False
            article.save()
        except models.Article.DoesNotExist:
            logging.info(
                "Could not delete document '%s' "
                "it does not exist in Website database",
                document_id,
            )

    return tasks
Beispiel #5
0
def register_documents_renditions(**kwargs):
    """Register the renditions of processed documents in the OPAC database.

    The ids to process are the ones stored in the ``orphan_renditions``
    Airflow variable followed by the ids of the ``renditions`` / ``get``
    changes found in the ``tasks`` XCom of ``read_changes_task``. Whatever
    ``try_register_documents_renditions`` returns is written back to the
    ``orphan_renditions`` variable.

    :param kwargs: must contain ``ti``, the object used to pull XComs.
    """

    mongo_connect()

    tasks = kwargs["ti"].xcom_pull(key="tasks", task_ids="read_changes_task")

    previous_orphans = Variable.get(
        "orphan_renditions", default_var=[], deserialize_json=True
    )
    # Kept lazy: ids are resolved only as the consumer iterates over them.
    changed_ids = (
        get_id(change["id"])
        for change in filter_changes(tasks, "renditions", "get")
    )

    remaining_orphans = try_register_documents_renditions(
        itertools.chain(previous_orphans, changed_ids),
        fetch_documents_renditions,
        ArticleRenditionFactory,
    )

    Variable.set("orphan_renditions", remaining_orphans, serialize_json=True)
Beispiel #6
0
def register_documents(**kwargs):
    """Register documents in the OPAC database from data provided by the
    `Kernel` API. Documents that could not be saved are stored as orphans in
    the ``orphan_documents`` Airflow variable.

    The ids to process are the ones already stored in ``orphan_documents``
    followed by the ids of the ``documents`` / ``get`` changes found in the
    ``tasks`` XCom of ``read_changes_task``.

    :param kwargs: must contain ``ti``, the object used to pull XComs.
    """

    mongo_connect()

    tasks = kwargs["ti"].xcom_pull(key="tasks", task_ids="read_changes_task")

    def _get_relation_data(document_id: str) -> Tuple[str, Dict]:
        """Retrieve the relationship between a DocumentsBundle and a
        Document.

        Returns a tuple with the identifier of the issue the document is
        related to and the relationship item. When the document is not found
        in any known issue, ``(None, {})`` is returned.

        >> _get_relation_data("67TH7T7CyPPmgtVrGXhWXVs")
        ('0034-8910-2019-v53', {'id': '67TH7T7CyPPmgtVrGXhWXVs', 'order': '01'})

        :param document_id: unique identifier of a document
        """

        for issue_id, items in known_documents.items():
            for item in items:
                if document_id == item["id"]:
                    return (issue_id, item)

        return (None, {})

    def _get_known_documents(**kwargs) -> Dict[str, List[Dict]]:
        """Retrieve every document that is related to a `DocumentsBundle`.

        The DAG that detects changes in the Kernel API runs asynchronously
        with respect to the mirroring/synchronisation DAG.

        Because of that, a **previous** run may have captured the
        **registration event** of a `Document` while the update of its
        `DocumentsBundle` had not happened yet (they occur in distinct
        transactions with distinct timestamps). The document is then
        registered as an **orphan** and its `task` is not processed in the
        next execution.

        In the next execution `register_issue_task` sees the `bundle` as an
        orphan, does not know its documents (known_documents) and the
        document therefore stays orphan.

        The workaround is to refresh the known documents from the `get`
        events of `bundles`.
        """

        # xcom_pull may yield None when nothing was pushed under this key;
        # fall back to an empty mapping instead of failing on ``.get``.
        known_documents = (
            kwargs["ti"].xcom_pull(
                key="i_documents", task_ids="register_issues_task")
            or {}
        )

        for task in filter_changes(tasks, "bundles", "get"):
            issue_id = get_id(task["id"])
            # Checked on every iteration (not precomputed) so a bundle that
            # shows up more than once is fetched a single time and its items
            # are not duplicated. A key stored with a None value is treated
            # the same as a missing key.
            if known_documents.get(issue_id) is None:
                known_documents[issue_id] = list(
                    fetch_bundles(issue_id).get("items", []))

        return known_documents

    known_documents = _get_known_documents(**kwargs)

    # TODO: when a document is updated its record must be refreshed.
    # Do we need a new task for that?

    documents_to_get = itertools.chain(
        Variable.get("orphan_documents", default_var=[],
                     deserialize_json=True),
        (get_id(task["id"])
         for task in filter_changes(tasks, "documents", "get")),
    )

    orphans = try_register_documents(documents_to_get, _get_relation_data,
                                     fetch_documents_front, ArticleFactory)

    Variable.set("orphan_documents", orphans, serialize_json=True)
Beispiel #7
0
def IssueFactory(data, journal_id, issue_order=None, _type="regular"):
    """
    Build (or refresh) an issue following the opac schema.

    An existing ``models.Issue`` with the same ``_id`` is reused; otherwise a
    new one is created. The issue is returned without being saved.

    This function may raise `models.Journal.DoesNotExist`.

    To satisfy the mandatory year of "ahead" issues, the year of issues of
    type ``ahead`` is fixed to 9999 when the issue has no year yet.

    :param data: bundle payload; ``id`` and ``metadata`` are required keys.
    :param journal_id: id of the owning journal; when falsy and the issue
        already exists, the journal of the stored issue is used.
    :param issue_order: value for ``issue.order``; ignored when falsy.
    :param _type: issue type used when no more specific rule applies.
    :return: the populated ``models.Issue`` instance.
    """
    mongo_connect()

    metadata = data["metadata"]
    issue_id = data["id"]

    try:
        issue = models.Issue.objects.get(_id=issue_id)
    except models.Issue.DoesNotExist:
        issue = models.Issue()
    else:
        journal_id = journal_id or issue.journal._id
        # Only for issues that already exist: an id ending in "-aop" forces
        # the "ahead" type.
        if _type != "ahead" and issue_id.endswith("-aop"):
            _type = "ahead"

    issue._id = issue_id
    issue.iid = issue_id
    issue.spe_text = metadata.get("spe_text", "")

    # First and last entries of the month range; 0 when nothing is informed.
    month_range = metadata.get("publication_months", {
        "range": [0, 0]
    }).get("range", [0])
    issue.start_month = month_range[0]
    issue.end_month = month_range[-1]

    if _type == "ahead":
        issue.year = issue.year or "9999"
        issue.number = issue.number or "ahead"
    else:
        issue.year = metadata.get("publication_year", issue.year)
        issue.number = metadata.get("number", issue.number)

    issue.volume = metadata.get("volume", "")

    if issue_order:
        issue.order = issue_order

    issue.pid = metadata.get("pid", "")
    issue.journal = models.Journal.objects.get(_id=journal_id)

    def _get_issue_label(metadata: dict) -> str:
        """Build the label expected by OPAC, following the rules applied by
        OPAC Proc and Xylose.

        Args:
            metadata (dict): content of a bundle

        Returns:
            str: label built from the bundle
        """
        number = metadata.get("number", "")
        volume = metadata.get("volume", "")
        supplement = metadata.get("supplement", "")
        suffix = " suppl %s" % supplement if supplement else ""

        if number:
            # One leading "0" and one trailing "0" are dropped from the
            # number-plus-supplement text.
            # NOTE(review): this also strips the last digit of plain numbers
            # such as "10" - confirm that is intended.
            number = re.sub("^0", "", number + suffix)
            number = re.sub("0$", "", number).strip()

        return "v" + volume + "n" + number

    issue.label = _get_issue_label(metadata)

    supplement = metadata.get("supplement")
    if supplement:
        issue.suppl_text = supplement
        issue.type = "supplement"
    elif issue.volume and not issue.number:
        issue.type = "volume_issue"
    elif issue.number and "spe" in issue.number:
        issue.type = "special"
    elif _type == "ahead" and not data.get("items"):
        # An ahead bundle without any article is marked ``outdated_ahead`` so
        # that it does not show up in the issues grid.
        issue.type = "outdated_ahead"
    else:
        issue.type = _type

    issue.created = data.get("created", "")
    issue.updated = data.get("updated", "")

    return issue
Beispiel #8
0
def IssueFactory(data, journal_id, issue_order):
    """
    Build a new issue following the opac schema.

    The issue is returned without being saved.

    This function may raise `models.Journal.DoesNotExist`.

    :param data: bundle payload; ``metadata`` is a required key.
    :param journal_id: id of the journal the issue belongs to.
    :param issue_order: value stored in ``issue.order``.
    :return: the populated ``models.Issue`` instance.
    """
    mongo_connect()

    metadata = data["metadata"]

    issue = models.Issue()
    issue._id = issue.iid = data.get("id")
    issue.type = metadata.get("type", "regular")
    issue.spe_text = metadata.get("spe_text", "")
    issue.start_month = metadata.get("publication_month", 0)
    # Last entry of the season; an absent, None or empty season yields 0
    # instead of raising.
    season = metadata.get("publication_season") or [0]
    issue.end_month = season[-1]
    issue.year = metadata.get("publication_year")
    issue.volume = metadata.get("volume", "")
    issue.number = metadata.get("number", "")
    issue.pid = metadata.get("pid", "")
    issue.journal = models.Journal.objects.get(_id=journal_id)
    # The order always comes from the caller; any ``order`` in the metadata
    # is not used.
    issue.order = issue_order

    def _get_issue_label(metadata: dict) -> str:
        """Build the label expected by OPAC, following the rules applied by
        OPAC Proc and Xylose.

        Args:
            metadata (dict): content of a bundle

        Returns:
            str: label built from the bundle
        """

        START_REGEX = re.compile("^0")
        END_REGEX = re.compile("0$")

        label_number = metadata.get("number", "")
        label_volume = metadata.get("volume", "")
        label_supplement = (
            " suppl %s" % metadata.get("supplement", "")
            if metadata.get("supplement", "")
            else ""
        )

        if label_number:
            # One leading "0" and one trailing "0" are dropped from the
            # number-plus-supplement text.
            # NOTE(review): this also strips the last digit of plain numbers
            # such as "10" - confirm that is intended.
            label_number += label_supplement
            label_number = START_REGEX.sub("", label_number)
            label_number = END_REGEX.sub("", label_number)
            label_number = label_number.strip()

        return "".join(["v" + label_volume, "n" + label_number])

    issue.label = _get_issue_label(metadata)

    # More specific types override the one informed in the metadata.
    if metadata.get("supplement"):
        issue.suppl_text = metadata.get("supplement")
        issue.type = "supplement"
    elif issue.volume and not issue.number:
        issue.type = "volume_issue"
    elif issue.number and "spe" in issue.number:
        issue.type = "special"

    return issue