def register_journals(ds, **kwargs):
    """Fetch every changed journal from the Kernel API and persist it.

    Pushes two XCom values for downstream tasks:

    * ``known_issues``: journal id -> list of issue items returned by the API.
    * ``journals_aop``: aop bundle id -> journal id (the *aop* identifier is
      the key, the journal identifier is the value).

    Returns the raw task list pulled from ``read_changes_task``.
    """
    mongo_connect()
    tasks = kwargs["ti"].xcom_pull(key="tasks", task_ids="read_changes_task")

    known_issues = {}
    journals_aop = {}

    for change in filter_changes(tasks, "journals", "get"):
        journal_id = get_id(change.get("id"))
        payload = fetch_journal(journal_id)

        JournalFactory(payload).save()

        known_issues[journal_id] = payload.get("items", [])
        aop_id = payload.get("aop")
        if aop_id:
            journals_aop[aop_id] = journal_id

    kwargs["ti"].xcom_push(key="known_issues", value=known_issues)
    kwargs["ti"].xcom_push(key="journals_aop", value=journals_aop)

    return tasks
def delete_issues(ds, **kwargs):
    """Mark issues flagged for deletion as non-public on the website.

    Reads the change list produced by ``read_changes_task`` and, for every
    ``bundles``/``delete`` entry, flips the matching ``Issue`` to
    ``is_public = False`` (soft delete) instead of removing the record.

    Returns the raw task list so downstream tasks can reuse it.
    """
    mongo_connect()
    tasks = kwargs["ti"].xcom_pull(key="tasks", task_ids="read_changes_task")
    issue_changes = filter_changes(tasks, "bundles", "delete")

    for change in issue_changes:
        issue_id = get_id(change.get("id"))
        try:
            issue = models.Issue.objects.get(_id=issue_id)
        except models.Issue.DoesNotExist:
            # Consistent with delete_documents: a record that never made it
            # into the website database is logged, not a task failure.
            logging.info(
                "Could not delete issue '%s' "
                "it does not exist in Website database",
                issue_id,
            )
        else:
            issue.is_public = False
            issue.save()

    return tasks
def register_last_issues(ds, **kwargs):
    """Recompute each journal's ``last_issue`` embedded document.

    For every journal, finds its most recent issue (ordered by year then
    order, descending) and copies the issue's fields into a
    ``models.LastIssue`` stored on the journal.
    """
    # Fields copied verbatim from the latest issue when present; replaces
    # nine identical copy-pasted hasattr/assign blocks.
    OPTIONAL_FIELDS = (
        "volume",
        "iid",
        "number",
        "start_month",
        "end_month",
        "label",
        "year",
        "type",
        "suppl_text",
    )

    mongo_connect()

    for journal in models.Journal.objects.all():
        try:
            logging.info("Id do journal: %s" % journal._id)
            last_j_issue = (
                models.Issue.objects.filter(journal=journal._id)
                .order_by("-year", "-order")
                .first()
                .select_related()
            )

            # "sections" always participates, defaulting to an empty list.
            l_issue_sec = []
            if hasattr(last_j_issue, "sections"):
                l_issue_sec = last_j_issue.sections
            last_issue = {"sections": l_issue_sec}

            for field in OPTIONAL_FIELDS:
                if hasattr(last_j_issue, field):
                    last_issue[field] = getattr(last_j_issue, field)

            journal.last_issue = models.LastIssue(**last_issue)
            journal.save()
        except AttributeError:
            # .first() returns None when the journal has no issues, so the
            # chained .select_related() raises AttributeError — treated as
            # "nothing to do" for this journal.
            logging.info("No issues are registered to models.Journal: %s " % journal)
def delete_documents(ds, **kwargs):
    """Mark deleted documents as non-public on the website.

    For every ``documents``/``delete`` change pulled from
    ``read_changes_task``, sets ``is_public = False`` on the matching
    ``Article``; a missing record is logged and skipped.

    Returns the raw task list so downstream tasks can reuse it.
    """
    mongo_connect()
    tasks = kwargs["ti"].xcom_pull(key="tasks", task_ids="read_changes_task")

    for change in filter_changes(tasks, "documents", "delete"):
        doc_id = get_id(change.get("id"))
        try:
            article = models.Article.objects.get(_id=doc_id)
        except models.Article.DoesNotExist:
            logging.info(
                "Could not delete document '%s' "
                "it does not exist in Website database",
                doc_id,
            )
        else:
            article.is_public = False
            article.save()

    return tasks
def register_documents_renditions(**kwargs):
    """Register processed document renditions (manifestations) in the OPAC
    database.

    Renditions that could not be saved are kept in the
    ``orphan_renditions`` Airflow Variable and retried on the next run.
    """
    mongo_connect()
    tasks = kwargs["ti"].xcom_pull(key="tasks", task_ids="read_changes_task")

    # Retry last run's failures first, then process the fresh changes.
    previously_orphaned = Variable.get(
        "orphan_renditions", default_var=[], deserialize_json=True
    )
    fresh_changes = (
        get_id(task["id"]) for task in filter_changes(tasks, "renditions", "get")
    )
    renditions_to_get = itertools.chain(previously_orphaned, fresh_changes)

    orphans = try_register_documents_renditions(
        renditions_to_get, fetch_documents_renditions, ArticleRenditionFactory
    )
    Variable.set("orphan_renditions", orphans, serialize_json=True)
def register_documents(**kwargs):
    """Register documents in the OPAC database from `Kernel` API data.

    Documents that could not be saved are stored as orphans in the Airflow
    Variables and retried on the next run.
    """
    mongo_connect()
    tasks = kwargs["ti"].xcom_pull(key="tasks", task_ids="read_changes_task")

    def _get_relation_data(document_id: str) -> Tuple[str, Dict]:
        """Look up the relationship between a DocumentsBundle and a Document.

        Returns a tuple holding the identifier of the issue the document is
        related to and the relationship item itself.

        >> _get_relation_data("67TH7T7CyPPmgtVrGXhWXVs")
        ('0034-8910-2019-v53', {'id': '67TH7T7CyPPmgtVrGXhWXVs', 'order': '01'})

        :param document_id: unique identifier of a document
        """
        for issue_id, relation_items in known_documents.items():
            for relation_item in relation_items:
                if relation_item["id"] == document_id:
                    return (issue_id, relation_item)
        return (None, {})

    def _get_known_documents(**kwargs) -> Dict[str, List[str]]:
        """Collect every document related to a `DocumentsBundle`.

        The DAG that detects changes in the Kernel API runs asynchronously
        with respect to the mirroring/synchronization DAG. A **previous**
        run may therefore have captured a `Document`'s **registration
        event** while the update of its `DocumentsBundle` had not happened
        yet (they occur in distinct transactions with distinct timestamps).
        Such a document is recorded as an **orphan** and its task is not
        processed on the next run; `register_issue_task` then also treats
        the bundle as an orphan, never learns its documents
        (known_documents), and the document stays orphaned forever. To
        break that cycle, the known-documents map is refreshed here from
        the list of `get` events for `bundles`.
        """
        known_documents = kwargs["ti"].xcom_pull(
            key="i_documents", task_ids="register_issues_task"
        )

        issues_recently_updated = [
            get_id(task["id"])
            for task in filter_changes(tasks, "bundles", "get")
            if known_documents.get(get_id(task["id"])) is None
        ]

        for issue_id in issues_recently_updated:
            known_documents.setdefault(issue_id, [])
            fetched_items = fetch_bundles(issue_id).get("items", [])
            known_documents[issue_id] = list(
                itertools.chain(known_documents[issue_id], fetched_items)
            )
        return known_documents

    known_documents = _get_known_documents(**kwargs)

    # TODO: when a document is updated the stored record must be refreshed
    # as well. Do we need a dedicated task for that?
    documents_to_get = itertools.chain(
        Variable.get("orphan_documents", default_var=[], deserialize_json=True),
        (get_id(task["id"]) for task in filter_changes(tasks, "documents", "get")),
    )

    orphans = try_register_documents(
        documents_to_get, _get_relation_data, fetch_documents_front, ArticleFactory
    )
    Variable.set("orphan_documents", orphans, serialize_json=True)
def IssueFactory(data, journal_id, issue_order=None, _type="regular"):
    """
    Create or update an issue record using the opac schema.

    May raise the `models.Journal.DoesNotExist` exception.

    To satisfy the mandatory year on "ahead" issues, issues of type
    ``ahead`` get the fixed year 9999.
    """
    mongo_connect()

    metadata = data["metadata"]
    bundle_id = data["id"]

    try:
        issue = models.Issue.objects.get(_id=bundle_id)
    except models.Issue.DoesNotExist:
        issue = models.Issue()
    else:
        # Updating an existing issue: fall back to its current journal
        # when no journal id was supplied.
        journal_id = journal_id or issue.journal._id

    if _type == "ahead" or bundle_id.endswith("-aop"):
        _type = "ahead"

    issue._id = issue.iid = bundle_id
    issue.spe_text = metadata.get("spe_text", "")

    month_range = metadata.get("publication_months", {"range": [0, 0]}).get(
        "range", [0]
    )
    issue.start_month = month_range[0]
    issue.end_month = month_range[-1]

    if _type == "ahead":
        issue.year = issue.year or "9999"
        issue.number = issue.number or "ahead"
    else:
        issue.year = metadata.get("publication_year", issue.year)
        issue.number = metadata.get("number", issue.number)

    issue.volume = metadata.get("volume", "")
    if issue_order:
        issue.order = issue_order
    issue.pid = metadata.get("pid", "")
    issue.journal = models.Journal.objects.get(_id=journal_id)

    def _get_issue_label(metadata: dict) -> str:
        """Produce the label expected by the OPAC according to the rules
        applied by OPAC Proc and Xylose.

        Args:
            metadata (dict): contents of a bundle

        Returns:
            str: label produced from a bundle
        """
        label_number = metadata.get("number", "")
        label_volume = metadata.get("volume", "")

        if label_number:
            supplement = metadata.get("supplement", "")
            if supplement:
                label_number += " suppl %s" % supplement
            label_number = re.sub("^0", "", label_number)
            label_number = re.sub("0$", "", label_number)
            label_number = label_number.strip()

        return "".join(["v" + label_volume, "n" + label_number])

    issue.label = _get_issue_label(metadata)

    if metadata.get("supplement"):
        issue.suppl_text = metadata.get("supplement")
        issue.type = "supplement"
    elif issue.volume and not issue.number:
        issue.type = "volume_issue"
    elif issue.number and "spe" in issue.number:
        issue.type = "special"
    elif _type == "ahead" and not data.get("items"):
        # An ahead bundle holding no articles must not appear in the
        # issue grid, so it is flagged as ``outdated_ahead``.
        issue.type = "outdated_ahead"
    else:
        issue.type = _type

    issue.created = data.get("created", "")
    issue.updated = data.get("updated", "")

    return issue
def IssueFactory(data, journal_id, issue_order):
    """
    Create an issue record using the opac schema.

    May raise the `models.Journal.DoesNotExist` exception.

    NOTE(review): this redefines the ``IssueFactory`` declared earlier in
    the file; only this version is visible at import time — confirm the
    duplication is intentional.

    :param data: bundle payload from the Kernel API (``id`` + ``metadata``)
    :param journal_id: identifier of the journal the issue belongs to
    :param issue_order: position of the issue within the journal
    """
    mongo_connect()

    metadata = data["metadata"]

    issue = models.Issue()
    issue._id = issue.iid = data.get("id")
    issue.type = metadata.get("type", "regular")
    issue.spe_text = metadata.get("spe_text", "")
    issue.start_month = metadata.get("publication_month", 0)
    issue.end_month = metadata.get("publication_season", [0])[-1]
    issue.year = metadata.get("publication_year")
    issue.volume = metadata.get("volume", "")
    issue.number = metadata.get("number", "")
    # A dead store of metadata["order"] used to precede this line and was
    # unconditionally overwritten; the explicit ``issue_order`` wins.
    issue.order = issue_order
    issue.pid = metadata.get("pid", "")
    issue.journal = models.Journal.objects.get(_id=journal_id)

    def _get_issue_label(metadata: dict) -> str:
        """Produce the label expected by the OPAC according to the rules
        applied by OPAC Proc and Xylose.

        Args:
            metadata (dict): contents of a bundle

        Returns:
            str: label produced from a bundle
        """
        START_REGEX = re.compile("^0")
        END_REGEX = re.compile("0$")

        label_number = metadata.get("number", "")
        label_volume = metadata.get("volume", "")
        label_supplement = (
            " suppl %s" % metadata.get("supplement", "")
            if metadata.get("supplement", "")
            else ""
        )

        if label_number:
            label_number += label_supplement
            # Strip a single leading/trailing zero, per OPAC Proc rules.
            label_number = START_REGEX.sub("", label_number)
            label_number = END_REGEX.sub("", label_number)
            label_number = label_number.strip()

        return "".join(["v" + label_volume, "n" + label_number])

    issue.label = _get_issue_label(metadata)

    if metadata.get("supplement"):
        issue.suppl_text = metadata.get("supplement")
        issue.type = "supplement"
    elif issue.volume and not issue.number:
        issue.type = "volume_issue"
    elif issue.number and "spe" in issue.number:
        issue.type = "special"

    return issue