def transform(self):
    """Populate ``self.transform_model_instance`` from the cleaned xylose Issue.

    Copies the extract model uuid into both ``uuid`` and ``iid``, stamps
    ``created``/``updated`` with the current time and resolves the
    TransformJournal by the journal acronym.

    Raises:
        The original lookup exception when the TransformJournal is missing;
        the issue code is logged so it can be processed later.
    """
    xylose_source = self.clean_for_xylose()
    xylose_issue = Issue(xylose_source)

    # jid: reuse the extract model uuid for both identifiers
    uuid = self.extract_model_instance.uuid
    self.transform_model_instance['uuid'] = uuid
    self.transform_model_instance['iid'] = uuid

    # created / updated timestamps
    self.transform_model_instance['created'] = datetime.now()
    self.transform_model_instance['updated'] = datetime.now()

    # unpublish_reason -> left empty

    # journal
    acronym = xylose_issue.journal.acronym
    try:
        journal = TransformJournal.objects.get(acronym=acronym)
    except Exception as e:  # bugfix: Python 2 "except X, e" syntax modernized
        # bugfix: the %s placeholder had no argument; pass it lazily
        logger.error(u"TransformJournal (acronym: %s) não encontrado!", acronym)
        raise e
def requestissue(config, issue_pid):
    """Request an Issue record from the articlemeta API.

    Retries until a parseable response arrives; on timeout/request/JSON
    errors it logs the error, prints a message and calls ``leave()``.

    :param config: configuration dict holding the articlemeta host
    :param issue_pid: issue PID used as the ``code`` query parameter
    :return: tuple of (xylose ``Issue``, list of valid section codes)
    """
    # Request Issue
    # https://articlemeta.scielo.org/api/v1/issue/?code=0104-070720190001
    uissue = config['articlemeta'][
        'host'] + '/api/v1/issue/?code=%s' % issue_pid
    logger.info(uissue)

    xissue = None
    while xissue is None:
        try:
            rissue = requests.get(uissue)
            xissue = Issue(rissue.json())
        except requests.exceptions.Timeout as e:
            # bugfix: the exception was not bound, so logging "e" raised
            # a NameError instead of reporting the timeout
            logger.info('error: %s' % e)
            print("Timeout - Try again")
            leave()
        except requests.exceptions.RequestException as e:
            logger.info('error: %s' % e)
            print("Request Error - Check your connection and try again")
            leave()
        except json.decoder.JSONDecodeError as e:
            logger.info('error: %s' % e)
            print("Request Error - Try again")
            leave()

    # Valid Codes list: collect section codes that have at least one
    # section text not present in invalid_sec
    seccode_list = []
    if xissue.sections is not None:
        for seccode, sec in xissue.sections.items():
            for sectext in sec.values():
                if sectext not in invalid_sec and seccode not in seccode_list:
                    seccode_list.append(seccode)

    return (xissue, seccode_list)
def process_issues(**context):
    """Processa uma lista de issues carregadas a partir do resultado de
    leitura da base MST.

    Pulls the raw JSON from the ``read_issue_mst`` task via XCom, converts
    each record into a xylose Issue, filters out press releases and ahead
    of print records, and registers/updates each kernel bundle.
    """

    def filter_issues(issues: List[Issue]) -> List[Issue]:
        """Filtra as issues em formato xylose sempre removendo os press
        releases e ahead of print."""
        filters = [
            # idiom fix: "not x == y" replaced with "x != y"
            lambda issue: issue.type != "pressrelease",
            lambda issue: issue.type != "ahead",
        ]
        for f in filters:
            issues = list(filter(f, issues))
        return issues

    issues = context["ti"].xcom_pull(task_ids="read_issue_mst")
    issues = json.loads(issues)
    issues = [Issue({"issue": data}) for data in issues]
    issues = filter_issues(issues)

    issues_as_kernel = [issue_as_kernel(issue) for issue in issues]
    for issue in issues_as_kernel:
        _id = issue.pop("_id")
        # the response was stored in an unused local; it is intentionally
        # ignored here
        register_or_update(_id, issue, KERNEL_API_BUNDLES_ENDPOINT)
def ext_issue(code, **ext_params):
    """Fetch an issue from the articlemeta API and wrap it in xylose.

    :param code: issue PID sent as the ``code`` query parameter
    :param ext_params: extra parameters (currently unused by this function)
    :return: a xylose ``Issue`` built from the API JSON payload
    """
    issue = request.get(
        "%s/issue" % config.AM_URL_API,
        params={"collection": config.get("SCIELO_COLLECTION"), "code": code},
    ).json()
    obj_issue = Issue(issue)
    # bugfix: the constructed Issue was discarded; return it to the caller
    return obj_issue
def test_issue_has_number_returns(self):
    """An "ahead" v32 value must surface as issue.number and an -aop id."""
    data = dict(self.issue_json)
    data["v32"] = [{"_": "ahead"}]
    xylose_issue = Issue({"issue": data})
    kernel_issue = issue_to_kernel(xylose_issue)
    self.assertEqual("ahead", xylose_issue.number)
    self.assertEqual("2448-167X-aop", kernel_issue["id"])
def test_issue_data_to_link_returns_issue_data_to_link_to_journal(self):
    """issue_data_to_link must expose id, number, volume and year."""
    linked = issue_data_to_link(Issue({"issue": self.issues[-1]}))
    expected = {
        "id": "1678-4464-2018-v1-n1",
        "number": "1",
        "volume": "1",
        "year": "2018",
    }
    for key, value in expected.items():
        self.assertEqual(linked[key], value)
def check(self, metadata):
    """Enrich and normalize items of the ``metadata`` dict, which holds
    an issue's metadata.

    The structure of ``metadata`` is the same returned by the JSON format
    of ``articlemeta.scielo.org``, see example:

    https://gist.github.com/gustavofonseca/4a5919db8d0027f37522da7d06bfa876

    :param metadata: issue metadata dict (not mutated; a copy is returned)
    :return: enriched copy of ``metadata``
    """
    metadata_copy = metadata.copy()
    issue = Issue(metadata_copy)

    issns = set([
        issue.journal.any_issn(priority=u'electronic'),
        issue.journal.any_issn(priority=u'print'),
        issue.journal.scielo_issn
    ])

    metadata_copy['code'] = issue.publisher_id
    metadata_copy['code_title'] = list(issns)
    metadata_copy['collection'] = issue.collection_acronym
    metadata_copy['issue_type'] = issue.type
    metadata_copy['publication_year'] = issue.publication_date[0:4]
    metadata_copy['publication_date'] = issue.publication_date

    if not isinstance(issue.data['issue']['processing_date'], datetime):
        try:
            metadata_copy['processing_date'] = datetime.strptime(
                issue.data['issue']['processing_date'], '%Y-%m-%d')
        except (ValueError, TypeError):
            # narrowed from a bare except: strptime raises ValueError on a
            # bad date string and TypeError on a non-string; keep the
            # best-effort fallback to "now"
            metadata_copy['processing_date'] = datetime.now()

    return metadata_copy
def test_should_should_include_electronic_issn(self):
    """The ONLIN v435 ISSN must appear alongside the print ISSN."""
    self.issue_json["v435"] = [{"t": "ONLIN", "_": "10000-000A"}]
    xylose_issue = Issue({"issue": self.issue_json})
    found = sorted(get_journal_issns_from_issue(xylose_issue))
    self.assertEqual(["0001-3714", "10000-000A"], found)
def _check_issue_meta(self, metadata):
    """Check the given metadata and retrieve a dictionary enriched with
    some new fields (code, ISSNs, collection, issue type, publication
    data, shard id and processing date).

    Note: ``metadata`` is mutated in place and also returned.
    """
    issue = Issue(metadata)

    issns = set([
        issue.journal.any_issn(priority=u'electronic'),
        issue.journal.any_issn(priority=u'print'),
        issue.journal.scielo_issn
    ])

    metadata['code'] = issue.publisher_id
    metadata['code_title'] = list(issns)
    metadata['collection'] = issue.collection_acronym
    metadata['issue_type'] = issue.type
    metadata['publication_year'] = issue.publication_date[0:4]
    metadata['publication_date'] = issue.publication_date
    metadata['_shard_id'] = uuid.uuid4().hex

    try:
        metadata['processing_date'] = issue.processing_date
    except Exception:
        # narrowed from a bare except: keep the best-effort fallback to
        # today's date, but stop swallowing SystemExit/KeyboardInterrupt
        metadata['processing_date'] = datetime.now().date().isoformat()

    return metadata
def issue(self, code, collection, replace_journal_metadata=True):
    """Retrieve a single issue through the thrift client.

    :param code: issue PID
    :param collection: collection acronym
    :param replace_journal_metadata: forwarded to the client (always True)
    :return: a xylose ``Issue``, or None when the issue is not found
    :raises ServerError: when the thrift call fails
    :raises ValueError: when the returned payload is not valid JSON
    """
    try:
        issue = self.client.get_issue(
            code=code, collection=collection, replace_journal_metadata=True)
    except self.ARTICLEMETA_THRIFT.ServerError as e:
        msg = 'Error retrieving issue: %s_%s' % (collection, code)
        # chain the cause so the original thrift error stays visible
        raise ServerError(msg) from e

    if not issue:
        logger.warning('Issue not found for: %s_%s' % (collection, code))
        return None

    jissue = None
    try:
        jissue = json.loads(issue)
    except ValueError as e:
        # narrowed from a bare except: json.loads failures are ValueError
        msg = 'Fail to load JSON when retrienving document: %s_%s' % (
            collection, code)
        raise ValueError(msg) from e

    xissue = Issue(jissue)
    logger.info('Issue loaded: %s_%s' % (collection, code))
    return xissue
def setUp(self):
    """Build a minimal issue fixture (publication date + ISSN) and its
    kernel-format counterpart."""
    self.issue_json = {
        "v65": [{"_": "20190129"}],
        "v35": [{"_": "2448-167X"}],
    }
    self._issue = Issue({"issue": self.issue_json})
    self.issue = issue_to_kernel(self._issue)
def issue(self, code, collection):
    """Fetch one issue from the articlemeta REST API.

    :return: a xylose ``Issue``, or None when the request yields nothing
    """
    endpoint = urljoin(self.ARTICLEMETA_URL, self.ISSUE_ENDPOINT)
    query = {'collection': collection, 'code': code}
    payload = self._do_request(endpoint, query)
    if not payload:
        return None
    return Issue(payload)
def test_issue_data_to_link_with_supplement(self):
    """supplement must mirror v131/v132 values, including "0"."""
    cases = (
        ("v131", u"2", "2"),
        ("v132", u"2", "2"),
        ("v131", u"0", "0"),
        ("v132", u"0", "0"),
    )
    data = self.issues[-1]
    for field, value, expected in cases:
        with self.subTest(field=field, value=value, expected=expected):
            data[field] = [{u"_": value}]
            linked = issue_data_to_link(Issue({"issue": data}))
            self.assertEqual(linked["supplement"], expected)
def process_issues(**context):
    """Processa uma lista de issues carregadas a partir do resultado de
    leitura da base MST.

    Reads the issue JSON file produced by the copy task (path pulled from
    XCom), filters out press releases / ahead of print, and
    registers/updates each resulting kernel bundle.
    """
    issue_json_path = context["ti"].xcom_pull(
        task_ids="copy_mst_bases_to_work_folder_task", key="issue_json_path")

    with open(issue_json_path, "r") as f:
        issues = f.read()
        # idiom fix: pass lazy %-args to logging instead of pre-formatting
        logging.info("reading file from %s.", issue_json_path)

    issues = json.loads(issues)
    issues = [Issue({"issue": data}) for data in issues]
    issues = filter_issues(issues)

    issues_as_kernel = [issue_as_kernel(issue) for issue in issues]
    for issue in issues_as_kernel:
        _id = issue.pop("_id")
        register_or_update(_id, issue, KERNEL_API_BUNDLES_ENDPOINT)
def issue(self, code, collection, replace_journal_metadata=True):
    """Retrieve a single issue through the thrift client.

    :param code: issue PID
    :param collection: collection acronym
    :param replace_journal_metadata: forwarded to the client (always True)
    :return: a xylose ``Issue``
    :raises ServerError: when the client call fails or the payload is not
        valid JSON
    """
    try:
        issue = self.client.get_issue(
            code=code, collection=collection, replace_journal_metadata=True)
    except Exception as e:
        # narrowed from a bare except; chain the cause for debuggability
        msg = 'Error retrieving issue: %s_%s' % (collection, code)
        raise ServerError(msg) from e

    try:
        jissue = json.loads(issue)
    except ValueError as e:
        # narrowed from a bare except: json.loads failures are ValueError
        msg = 'Fail to load JSON when retrienving document: %s_%s' % (
            collection, code)
        raise ServerError(msg) from e

    xissue = Issue(jissue)
    logger.info('Issue loaded: %s_%s' % (collection, code))
    return xissue
def issue(self, code, collection, replace_journal_metadata=True):
    """Retrieve a single issue through the dispatcher.

    :param code: issue PID
    :param collection: collection acronym
    :param replace_journal_metadata: forwarded to the dispatcher (always True)
    :return: a xylose ``Issue``, or None when the issue is not found
    :raises ValueError: when the dispatcher payload is not valid JSON
    """
    issue = self.dispatcher('get_issue',
                            code=code,
                            collection=collection,
                            replace_journal_metadata=True)

    if not issue:
        logger.info('Issue not found for: %s_%s', collection, code)
        return None

    jissue = None
    try:
        jissue = json.loads(issue)
    except ValueError as e:
        # narrowed from a bare except: json.loads failures are ValueError
        msg = 'Fail to load JSON when retrienving document: %s_%s' % (
            collection, code)
        raise ValueError(msg) from e

    xissue = Issue(jissue)
    logger.info('Issue loaded: %s_%s' % (collection, code))
    return xissue
def mount_journals_issues_link(issues: List[dict]) -> dict:
    """Build the relation between journals and their issues.

    Produces a dict shaped like ``{"journal_id": ["issue_id"]}``. Issues of
    type ahead or pressrelease are not considered. Field v35 (issue) is
    used to obtain the ``journal_id`` each issue must be linked to.

    :param issues: list of issues extracted from the MST base"""
    journal_issues = {}
    xylose_issues = filter_issues([Issue({"issue": data}) for data in issues])

    for xissue in xylose_issues:
        link = issue_data_to_link(xissue)
        link["order"] = xissue.data["issue"]["v36"][0]["_"]

        journal_id = xissue.data.get("issue").get("v35")[0]["_"]
        bucket = journal_issues.setdefault(journal_id, [])
        if link not in bucket:
            bucket.append(link)

    return journal_issues
def issues_bulk(self, collection=None, issn=None, from_date=None,
                until_date=None, extra_filter=None, limit=LIMIT):
    """Yield xylose Issues in bulk, paginating first over date windows and
    then over result offsets within each window."""
    start = from_date or DEFAULT_FROM_DATE
    end = until_date or datetime.today().isoformat()[:10]

    for window_from, window_until in dates_pagination(start, end):
        offset = 0
        while True:
            raw = self.dispatcher('get_issues',
                                  collection=collection,
                                  issn=issn,
                                  from_date=window_from,
                                  until_date=window_until,
                                  limit=limit,
                                  offset=offset,
                                  extra_filter=extra_filter)
            if raw is None:
                break

            batch = json.loads(raw).get('objects', [])
            if not batch:
                break

            for item in batch:
                yield Issue(item)

            offset += limit
def issues(self, collection=None, issn=None, from_date=None, until_date=None):
    """Yield xylose Issues from the REST API, 100 at a time, paginating
    over date windows and offsets."""
    params = {'limit': 100}
    if collection:
        params['collection'] = collection
    if issn:
        params['issn'] = issn

    start = from_date or DEFAULT_FROM_DATE
    end = until_date or datetime.today().isoformat()[:10]

    for window_from, window_until in dates_pagination(start, end):
        params['from'] = window_from
        params['until'] = window_until
        params['offset'] = 0
        while True:
            url = urljoin(self.ARTICLEMETA_URL, self.ISSUES_ENDPOINT)
            payload = self._do_request(url, params=params)
            if payload is None:
                break

            batch = payload.get('objects', [])
            if not batch:
                break

            for item in batch:
                yield Issue(item)

            params['offset'] += 100
def test_issue_data_to_link_without_number(self):
    """Removing v32 must result in link data without a number."""
    data = self.issues[-1]
    data.pop("v32")
    linked = issue_data_to_link(Issue({"issue": data}))
    self.assertIsNone(linked.get("number"))
def conversion_issues_to_xylose(issues: List[dict]) -> List[Issue]:
    """Convert a list of issues in JSON format into a list of issues in
    xylose format."""
    return [Issue({"issue": record}) for record in issues]
def test_should_not_find_bundles_for_journal(self):
    """A mismatching v35 ISSN must yield no linked bundles."""
    self.issue_json["v35"] = [{"_": "0001-3714X"}]
    xylose_issues = [Issue({"issue": self.issue_json})]
    found = find_documents_bundles(SAMPLE_KERNEL_JOURNAL, xylose_issues)
    self.assertListEqual([], found)
def test_should_link_journal_and_issues(self):
    """The sample issue must be linked to the sample kernel journal."""
    xylose_issues = [Issue({"issue": self.issue_json})]
    found = find_documents_bundles(SAMPLE_KERNEL_JOURNAL, xylose_issues)
    self.assertEqual([SAMPLE_ISSUES_KERNEL[0]["id"]], found)
def test_issue_data_to_link_without_supplement(self):
    """An issue without v131/v132 must not carry a supplement value."""
    linked = issue_data_to_link(Issue({"issue": self.issues[-1]}))
    self.assertIsNone(linked.get("supplement"))
def setUp(self):
    """Create a fresh issue fixture and its xylose wrapper for each test."""
    # Deep copy so individual tests may mutate the JSON without leaking
    # changes into the shared SAMPLE_ISSUES_JSON fixture.
    self.issue_json = deepcopy(SAMPLE_ISSUES_JSON[0])
    self.basic_issue = Issue({"issue": self.issue_json})
def test_issue_data_to_link_without_volume(self):
    """Removing v31 must result in link data without a volume."""
    data = self.issues[-1]
    data.pop("v31")
    linked = issue_data_to_link(Issue({"issue": data}))
    self.assertIsNone(linked.get("volume"))
def test_issue_has_year_in_id_because_it_is_not_aop(self):
    """A non-aop issue (has a v31 volume) must carry the year in its ids."""
    self.issue_json["v31"] = [{"_": "21"}]
    self._issue = Issue({"issue": self.issue_json})
    self.issue = issue_to_kernel(self._issue)
    for key in ("id", "_id"):
        self.assertIn("2019", self.issue[key])