def _transform_latest(cls, document: Document) -> Optional[dict]:
    latest = document.get('latest')
    if latest is None:
        return None
    return {
        "paper_id": latest,
        "href": url_for("api.paper", paper_id=document['paper_id'],
                        version=document.get('latest_version'),
                        _external=True),
        "canonical": url_for("abs", paper_id=document['paper_id'],
                             version=document.get('latest_version')),
        "version": document.get('latest_version')
    }
def test_paper_has_one_version(self, mock_meta, mock_tx, mock_idx,
                               mock_client_factory):
    """The arXiv paper has only one version."""
    mock_client = mock.MagicMock()
    mock_waiter = mock.MagicMock()
    mock_client.get_waiter.return_value = mock_waiter
    mock_client_factory.return_value = mock_client
    processor = consumer.MetadataRecordProcessor(*self.args)
    mock_docmeta = DocMeta(
        version=1,
        paper_id="1234.56789",
        title="foo",
        submitted_date="2001-03-02T03:04:05-400",
    )
    mock_meta.retrieve.return_value = mock_docmeta
    mock_meta.bulk_retrieve.return_value = [mock_docmeta]
    mock_doc = Document(
        version=1,
        paper_id="1234.56789",
        title="foo",
        submitted_date=["2001-03-02T03:04:05-400"],
    )
    mock_tx.to_search_document.return_value = mock_doc
    processor.index_paper("1234.56789")
    mock_idx.bulk_add_documents.assert_called_once_with([mock_doc])
def test_index_returns_result(self, mock_index):
    """Test returns 'OK' + status 200 when index returns results."""
    mock_index.search.return_value = DocumentSet({}, [Document()])
    response, status_code, _ = health_check()
    self.assertEqual(response, 'OK', "Response content should be OK")
    self.assertEqual(status_code, status.HTTP_200_OK,
                     "Should return 200 status code.")
def transform_document(cls, doc: Document,
                       query: Optional[APIQuery] = None) -> dict:
    """Select a subset of :class:`Document` properties for public API."""
    # Only return fields that have been explicitly requested.
    data = {key: value for key, value in doc.items()
            if query is None or key in query.include_fields}
    paper_id = doc['paper_id']
    version = doc['version']
    if 'submitted_date_first' in data:
        data['submitted_date_first'] = \
            doc['submitted_date_first'].isoformat()
    if 'announced_date_first' in data:
        data['announced_date_first'] = \
            doc['announced_date_first'].isoformat()
    if 'formats' in data:
        data['formats'] = [cls._transform_format(fmt, paper_id, version)
                           for fmt in doc['formats']]
    if 'license' in data:
        data['license'] = cls._transform_license(doc['license'])
    if 'latest' in data:
        data['latest'] = cls._transform_latest(doc)
    data['href'] = url_for("api.paper", paper_id=paper_id, version=version,
                           _external=True)
    data['canonical'] = url_for("abs", paper_id=paper_id, version=version)
    return data
def transform_document(
    self, doc: Document, query: Optional[APIQuery] = None
) -> Dict[str, Any]:
    """Select a subset of :class:`Document` properties for public API."""
    # Only return fields that have been explicitly requested.
    data = {
        key: value
        for key, value in doc.items()
        if query is None or key in query.include_fields
    }
    paper_id = doc["paper_id"]
    version = doc["version"]
    if "submitted_date_first" in data:
        data["submitted_date_first"] = doc[
            "submitted_date_first"
        ].isoformat()
    if "announced_date_first" in data:
        data["announced_date_first"] = doc[
            "announced_date_first"
        ].isoformat()
    if "formats" in data:
        data["formats"] = [
            self._transform_format(fmt, paper_id, version)
            for fmt in doc["formats"]
        ]
    if "license" in data:
        data["license"] = self._transform_license(doc["license"])
    if "latest" in data:
        data["latest"] = self._transform_latest(doc)
    data["href"] = url_for(
        "api.paper", paper_id=paper_id, version=version, _external=True
    )
    data["canonical"] = url_for("abs", paper_id=paper_id, version=version)
    return data
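
# A minimal usage sketch for the transformer above -- hypothetical, not part
# of the module. url_for() needs a Flask request context, so stand-in routes
# are registered here for the "api.paper" and "abs" endpoints used above;
# the APITransformer name is also an assumption.
from flask import Flask

_app = Flask(__name__)
_app.add_url_rule("/api/papers/<paper_id>", "api.paper",
                  lambda paper_id: "")
_app.add_url_rule("/abs/<paper_id>", "abs", lambda paper_id: "")

with _app.test_request_context():
    doc = Document(paper_id="1234.56789", version=2, title="foo")
    data = APITransformer().transform_document(doc)
    # data now holds the document fields plus "href" and "canonical" URLs;
    # extra url_for() arguments (e.g. version) become query parameters.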
def get_document(self, document_id: int) -> Document:
    """
    Retrieve a document from the index by ID.

    Uses ``metadata_id`` as the primary identifier for the document.

    Parameters
    ----------
    document_id : int
        Value of ``metadata_id`` in the original document.

    Returns
    -------
    :class:`.Document`

    Raises
    ------
    IndexConnectionError
        Problem communicating with the search index.
    QueryError
        Invalid query parameters.

    """
    with handle_es_exceptions():
        record = self.es.get(index=self.index, doc_type=self.doc_type,
                             id=document_id)
    if not record:
        logger.error("No such document: %s", document_id)
        raise DocumentNotFound('No such document')
    return Document(**record['_source'])  # type: ignore
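
# A hedged usage sketch: a small helper that treats DocumentNotFound as an
# expected outcome rather than an error. `index_service` is assumed to be
# an instance of the class that defines get_document() above.
def find_document(index_service, metadata_id: int) -> Optional[Document]:
    """Return the indexed document, or None if it is not in the index."""
    try:
        return index_service.get_document(metadata_id)
    except DocumentNotFound:
        return None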
def _transform_latest(document: Document) -> Optional[Dict[str, str]]:
    latest = document.get("latest")
    if latest is None:
        return None
    return {  # type:ignore
        "paper_id": latest,
        "href": url_for(
            "api.paper",
            paper_id=document["paper_id"],
            version=document.get("latest_version"),
            _external=True,
        ),
        "canonical": url_for(
            "abs",
            paper_id=document["paper_id"],
            version=document.get("latest_version"),
        ),
        "version": document.get("latest_version"),
    }
def test_index_raises_unhandled_error(self, mock_index,
                                      mock_client_factory):
    """The index raises an unhandled exception."""
    mock_client = mock.MagicMock()
    mock_waiter = mock.MagicMock()
    mock_client.get_waiter.return_value = mock_waiter
    mock_client_factory.return_value = mock_client
    processor = consumer.MetadataRecordProcessor(*self.args)
    mock_index.bulk_add_documents.side_effect = RuntimeError
    with self.assertRaises(consumer.IndexingFailed):
        processor._bulk_add_to_index([Document()])
def test_index_raises_index_connection_error(self, mock_index,
                                             mock_client_factory):
    """The index raises :class:`.index.IndexConnectionError`."""
    mock_client = mock.MagicMock()
    mock_waiter = mock.MagicMock()
    mock_client.get_waiter.return_value = mock_waiter
    mock_client_factory.return_value = mock_client
    processor = consumer.MetadataRecordProcessor(*self.args)
    mock_index.bulk_add_documents.side_effect = index.IndexConnectionError
    with self.assertRaises(consumer.IndexingFailed):
        processor._bulk_add_to_index([Document()])
def test_add_document_succeeds(self, mock_index, mock_client_factory):
    """The search document is added successfully."""
    mock_client = mock.MagicMock()
    mock_waiter = mock.MagicMock()
    mock_client.get_waiter.return_value = mock_waiter
    mock_client_factory.return_value = mock_client
    processor = consumer.MetadataRecordProcessor(*self.args)
    try:
        processor._add_to_index(Document())
    except Exception as ex:
        self.fail(ex)
    mock_index.add_document.assert_called_once()
def to_document(raw: Union[Hit, dict], highlight: bool = True) -> Document:
    """Transform an ES search result back into a :class:`.Document`."""
    # typing: ignore
    result: Dict[str, Any] = {}
    result['match'] = {}        # Hit on field, but no highlighting.
    result['truncated'] = {}    # Preview is truncated.
    for key in Document.fields():
        if type(raw) is Hit:
            if not hasattr(raw, key):
                continue
            value = getattr(raw, key)
        elif type(raw) is dict:
            if key not in raw:
                continue
            value = raw.get(key)
        else:
            continue

        # We want to prevent ES-specific data types from escaping the
        # module API.
        if isinstance(value, AttrList):
            value = value._l_
        elif isinstance(value, AttrDict):
            value = value.to_dict()

        if key == 'primary_classification':
            value = Classification(**value)  # type: ignore
        elif key == 'secondary_classification':
            value = [Classification(**v) for v in value]  # type: ignore
        elif key in ['authors', 'owners']:
            value = [_to_author(au) for au in value]
        elif key == 'submitter':
            value = _to_author(value)
        elif key == 'announced_date_first' and value \
                and isinstance(value, str):
            value = datetime.strptime(value, '%Y-%m').date()
        elif key in ['submitted_date', 'submitted_date_first',
                     'submitted_date_latest']:
            try:
                value = datetime.strptime(value, '%Y-%m-%dT%H:%M:%S%z')
            except (ValueError, TypeError):
                logger.warning(f'Could not parse {key}: {value} as datetime')
        elif key in ['acm_class', 'msc_class'] and value:
            value = '; '.join(value)
        result[key] = value

    if type(raw) is Response:
        result['score'] = raw.meta.score  # type: ignore

    if type(result.get('abstract')) is str and highlight:
        if 'preview' not in result:
            result['preview'] = {}
        result['preview']['abstract'] = preview(result['abstract'])
        if result['preview']['abstract'].endswith('…'):
            result['truncated']['abstract'] = True

    if highlight and type(raw) in [Response, Hit]:
        result['highlight'] = {}
        logger.debug('%s: add highlighting to result',
                     raw.paper_id)  # type: ignore
        result = add_highlighting(result, raw)

    return Document(**result)  # type: ignore
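
# A hedged usage sketch for to_document(). The raw dict stands in for an ES
# record['_source'] payload; all field values are hypothetical.
raw_hit = {
    'paper_id': '1234.56789',
    'title': 'foo',
    'submitted_date': '2001-03-02T03:04:05-0400',
    'acm_class': ['F.2.2', 'H.3.3'],
}
doc = to_document(raw_hit, highlight=False)
# doc['submitted_date'] is now a timezone-aware datetime, and
# doc['acm_class'] has been joined into the string 'F.2.2; H.3.3'.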
def test_paper_has_three_versions(self, mock_meta, mock_tx, mock_idx,
                                  mock_client_factory):
    """The arXiv paper has three versions."""
    mock_client = mock.MagicMock()
    mock_waiter = mock.MagicMock()
    mock_client.get_waiter.return_value = mock_waiter
    mock_client_factory.return_value = mock_client
    processor = consumer.MetadataRecordProcessor(*self.args)
    mock_dm_1 = DocMeta(
        version=1,
        paper_id="1234.56789",
        title="foo",
        submitted_date="2001-03-02T03:04:05-400",
    )
    mock_dm_2 = DocMeta(
        version=2,
        paper_id="1234.56789",
        title="foo",
        submitted_date="2001-03-03T03:04:05-400",
    )
    mock_dm_3 = DocMeta(
        version=3,
        paper_id="1234.56789",
        title="foo",
        submitted_date="2001-03-04T03:04:05-400",
    )
    mock_meta.retrieve.side_effect = [mock_dm_3, mock_dm_1, mock_dm_2]
    mock_meta.bulk_retrieve.return_value = [
        mock_dm_3,
        mock_dm_1,
        mock_dm_2,
        mock_dm_3,
    ]
    mock_doc_1 = Document(
        version=1,
        paper_id="1234.56789",
        title="foo",
        submitted_date=["2001-03-02T03:04:05-400"],
        submitted_date_all=["2001-03-02T03:04:05-400"],
    )
    mock_doc_2 = Document(
        version=2,
        paper_id="1234.56789",
        title="foo",
        submitted_date=["2001-03-03T03:04:05-400"],
        submitted_date_all=[
            "2001-03-02T03:04:05-400",
            "2001-03-03T03:04:05-400",
        ],
    )
    mock_doc_3 = Document(
        version=3,
        paper_id="1234.56789",
        title="foo",
        submitted_date=["2001-03-04T03:04:05-400"],
        submitted_date_all=[
            "2001-03-02T03:04:05-400",
            "2001-03-03T03:04:05-400",
            "2001-03-04T03:04:05-400",
        ],
    )
    mock_tx.to_search_document.side_effect = [
        mock_doc_3,
        mock_doc_1,
        mock_doc_2,
        mock_doc_3,
    ]
    processor.index_paper("1234.56789")
    self.assertEqual(
        mock_meta.bulk_retrieve.call_count,
        1,
        "Metadata for all versions should be retrieved in a single"
        " bulk_retrieve call",
    )
    self.assertEqual(
        mock_meta.retrieve.call_count,
        0,
        "Metadata should not be retrieved per version; bulk_retrieve"
        " covers all versions",
    )
    mock_idx.bulk_add_documents.assert_called_once_with(
        [mock_doc_3, mock_doc_1, mock_doc_2, mock_doc_3])
def transform_document(
    cls,
    fg: FeedGenerator,
    doc: Document,
    query: Optional[ClassicAPIQuery] = None,
) -> None:
    """Select a subset of :class:`Document` properties for public API."""
    entry = fg.add_entry()
    entry.id(
        url_for(
            "abs",
            paper_id=doc["paper_id"],
            version=doc["version"],
            _external=True,
        ))
    entry.title(doc["title"])
    entry.summary(doc["abstract"])
    entry.published(to_utc(doc["submitted_date"]))
    entry.updated(to_utc(doc["updated_date"]))

    entry.link({
        "href": url_for(
            "abs",
            paper_id=doc["paper_id"],
            version=doc["version"],
            _external=True,
        ),
        "type": "text/html",
    })
    entry.link({
        "href": url_for(
            "pdf",
            paper_id=doc["paper_id"],
            version=doc["version"],
            _external=True,
        ),
        "type": "application/pdf",
        "rel": "related",
        "title": "pdf",
    })

    if doc.get("comments"):
        entry.arxiv.comment(doc["comments"])
    if doc.get("journal_ref"):
        entry.arxiv.journal_ref(doc["journal_ref"])
    if doc.get("doi"):
        entry.arxiv.doi(doc["doi"])

    if doc["primary_classification"]["category"] is not None:
        entry.arxiv.primary_category(
            doc["primary_classification"]["category"]["id"])
        entry.category(
            term=doc["primary_classification"]["category"]["id"],
            scheme=ARXIV_NS,
        )

    for category in doc["secondary_classification"]:
        entry.category(term=category["category"]["id"], scheme=ARXIV_NS)

    for author in doc["authors"]:
        author_data: Dict[str, Any] = {"name": author["full_name"]}
        if author.get("affiliation"):
            author_data["affiliation"] = author["affiliation"]
        entry.arxiv.author(author_data)
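
# A hedged sketch of driving the serializer above. The serializer class
# name (AtomXMLSerializer), the feedgen extension classes, and `doc` (a
# fully populated Document; see the fields read above) are all assumptions.
# The custom "arxiv" entry extension must be registered before
# entry.arxiv.comment() and friends can be called, and transform_document()
# calls url_for(), so this must run inside a Flask request context (see the
# earlier sketch).
from datetime import datetime, timezone

from feedgen.feed import FeedGenerator

fg = FeedGenerator()
fg.register_extension("arxiv", ArxivExtension, ArxivEntryExtension)
fg.id("https://example.org/feed")       # hypothetical feed metadata
fg.title("Query results")
fg.link(href="https://example.org/feed")
fg.updated(datetime.now(timezone.utc))

AtomXMLSerializer.transform_document(fg, doc)
print(fg.atom_str(pretty=True))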