def test_license(self):
    """Field ``license`` is taken from ``license``, else the default."""
    expected = {
        "label": "arXiv.org perpetual, non-exclusive license to"
                 " distribute this article",
        "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/",
    }

    # Case 1: an explicit license; its uri and label should come through.
    docmeta = DocMeta(paper_id='1234.56789', license=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document.license['uri'], expected['uri'])
    self.assertEqual(document.license['label'], expected['label'])

    # Case 2: uri and label are both None; the default license is expected.
    blank_license = {'uri': None, 'label': None}
    docmeta = DocMeta(paper_id='1234.56789', license=blank_license)
    document = transform.to_search_document(docmeta)
    self.assertEqual(
        document.license['uri'],
        transform.DEFAULT_LICENSE['uri'],
        "The default license should be used",
    )
    self.assertEqual(
        document.license['label'],
        transform.DEFAULT_LICENSE['label'],
        "The default license should be used",
    )
def test_license(self):
    """Field ``license`` is taken from ``license``, else the default."""
    expected = {
        "label": "arXiv.org perpetual, non-exclusive license to"
                 " distribute this article",
        "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/",
    }

    # Case 1: an explicit license; its uri and label should come through.
    docmeta = DocMeta(paper_id="1234.56789", license=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document["license"]["uri"], expected["uri"])
    self.assertEqual(document["license"]["label"], expected["label"])

    # Case 2: uri and label are both None; the default license is expected.
    blank_license = {"uri": None, "label": None}
    docmeta = DocMeta(paper_id="1234.56789", license=blank_license)
    document = transform.to_search_document(docmeta)
    self.assertEqual(
        document["license"]["uri"],
        transform.DEFAULT_LICENSE["uri"],
        "The default license should be used",
    )
    self.assertEqual(
        document["license"]["label"],
        transform.DEFAULT_LICENSE["label"],
        "The default license should be used",
    )
def from_cache(cache_dir: str, arxiv_id: str) -> List[DocMeta]:
    """
    Get the docmeta documents for a paper from a local cache, if available.

    Parameters
    ----------
    cache_dir : str
        Directory that holds the cached JSON files.
    arxiv_id : str
        Paper identifier; every ``/`` is replaced with ``_`` to build the
        cache file name ``<arxiv_id>.json``.

    Returns
    -------
    list
        One :class:`.DocMeta` for each item in the cached JSON document.

    Raises
    ------
    RuntimeError
        Raised when the cache is not available, or the document could not be
        found in the cache.

    """
    fname = '%s.json' % arxiv_id.replace('/', '_')
    cache_path = os.path.join(cache_dir, fname)
    # Open directly instead of checking existence first: this avoids the
    # window between the check and the open, and reads the file as UTF-8
    # regardless of the platform's default locale encoding.
    try:
        with open(cache_path, encoding='utf-8') as f:
            data: List[dict] = json.load(f)
    except (FileNotFoundError, NotADirectoryError) as e:
        raise RuntimeError('No cached document') from e
    return [DocMeta(**datum) for datum in data]  # type: ignore
def from_cache(cache_dir: str, arxiv_id: str) -> List[DocMeta]:
    """
    Get the docmeta documents for a paper from a local cache, if available.

    Parameters
    ----------
    cache_dir : str
        Directory that holds the cached JSON files. A falsy value means
        caching is disabled.
    arxiv_id : str
        Paper identifier; every ``/`` is replaced with ``_`` to build the
        cache file name ``<arxiv_id>.json``.

    Returns
    -------
    list
        One :class:`.DocMeta` for each item in the cached JSON document, or
        an empty list when caching is disabled or nothing is cached for
        ``arxiv_id``.

    """
    if not cache_dir:
        return []  # caching is disabled

    fname = "%s.json" % arxiv_id.replace("/", "_")
    cache_path = os.path.join(cache_dir, fname)
    # A missing file is the ordinary cache-miss case, so it is handled
    # directly rather than by raising and re-catching RuntimeError. Keeping
    # the ``try`` this narrow also stops unrelated errors from being
    # reported as a cache miss.
    try:
        with open(cache_path, encoding="utf-8") as f:
            data: List[dict] = json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        return []
    # See https://github.com/python/mypy/issues/3937
    return [DocMeta(**datum) for datum in data]  # type: ignore
def bulk_retrieve(self, document_ids: List[str]) -> List[DocMeta]:
    """
    Retrieve metadata for one or more arXiv papers in a single request.

    Parameters
    ----------
    document_ids : List[str]
        Identifiers to look up; each one is sent as an ``id`` query
        parameter of the ``/docmeta_bulk`` request.

    Returns
    -------
    list
        One :class:`.DocMeta` for each item in the decoded JSON response.

    Raises
    ------
    ValueError
        If ``document_ids`` is empty.
    SecurityException
        If the request fails with an SSL error.
    ConnectionFailed
        If the request fails with a connection error.
    RequestFailed
        If the response status is neither 200 nor 206.
    BadResponse
        If the response body cannot be decoded as JSON.

    NOTE(review): earlier documentation listed ``IOError`` here; confirm
    whether the service exceptions above derive from it.

    """
    if not document_ids:  # This could use further elaboration.
        raise ValueError('Invalid value for document_ids')

    # The identifiers are interpolated into the query string verbatim.
    query_string = '/docmeta_bulk?' + '&'.join(
        f'id={document_id}' for document_id in document_ids
    )

    try:
        target = urljoin(self.endpoint, query_string)
        logger.debug(
            f'{document_ids}: retrieve metadata from {target} with SSL'
            f' verify {self._verify_cert}'
        )
        response = self._session.get(target, verify=self._verify_cert)
    except requests.exceptions.SSLError as e:
        logger.error('SSLError: %s', e)
        raise SecurityException('SSL failed: %s' % e) from e
    except requests.exceptions.ConnectionError as e:
        logger.error('ConnectionError: %s', e)
        raise ConnectionFailed(
            'Could not connect to metadata service: %s' % e
        ) from e

    acceptable = (status.HTTP_200_OK, status.HTTP_206_PARTIAL_CONTENT)
    if response.status_code not in acceptable:
        logger.error('Request failed: %s', response.content)
        raise RequestFailed(
            '%s: failed with %i: %s' % (
                document_ids, response.status_code, response.content
            )
        )
    logger.debug(f'{document_ids}: response OK')

    # Depending on the installed requests/simplejson combination, a decode
    # failure may surface as different exception classes; all of them are
    # ValueError subclasses, so catching ValueError wraps every variant.
    # Only the decode step is guarded, so errors from building DocMeta
    # objects still propagate unchanged.
    try:
        resp = response.json()  # A list with metadata for each paper.
    except ValueError as e:
        logger.error('JSONDecodeError: %s', e)
        raise BadResponse(
            '%s: could not decode response: %s' % (document_ids, e)
        ) from e

    data: List[DocMeta] = [
        DocMeta(**value) for value in resp  # type: ignore
    ]
    logger.debug(f'{document_ids}: response decoded; done!')
    return data
def test_transform(self):
    """Paper ID and version fields are consistent on every document."""
    with open('tests/data/docmeta_bulk.json') as fixture:
        records = json.load(fixture)

    metas = [DocMeta(**record) for record in records]
    results = [transform.to_search_document(m) for m in metas]

    for result in results:
        # Identifier fields must be present and non-empty.
        self.assertIsNotNone(result.id)
        self.assertGreater(len(result.id), 0)

        self.assertIsNotNone(result.paper_id)
        self.assertGreater(len(result.paper_id), 0)
        self.assertNotIn('v', result.paper_id)

        self.assertIsNotNone(result.paper_id_v)
        self.assertGreater(len(result.paper_id_v), 0)
        self.assertIn('v', result.paper_id_v)

        self.assertIsNotNone(result.version)
        self.assertGreater(result.version, 0)

        # Version 2 is treated as the current version in this fixture.
        if result.version != 2:
            self.assertFalse(result.is_current)
        else:
            self.assertEqual(result.latest, f"{result.paper_id}v2")
            self.assertTrue(result.is_current)
        # Checked last in both branches of the original.
        self.assertEqual(result.id, result.paper_id_v)

        self.assertEqual(result.latest_version, 2)
def test_transform(self):
    """Paper ID and version fields are consistent on every document."""
    with open("tests/data/docmeta_bulk.json") as fixture:
        records = json.load(fixture)

    metas = [DocMeta(**record) for record in records]
    results = [transform.to_search_document(m) for m in metas]

    for result in results:
        # Identifier fields must be present and non-empty.
        self.assertIsNotNone(result["id"])
        self.assertGreater(len(result["id"]), 0)

        self.assertIsNotNone(result["paper_id"])
        self.assertGreater(len(result["paper_id"]), 0)
        self.assertNotIn("v", result["paper_id"])

        self.assertIsNotNone(result["paper_id_v"])
        self.assertGreater(len(result["paper_id_v"]), 0)
        self.assertIn("v", result["paper_id_v"])

        self.assertIsNotNone(result["version"])
        self.assertGreater(result["version"], 0)

        # Version 2 is treated as the current version in this fixture.
        if result["version"] != 2:
            self.assertFalse(result["is_current"])
        else:
            self.assertEqual(result["latest"], f"{result['paper_id']}v2")
            self.assertTrue(result["is_current"])
        # Checked last in both branches of the original.
        self.assertEqual(result["id"], result["paper_id_v"])

        self.assertEqual(result["latest_version"], 2)
def test_submitted_date_all(self):
    """``submitted_date_all`` also feeds the first/latest date fields."""
    earliest = "2007-04-25T15:58:28-0400"
    most_recent = "2007-04-25T16:06:50-0400"
    docmeta = DocMeta(
        paper_id="1234.56789",
        submitted_date_all=[earliest, most_recent],
        is_current=True,
    )

    document = transform.to_search_document(docmeta)

    # The list itself is expected to keep its order.
    self.assertEqual(document["submitted_date_all"][0], earliest)
    self.assertEqual(document["submitted_date_all"][1], most_recent)
    # The derived fields take the first and last entries respectively.
    self.assertEqual(
        document["submitted_date_first"],
        earliest,
        "Should be populated from submitted_date_all",
    )
    self.assertEqual(
        document["submitted_date_latest"],
        most_recent,
        "Should be populated from submitted_date_all",
    )
def test_paper_has_one_version(self, mock_meta, mock_tx, mock_idx,
                               mock_client_factory):
    """A paper with a single version is bulk-indexed as one document."""
    # Wire up a fake client whose waiter is also a mock.
    waiter = mock.MagicMock()
    client = mock.MagicMock()
    client.get_waiter.return_value = waiter
    mock_client_factory.return_value = client

    processor = consumer.MetadataRecordProcessor(*self.args)

    # The metadata service returns the same record for single and bulk
    # retrieval.
    docmeta = DocMeta(
        version=1,
        paper_id="1234.56789",
        title="foo",
        submitted_date="2001-03-02T03:04:05-400",
    )
    mock_meta.retrieve.return_value = docmeta
    mock_meta.bulk_retrieve.return_value = [docmeta]

    # The transform step is stubbed to hand back this document.
    document = Document(
        version=1,
        paper_id="1234.56789",
        title="foo",
        submitted_date=["2001-03-02T03:04:05-400"],
    )
    mock_tx.to_search_document.return_value = document

    processor.index_paper("1234.56789")

    mock_idx.bulk_add_documents.assert_called_once_with([document])
def test_report_num(self):
    """``report_num`` on the document mirrors ``report_num`` on the input."""
    expected = "Physica A, 245 (1997) 181"
    docmeta = DocMeta(paper_id="1234.56789", report_num=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document["report_num"], expected)
def test_acm_class(self):
    """``acm_class`` is split from the ``acm_class`` string into a list."""
    docmeta = DocMeta(paper_id="1234.56789", acm_class="F.4.1; D.2.4")
    document = transform.to_search_document(docmeta)
    expected = ["F.4.1", "D.2.4"]
    self.assertEqual(document["acm_class"], expected)
def test_submitted_date(self):
    """``submitted_date`` on the document mirrors the input value."""
    expected = '2007-04-25T16:06:50-0400'
    docmeta = DocMeta(paper_id='1234.56789', submitted_date=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document.submitted_date, expected)
def test_doi(self):
    """``doi`` on the document is a list holding the input ``doi``."""
    raw_doi = '10.1103/PhysRevD.76.104043'
    docmeta = DocMeta(paper_id='1234.56789', doi=raw_doi)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document.doi, [raw_doi])
def test_msc_class(self):
    """``msc_class`` is split from the ``msc_class`` string into a list."""
    docmeta = DocMeta(paper_id='1234.56789', msc_class="03B70,68Q60")
    document = transform.to_search_document(docmeta)
    expected = ["03B70", "68Q60"]
    self.assertEqual(document.msc_class, expected)
def test_authors_freeform(self):
    """``authors_freeform`` on the document comes from ``authors_utf8``."""
    expected = "authors!"
    docmeta = DocMeta(paper_id="1234.56789", authors_utf8=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document["authors_freeform"], expected)
def test_metadata_id2(self):
    """``comments`` on the document comes from ``comments_utf8``."""
    # NOTE(review): the test name mentions metadata_id, but the assertion
    # below covers the ``comments`` field.
    expected = "comments!"
    docmeta = DocMeta(paper_id="1234.56789", comments_utf8=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document["comments"], expected)
def test_doi(self):
    """``doi`` on the document is a list holding the input ``doi``."""
    raw_doi = "10.1103/PhysRevD.76.104043"
    docmeta = DocMeta(paper_id="1234.56789", doi=raw_doi)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document["doi"], [raw_doi])
def test_abstract(self):
    """``abstract`` on the document comes from ``abstract_utf8``."""
    expected = 'abstract!'
    docmeta = DocMeta(paper_id='1234.56789', abstract_utf8=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document.abstract, expected)
def test_report_num(self):
    """``report_num`` on the document mirrors ``report_num`` on the input."""
    expected = "Physica A, 245 (1997) 181"
    docmeta = DocMeta(paper_id='1234.56789', report_num=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document.report_num, expected)
def test_is_withdrawn(self):
    """``is_withdrawn`` on the document mirrors the input flag."""
    docmeta = DocMeta(paper_id='1234.56789', is_withdrawn=False)
    document = transform.to_search_document(docmeta)
    withdrawn = document['is_withdrawn']
    self.assertFalse(withdrawn)
def test_acm_class(self):
    """``acm_class`` is split from the ``acm_class`` string into a list."""
    docmeta = DocMeta(paper_id='1234.56789', acm_class="F.4.1; D.2.4")
    document = transform.to_search_document(docmeta)
    expected = ["F.4.1", "D.2.4"]
    self.assertEqual(document.acm_class, expected)
def test_title_utf8(self):
    """``title`` on the document comes from ``title_utf8``."""
    expected = 'foö title'  # non-ASCII character is intentional
    docmeta = DocMeta(paper_id='1234.56789', title_utf8=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document['title'], expected)
def test_metadata_id(self):
    """``comments`` on the document comes from ``comments_utf8``."""
    # NOTE(review): the test name mentions metadata_id, but the assertion
    # below covers the ``comments`` field.
    expected = 'comments!'
    docmeta = DocMeta(paper_id='1234.56789', comments_utf8=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document.comments, expected)
def test_version(self):
    """``version`` on the document mirrors ``version`` on the input."""
    expected = 25
    docmeta = DocMeta(paper_id='1234.56789', version=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document['version'], expected)
def test_authors_freeform(self):
    """``authors_freeform`` on the document comes from ``authors_utf8``."""
    expected = 'authors!'
    docmeta = DocMeta(paper_id='1234.56789', authors_utf8=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document.authors_freeform, expected)
def test_proxy(self):
    """``proxy`` on the document mirrors the input flag."""
    docmeta = DocMeta(paper_id='1234.56789', proxy=True)
    document = transform.to_search_document(docmeta)
    proxied = document['proxy']
    self.assertTrue(proxied)
def test_abstract(self):
    """``abstract`` on the document comes from ``abstract_utf8``."""
    expected = "abstract!"
    docmeta = DocMeta(paper_id="1234.56789", abstract_utf8=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document["abstract"], expected)
def test_announced_date_first(self):
    """``announced_date_first`` on the document mirrors the input value."""
    expected = '2007-04'
    docmeta = DocMeta(paper_id='1234.56789', announced_date_first=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document.announced_date_first, expected)
def test_metadata_id(self):
    """``metadata_id`` on the document mirrors the input value."""
    expected = '690776'
    docmeta = DocMeta(paper_id='1234.56789', metadata_id=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document['metadata_id'], expected)
def test_msc_class(self):
    """``msc_class`` is split from the ``msc_class`` string into a list."""
    docmeta = DocMeta(paper_id="1234.56789", msc_class="03B70,68Q60")
    document = transform.to_search_document(docmeta)
    expected = ["03B70", "68Q60"]
    self.assertEqual(document["msc_class"], expected)