def test_license(self):
    """
    Check the ``license`` field of the transformed document.

    A license supplied on the :class:`DocMeta` must appear unchanged on the
    document; a license whose ``uri`` and ``label`` are both ``None`` must be
    replaced by ``transform.DEFAULT_LICENSE``.
    """
    _license = {
        "label": "arXiv.org perpetual, non-exclusive license to"
                 " distribute this article",
        "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"
    }

    # Case 1: an explicit license is passed through as-is.
    explicit = transform.to_search_document(
        DocMeta(paper_id='1234.56789', license=_license)
    )
    for key in ('uri', 'label'):
        self.assertEqual(explicit.license[key], _license[key])

    # Case 2: an empty license falls back to the default.
    blank_license = {'uri': None, 'label': None}
    fallback = transform.to_search_document(
        DocMeta(paper_id='1234.56789', license=blank_license)
    )
    for key in ('uri', 'label'):
        self.assertEqual(fallback.license[key],
                         transform.DEFAULT_LICENSE[key],
                         "The default license should be used")
def test_license(self):
    """
    Check the ``license`` entry of the transformed document.

    A license supplied on the :class:`DocMeta` must appear unchanged in the
    document; a license whose ``uri`` and ``label`` are both ``None`` must be
    replaced by ``transform.DEFAULT_LICENSE``.
    """
    _license = {
        "label": "arXiv.org perpetual, non-exclusive license to"
                 " distribute this article",
        "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/",
    }

    # Case 1: an explicit license is passed through as-is.
    explicit = transform.to_search_document(
        DocMeta(paper_id="1234.56789", license=_license)
    )
    for key in ("uri", "label"):
        self.assertEqual(explicit["license"][key], _license[key])

    # Case 2: an empty license falls back to the default.
    blank_license = {"uri": None, "label": None}
    fallback = transform.to_search_document(
        DocMeta(paper_id="1234.56789", license=blank_license)
    )
    for key in ("uri", "label"):
        self.assertEqual(
            fallback["license"][key],
            transform.DEFAULT_LICENSE[key],
            "The default license should be used",
        )
def test_submitted_date_all(self):
    """
    Check ``submitted_date_all`` and the fields derived from it.

    The document must keep both submission dates in order, and expose the
    first and last of them as ``submitted_date_first`` and
    ``submitted_date_latest`` respectively.
    """
    earliest = "2007-04-25T15:58:28-0400"
    most_recent = "2007-04-25T16:06:50-0400"
    docmeta = DocMeta(
        paper_id="1234.56789",
        submitted_date_all=[earliest, most_recent],
        is_current=True,
    )
    document = transform.to_search_document(docmeta)

    self.assertEqual(document["submitted_date_all"][0], earliest)
    self.assertEqual(document["submitted_date_all"][1], most_recent)
    self.assertEqual(
        document["submitted_date_first"],
        earliest,
        "Should be populated from submitted_date_all",
    )
    self.assertEqual(
        document["submitted_date_latest"],
        most_recent,
        "Should be populated from submitted_date_all",
    )
def _transform_to_document(docmeta: DocMeta) -> Document:
    """
    Convert a paper's :class:`.DocMeta` into a search :class:`.Document`.

    Parameters
    ----------
    docmeta : :class:`DocMeta`
        Metadata for an arXiv paper.

    Returns
    -------
    :class:`.Document`
        The result of :func:`transform.to_search_document`.

    Raises
    ------
    DocumentFailed
        The transform raised an exception. The original exception is logged
        and chained as the cause; this may have no bearing on the success of
        subsequent papers.

    """
    try:
        return transform.to_search_document(docmeta)
    except Exception as exc:
        # No transform-specific exception types exist yet, so anything the
        # transform raises is treated as a failure of this one document.
        logger.error('unhandled exception during transform: %s', exc)
        raise DocumentFailed('Could not transform document') from exc
def test_transform(self):
    """
    Verify the ID and version fields of every bulk-fixture document.

    Loads ``tests/data/docmeta_bulk.json``, builds a :class:`DocMeta` from
    each entry, transforms all of them, and then checks ``id``,
    ``paper_id``, ``paper_id_v``, ``version``, ``is_current`` and
    ``latest_version`` on each resulting document. ``latest`` is checked
    only for version 2 documents.
    """
    # Decode the fixture as UTF-8 explicitly so the test does not depend on
    # the locale's default encoding.
    with open("tests/data/docmeta_bulk.json", encoding="utf-8") as f:
        data = json.load(f)

    docmeta = [DocMeta(**datum) for datum in data]
    documents = [transform.to_search_document(meta) for meta in docmeta]

    for doc in documents:
        self.assertIsNotNone(doc["id"])
        self.assertGreater(len(doc["id"]), 0)

        # The bare paper ID carries no version suffix...
        self.assertIsNotNone(doc["paper_id"])
        self.assertGreater(len(doc["paper_id"]), 0)
        self.assertNotIn("v", doc["paper_id"])

        # ...whereas the versioned ID does.
        self.assertIsNotNone(doc["paper_id_v"])
        self.assertGreater(len(doc["paper_id_v"]), 0)
        self.assertIn("v", doc["paper_id_v"])

        self.assertIsNotNone(doc["version"])
        self.assertGreater(doc["version"], 0)

        # Version 2 is expected to be the current version in this fixture.
        if doc["version"] == 2:
            self.assertEqual(doc["latest"], f"{doc['paper_id']}v2")
            self.assertTrue(doc["is_current"])
        else:
            self.assertFalse(doc["is_current"])

        # Checked for every document, whichever branch ran above.
        self.assertEqual(doc["id"], doc["paper_id_v"])
        # NOTE(review): assumed to apply to every document, not only to the
        # non-current ones -- confirm against the intended fixture contents.
        self.assertEqual(doc["latest_version"], 2)
def test_transform(self):
    """
    Verify the ID and version fields of every bulk-fixture document.

    Loads ``tests/data/docmeta_bulk.json``, builds a :class:`DocMeta` from
    each entry, transforms all of them, and then checks ``id``,
    ``paper_id``, ``paper_id_v``, ``version``, ``is_current`` and
    ``latest_version`` on each resulting document. ``latest`` is checked
    only for version 2 documents.
    """
    # Decode the fixture as UTF-8 explicitly so the test does not depend on
    # the locale's default encoding.
    with open('tests/data/docmeta_bulk.json', encoding='utf-8') as f:
        data = json.load(f)

    docmeta = [DocMeta(**datum) for datum in data]
    documents = [transform.to_search_document(meta) for meta in docmeta]

    for doc in documents:
        self.assertIsNotNone(doc.id)
        self.assertGreater(len(doc.id), 0)

        # The bare paper ID carries no version suffix...
        self.assertIsNotNone(doc.paper_id)
        self.assertGreater(len(doc.paper_id), 0)
        self.assertNotIn('v', doc.paper_id)

        # ...whereas the versioned ID does.
        self.assertIsNotNone(doc.paper_id_v)
        self.assertGreater(len(doc.paper_id_v), 0)
        self.assertIn('v', doc.paper_id_v)

        self.assertIsNotNone(doc.version)
        self.assertGreater(doc.version, 0)

        # Version 2 is expected to be the current version in this fixture.
        if doc.version == 2:
            self.assertEqual(doc.latest, f"{doc.paper_id}v2")
            self.assertTrue(doc.is_current)
        else:
            self.assertFalse(doc.is_current)

        # Checked for every document, whichever branch ran above.
        self.assertEqual(doc.id, doc.paper_id_v)
        # NOTE(review): assumed to apply to every document, not only to the
        # non-current ones -- confirm against the intended fixture contents.
        self.assertEqual(doc.latest_version, 2)
def test_announced_date_first(self):
    """Check ``announced_date_first`` is carried over to the document."""
    expected = '2007-04'
    docmeta = DocMeta(paper_id='1234.56789', announced_date_first=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document.announced_date_first, expected)
def test_metadata_id(self):
    """Check ``metadata_id`` is carried over to the document."""
    expected = '690776'
    docmeta = DocMeta(paper_id='1234.56789', metadata_id=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document['metadata_id'], expected)
def test_proxy(self):
    """Check a true ``proxy`` flag is still truthy on the document."""
    docmeta = DocMeta(paper_id='1234.56789', proxy=True)
    document = transform.to_search_document(docmeta)
    self.assertTrue(document['proxy'])
def test_announced_date_first(self):
    """Check ``announced_date_first`` is carried over to the document."""
    expected = "2007-04"
    docmeta = DocMeta(paper_id="1234.56789", announced_date_first=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document["announced_date_first"], expected)
def test_msc_class(self):
    """Check a comma-separated ``msc_class`` string becomes a list."""
    docmeta = DocMeta(paper_id="1234.56789", msc_class="03B70,68Q60")
    document = transform.to_search_document(docmeta)
    expected_codes = ["03B70", "68Q60"]
    self.assertEqual(document["msc_class"], expected_codes)
def test_report_num(self):
    """Check ``report_num`` is carried over to the document unchanged."""
    expected = "Physica A, 245 (1997) 181"
    docmeta = DocMeta(paper_id='1234.56789', report_num=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document.report_num, expected)
def test_acm_class(self):
    """Check a semicolon-separated ``acm_class`` string becomes a list."""
    docmeta = DocMeta(paper_id='1234.56789', acm_class="F.4.1; D.2.4")
    document = transform.to_search_document(docmeta)
    expected_codes = ["F.4.1", "D.2.4"]
    self.assertEqual(document.acm_class, expected_codes)
def test_authors_freeform(self):
    """Check ``authors_utf8`` ends up in ``authors_freeform``."""
    expected = "authors!"
    docmeta = DocMeta(paper_id="1234.56789", authors_utf8=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document["authors_freeform"], expected)
def test_metadata_id2(self):
    """
    Check ``comments_utf8`` ends up in ``comments``.

    NOTE(review): despite its name, this test exercises the comments field,
    not ``metadata_id``.
    """
    expected = "comments!"
    docmeta = DocMeta(paper_id="1234.56789", comments_utf8=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document["comments"], expected)
def test_doi(self):
    """Check a single ``doi`` string becomes a one-element list."""
    doi_value = "10.1103/PhysRevD.76.104043"
    docmeta = DocMeta(paper_id="1234.56789", doi=doi_value)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document["doi"], [doi_value])
def test_acm_class(self):
    """Check a semicolon-separated ``acm_class`` string becomes a list."""
    docmeta = DocMeta(paper_id="1234.56789", acm_class="F.4.1; D.2.4")
    document = transform.to_search_document(docmeta)
    expected_codes = ["F.4.1", "D.2.4"]
    self.assertEqual(document["acm_class"], expected_codes)
def test_abstract(self):
    """Check ``abstract_utf8`` ends up in ``abstract``."""
    expected = "abstract!"
    docmeta = DocMeta(paper_id="1234.56789", abstract_utf8=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document["abstract"], expected)
def test_metadata_id(self):
    """
    Check ``comments_utf8`` ends up in ``comments``.

    NOTE(review): despite its name, this test exercises the comments field,
    not ``metadata_id``.
    """
    expected = 'comments!'
    docmeta = DocMeta(paper_id='1234.56789', comments_utf8=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document.comments, expected)
def test_abstract(self):
    """Check ``abstract_utf8`` ends up in ``abstract``."""
    expected = 'abstract!'
    docmeta = DocMeta(paper_id='1234.56789', abstract_utf8=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document.abstract, expected)
def test_title_utf8(self):
    """Check a non-ASCII ``title_utf8`` ends up unchanged in ``title``."""
    expected = 'foö title'
    docmeta = DocMeta(paper_id='1234.56789', title_utf8=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document['title'], expected)
def test_msc_class(self):
    """Check a comma-separated ``msc_class`` string becomes a list."""
    docmeta = DocMeta(paper_id='1234.56789', msc_class="03B70,68Q60")
    document = transform.to_search_document(docmeta)
    expected_codes = ["03B70", "68Q60"]
    self.assertEqual(document.msc_class, expected_codes)
def test_report_num(self):
    """Check ``report_num`` is carried over to the document unchanged."""
    expected = "Physica A, 245 (1997) 181"
    docmeta = DocMeta(paper_id="1234.56789", report_num=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document["report_num"], expected)
def test_doi(self):
    """Check a single ``doi`` string becomes a one-element list."""
    doi_value = '10.1103/PhysRevD.76.104043'
    docmeta = DocMeta(paper_id='1234.56789', doi=doi_value)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document.doi, [doi_value])
def test_is_withdrawn(self):
    """Check a false ``is_withdrawn`` flag is still falsy on the document."""
    docmeta = DocMeta(paper_id='1234.56789', is_withdrawn=False)
    document = transform.to_search_document(docmeta)
    self.assertFalse(document['is_withdrawn'])
def test_version(self):
    """Check ``version`` is carried over to the document."""
    expected = 25
    docmeta = DocMeta(paper_id='1234.56789', version=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document['version'], expected)
def test_updated_date(self):
    """Check ``updated_date`` is carried over to the document unchanged."""
    expected = "2007-04-25T16:06:50-0400"
    docmeta = DocMeta(paper_id="1234.56789", updated_date=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document["updated_date"], expected)
def test_authors_freeform(self):
    """Check ``authors_utf8`` ends up in ``authors_freeform``."""
    expected = 'authors!'
    docmeta = DocMeta(paper_id='1234.56789', authors_utf8=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document.authors_freeform, expected)
def test_submitted_date(self):
    """Check ``submitted_date`` is carried over to the document unchanged."""
    expected = '2007-04-25T16:06:50-0400'
    docmeta = DocMeta(paper_id='1234.56789', submitted_date=expected)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document.submitted_date, expected)
def test_source(self):
    """Check the ``source`` mapping is carried over to the document."""
    _source = {"flags": "1", "format": "pdf", "size_bytes": 1230119}
    docmeta = DocMeta(paper_id='1234.56789', source=_source)
    document = transform.to_search_document(docmeta)
    self.assertEqual(document['source'], _source)