Beispiel #1
0
    def test_license(self):
        """An explicit ``license`` is kept; an empty one yields the default.

        Two scenarios are checked: a license with both ``uri`` and ``label``
        set, and a license whose ``uri`` and ``label`` are both ``None``.
        """
        explicit = {
            "label": "arXiv.org perpetual, non-exclusive license to"
            " distribute this article",
            "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"
        }
        docmeta = DocMeta(paper_id='1234.56789', license=explicit)
        document = transform.to_search_document(docmeta)
        for key in ('uri', 'label'):
            self.assertEqual(document.license[key], explicit[key])

        # With no usable values, the document is expected to carry
        # ``transform.DEFAULT_LICENSE`` instead.
        blank = {'uri': None, 'label': None}
        docmeta = DocMeta(paper_id='1234.56789', license=blank)
        document = transform.to_search_document(docmeta)
        for key in ('uri', 'label'):
            self.assertEqual(document.license[key],
                             transform.DEFAULT_LICENSE[key],
                             "The default license should be used")
Beispiel #2
0
    def test_license(self):
        """An explicit ``license`` is kept; an empty one yields the default.

        Two scenarios are checked: a license with both ``uri`` and ``label``
        set, and a license whose ``uri`` and ``label`` are both ``None``.
        """
        explicit = {
            "label": "arXiv.org perpetual, non-exclusive license to"
            " distribute this article",
            "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/",
        }
        docmeta = DocMeta(paper_id="1234.56789", license=explicit)
        document = transform.to_search_document(docmeta)
        for key in ("uri", "label"):
            self.assertEqual(document["license"][key], explicit[key])

        # With no usable values, the document is expected to carry
        # ``transform.DEFAULT_LICENSE`` instead.
        blank = {"uri": None, "label": None}
        docmeta = DocMeta(paper_id="1234.56789", license=blank)
        document = transform.to_search_document(docmeta)
        for key in ("uri", "label"):
            self.assertEqual(
                document["license"][key],
                transform.DEFAULT_LICENSE[key],
                "The default license should be used",
            )
Beispiel #3
0
 def test_submitted_date_all(self):
     """``submitted_date_all`` is copied and seeds the first/latest dates.

     The test expects ``submitted_date_first`` and ``submitted_date_latest``
     to equal the first and second entries of ``submitted_date_all``.
     """
     earliest = "2007-04-25T15:58:28-0400"
     newest = "2007-04-25T16:06:50-0400"
     docmeta = DocMeta(
         paper_id="1234.56789",
         submitted_date_all=[earliest, newest],
         is_current=True,
     )
     document = transform.to_search_document(docmeta)
     self.assertEqual(document["submitted_date_all"][0], earliest)
     self.assertEqual(document["submitted_date_all"][1], newest)
     self.assertEqual(
         document["submitted_date_first"],
         earliest,
         "Should be populated from submitted_date_all",
     )
     self.assertEqual(
         document["submitted_date_latest"],
         newest,
         "Should be populated from submitted_date_all",
     )
Beispiel #4
0
    def _transform_to_document(docmeta: DocMeta) -> Document:
        """
        Build a search :class:`.Document` from a paper's :class:`.DocMeta`.

        Parameters
        ----------
        docmeta : :class:`DocMeta`
            Metadata for an arXiv paper.

        Returns
        -------
        :class:`.Document`
            The value returned by ``transform.to_search_document``.

        Raises
        ------
        DocumentFailed
            Transformation of the document raised an exception. The original
            exception is logged and chained as the cause. This may have no
            bearing on the success of subsequent papers.

        """
        try:
            return transform.to_search_document(docmeta)
        except Exception as exc:
            # There are no transform-specific exception types at the moment,
            # so every failure is reported the same way.
            logger.error('unhandled exception during transform: %s', exc)
            raise DocumentFailed('Could not transform document') from exc
Beispiel #5
0
    def test_transform(self):
        """All of the paper ID and version fields should be set correctly.

        Loads the bulk fixture, transforms every entry, and checks the
        identifier and version fields of each resulting document.
        """
        # JSON text is UTF-8; name the encoding explicitly so the fixture is
        # decoded the same way regardless of the platform's locale default.
        # NOTE: the path is relative to the current working directory.
        with open("tests/data/docmeta_bulk.json", encoding="utf-8") as f:
            data = json.load(f)

        docmeta = [DocMeta(**datum) for datum in data]

        documents = [transform.to_search_document(meta) for meta in docmeta]
        for doc in documents:
            self.assertIsNotNone(doc["id"])
            self.assertGreater(len(doc["id"]), 0)
            self.assertIsNotNone(doc["paper_id"])
            self.assertGreater(len(doc["paper_id"]), 0)
            self.assertNotIn("v", doc["paper_id"])
            self.assertIsNotNone(doc["paper_id_v"])
            self.assertGreater(len(doc["paper_id_v"]), 0)
            self.assertIn("v", doc["paper_id_v"])
            self.assertIsNotNone(doc["version"])
            self.assertGreater(doc["version"], 0)

            # The test treats version 2 as the current version of each paper.
            if doc["version"] == 2:
                self.assertEqual(doc["latest"], f"{doc['paper_id']}v2")
                self.assertTrue(doc["is_current"])
            else:
                self.assertFalse(doc["is_current"])
            # Checked for every version, current or not.
            self.assertEqual(doc["id"], doc["paper_id_v"])
            self.assertEqual(doc["latest_version"], 2)
Beispiel #6
0
    def test_transform(self):
        """All of the paper ID and version fields should be set correctly.

        Loads the bulk fixture, transforms every entry, and checks the
        identifier and version fields of each resulting document.
        """
        # JSON text is UTF-8; name the encoding explicitly so the fixture is
        # decoded the same way regardless of the platform's locale default.
        # NOTE: the path is relative to the current working directory.
        with open('tests/data/docmeta_bulk.json', encoding='utf-8') as f:
            data = json.load(f)

        docmeta = [DocMeta(**datum) for datum in data]

        documents = [transform.to_search_document(meta) for meta in docmeta]
        for doc in documents:
            self.assertIsNotNone(doc.id)
            self.assertGreater(len(doc.id), 0)
            self.assertIsNotNone(doc.paper_id)
            self.assertGreater(len(doc.paper_id), 0)
            self.assertNotIn('v', doc.paper_id)
            self.assertIsNotNone(doc.paper_id_v)
            self.assertGreater(len(doc.paper_id_v), 0)
            self.assertIn('v', doc.paper_id_v)
            self.assertIsNotNone(doc.version)
            self.assertGreater(doc.version, 0)

            # The test treats version 2 as the current version of each paper.
            if doc.version == 2:
                self.assertEqual(doc.latest, f"{doc.paper_id}v2")
                self.assertTrue(doc.is_current)
            else:
                self.assertFalse(doc.is_current)
            # Checked for every version, current or not.
            self.assertEqual(doc.id, doc.paper_id_v)
            self.assertEqual(doc.latest_version, 2)
Beispiel #7
0
 def test_announced_date_first(self):
     """The ``announced_date_first`` value is carried over to the document."""
     docmeta = DocMeta(paper_id='1234.56789', announced_date_first='2007-04')
     document = transform.to_search_document(docmeta)
     self.assertEqual(document.announced_date_first, '2007-04')
Beispiel #8
0
 def test_metadata_id(self):
     """The ``metadata_id`` value is carried over to the document."""
     docmeta = DocMeta(paper_id='1234.56789', metadata_id='690776')
     document = transform.to_search_document(docmeta)
     self.assertEqual(document['metadata_id'], '690776')
Beispiel #9
0
 def test_proxy(self):
     """A true ``proxy`` flag on the metadata is truthy on the document."""
     docmeta = DocMeta(paper_id='1234.56789', proxy=True)
     document = transform.to_search_document(docmeta)
     self.assertTrue(document['proxy'])
Beispiel #10
0
 def test_announced_date_first(self):
     """The ``announced_date_first`` value is carried over to the document."""
     docmeta = DocMeta(paper_id="1234.56789", announced_date_first="2007-04")
     document = transform.to_search_document(docmeta)
     self.assertEqual(document["announced_date_first"], "2007-04")
Beispiel #11
0
 def test_msc_class(self):
     """A comma-separated ``msc_class`` string becomes a list of codes."""
     docmeta = DocMeta(paper_id="1234.56789", msc_class="03B70,68Q60")
     document = transform.to_search_document(docmeta)
     expected_codes = ["03B70", "68Q60"]
     self.assertEqual(document["msc_class"], expected_codes)
Beispiel #12
0
 def test_report_num(self):
     """The ``report_num`` value is carried over to the document."""
     report_number = "Physica A, 245 (1997) 181"
     docmeta = DocMeta(paper_id='1234.56789', report_num=report_number)
     document = transform.to_search_document(docmeta)
     self.assertEqual(document.report_num, report_number)
Beispiel #13
0
 def test_acm_class(self):
     """A ``"; "``-separated ``acm_class`` string becomes a list of codes."""
     docmeta = DocMeta(paper_id='1234.56789', acm_class="F.4.1; D.2.4")
     document = transform.to_search_document(docmeta)
     expected_codes = ["F.4.1", "D.2.4"]
     self.assertEqual(document.acm_class, expected_codes)
Beispiel #14
0
 def test_authors_freeform(self):
     """``authors_utf8`` on the metadata ends up in ``authors_freeform``."""
     docmeta = DocMeta(paper_id="1234.56789", authors_utf8="authors!")
     document = transform.to_search_document(docmeta)
     self.assertEqual(document["authors_freeform"], "authors!")
Beispiel #15
0
 def test_metadata_id2(self):
     """``comments_utf8`` on the metadata ends up in ``comments``."""
     # NOTE(review): despite the method name, this test exercises the
     # ``comments`` field, not ``metadata_id``.
     docmeta = DocMeta(paper_id="1234.56789", comments_utf8="comments!")
     document = transform.to_search_document(docmeta)
     self.assertEqual(document["comments"], "comments!")
Beispiel #16
0
 def test_doi(self):
     """A single ``doi`` string is expected as a one-element list."""
     identifier = "10.1103/PhysRevD.76.104043"
     docmeta = DocMeta(paper_id="1234.56789", doi=identifier)
     document = transform.to_search_document(docmeta)
     self.assertEqual(document["doi"], [identifier])
Beispiel #17
0
 def test_acm_class(self):
     """A ``"; "``-separated ``acm_class`` string becomes a list of codes."""
     docmeta = DocMeta(paper_id="1234.56789", acm_class="F.4.1; D.2.4")
     document = transform.to_search_document(docmeta)
     expected_codes = ["F.4.1", "D.2.4"]
     self.assertEqual(document["acm_class"], expected_codes)
Beispiel #18
0
 def test_abstract(self):
     """``abstract_utf8`` on the metadata ends up in ``abstract``."""
     docmeta = DocMeta(paper_id="1234.56789", abstract_utf8="abstract!")
     document = transform.to_search_document(docmeta)
     self.assertEqual(document["abstract"], "abstract!")
Beispiel #19
0
 def test_metadata_id(self):
     """``comments_utf8`` on the metadata ends up in ``comments``."""
     # NOTE(review): despite the method name, this test exercises the
     # ``comments`` field, not ``metadata_id``.
     docmeta = DocMeta(paper_id='1234.56789', comments_utf8='comments!')
     document = transform.to_search_document(docmeta)
     self.assertEqual(document.comments, 'comments!')
Beispiel #20
0
 def test_abstract(self):
     """``abstract_utf8`` on the metadata ends up in ``abstract``."""
     docmeta = DocMeta(paper_id='1234.56789', abstract_utf8='abstract!')
     document = transform.to_search_document(docmeta)
     self.assertEqual(document.abstract, 'abstract!')
Beispiel #21
0
 def test_title_utf8(self):
     """``title_utf8`` ends up in ``title``, non-ASCII characters intact."""
     docmeta = DocMeta(paper_id='1234.56789', title_utf8='foö title')
     document = transform.to_search_document(docmeta)
     self.assertEqual(document['title'], 'foö title')
Beispiel #22
0
 def test_msc_class(self):
     """A comma-separated ``msc_class`` string becomes a list of codes."""
     docmeta = DocMeta(paper_id='1234.56789', msc_class="03B70,68Q60")
     document = transform.to_search_document(docmeta)
     expected_codes = ["03B70", "68Q60"]
     self.assertEqual(document.msc_class, expected_codes)
Beispiel #23
0
 def test_report_num(self):
     """The ``report_num`` value is carried over to the document."""
     report_number = "Physica A, 245 (1997) 181"
     docmeta = DocMeta(paper_id="1234.56789", report_num=report_number)
     document = transform.to_search_document(docmeta)
     self.assertEqual(document["report_num"], report_number)
Beispiel #24
0
 def test_doi(self):
     """A single ``doi`` string is expected as a one-element list."""
     identifier = '10.1103/PhysRevD.76.104043'
     docmeta = DocMeta(paper_id='1234.56789', doi=identifier)
     document = transform.to_search_document(docmeta)
     self.assertEqual(document.doi, [identifier])
Beispiel #25
0
 def test_is_withdrawn(self):
     """A false ``is_withdrawn`` flag is falsy on the document."""
     docmeta = DocMeta(paper_id='1234.56789', is_withdrawn=False)
     document = transform.to_search_document(docmeta)
     self.assertFalse(document['is_withdrawn'])
Beispiel #26
0
 def test_version(self):
     """The ``version`` number is carried over to the document."""
     docmeta = DocMeta(paper_id='1234.56789', version=25)
     document = transform.to_search_document(docmeta)
     self.assertEqual(document['version'], 25)
Beispiel #27
0
 def test_updated_date(self):
     """The ``updated_date`` timestamp is carried over to the document."""
     timestamp = "2007-04-25T16:06:50-0400"
     docmeta = DocMeta(paper_id="1234.56789", updated_date=timestamp)
     document = transform.to_search_document(docmeta)
     self.assertEqual(document["updated_date"], timestamp)
Beispiel #28
0
 def test_authors_freeform(self):
     """``authors_utf8`` on the metadata ends up in ``authors_freeform``."""
     docmeta = DocMeta(paper_id='1234.56789', authors_utf8='authors!')
     document = transform.to_search_document(docmeta)
     self.assertEqual(document.authors_freeform, 'authors!')
Beispiel #29
0
 def test_submitted_date(self):
     """The ``submitted_date`` timestamp is carried over to the document."""
     timestamp = '2007-04-25T16:06:50-0400'
     docmeta = DocMeta(paper_id='1234.56789', submitted_date=timestamp)
     document = transform.to_search_document(docmeta)
     self.assertEqual(document.submitted_date, timestamp)
Beispiel #30
0
 def test_source(self):
     """The ``source`` mapping on the document equals the one supplied."""
     source_info = {"flags": "1", "format": "pdf", "size_bytes": 1230119}
     docmeta = DocMeta(paper_id='1234.56789', source=source_info)
     document = transform.to_search_document(docmeta)
     self.assertEqual(document['source'], source_info)