Esempio n. 1
0
    def test_license(self):
        """The ``license`` field is carried over, with a default fallback."""
        expected = {
            "label": "arXiv.org perpetual, non-exclusive license to"
            " distribute this article",
            "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"
        }
        # An explicitly supplied license is expected to come through as-is.
        explicit = transform.to_search_document(
            DocMeta(paper_id='1234.56789', license=expected)
        )
        self.assertEqual(explicit.license['uri'], expected['uri'])
        self.assertEqual(explicit.license['label'], expected['label'])

        # A license whose fields are all ``None`` is expected to be replaced
        # by ``transform.DEFAULT_LICENSE``.
        empty_license = {'uri': None, 'label': None}
        fallback = transform.to_search_document(
            DocMeta(paper_id='1234.56789', license=empty_license)
        )
        default_msg = "The default license should be used"
        self.assertEqual(fallback.license['uri'],
                         transform.DEFAULT_LICENSE['uri'], default_msg)
        self.assertEqual(fallback.license['label'],
                         transform.DEFAULT_LICENSE['label'], default_msg)
Esempio n. 2
0
    def test_license(self):
        """Check ``license`` pass-through and the default-license fallback."""
        supplied = {
            "label": "arXiv.org perpetual, non-exclusive license to"
            " distribute this article",
            "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/",
        }
        document = transform.to_search_document(
            DocMeta(paper_id="1234.56789", license=supplied)
        )
        # Both license fields should match what was supplied.
        for key in ("uri", "label"):
            self.assertEqual(document["license"][key], supplied[key])

        # With every license field set to ``None`` the default applies.
        blank = {"uri": None, "label": None}
        document = transform.to_search_document(
            DocMeta(paper_id="1234.56789", license=blank)
        )
        for key in ("uri", "label"):
            self.assertEqual(
                document["license"][key],
                transform.DEFAULT_LICENSE[key],
                "The default license should be used",
            )
Esempio n. 3
0
def from_cache(cache_dir: str, arxiv_id: str) -> List[DocMeta]:
    """
    Get the docmeta documents for a paper from a local cache, if available.

    The cache entry is looked up at ``<cache_dir>/<arxiv_id>.json``, with any
    ``/`` in the identifier replaced by ``_``. Each item of the JSON array in
    that file is passed as keyword arguments to :class:`.DocMeta`.

    Parameters
    ----------
    cache_dir : str
        Directory in which cached documents are stored.
    arxiv_id : str
        Identifier of the paper to look up.

    Returns
    -------
    list
        One :class:`.DocMeta` per item in the cached JSON array.

    Raises
    ------
    RuntimeError
        Raised when the cache is not available, or the document could not
        be found in the cache.

    """
    fname = '%s.json' % arxiv_id.replace('/', '_')
    cache_path = os.path.join(cache_dir, fname)
    if not os.path.exists(cache_path):
        raise RuntimeError('No cached document')

    try:
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(cache_path, encoding='utf-8') as f:
            data: List[dict] = json.load(f)
    except FileNotFoundError as e:
        # The file vanished between the existence check and the open; report
        # it the same way as any other cache miss.
        raise RuntimeError('No cached document') from e
    return [DocMeta(**datum) for datum in data]  # type: ignore
Esempio n. 4
0
def from_cache(cache_dir: str, arxiv_id: str) -> List[DocMeta]:
    """
    Get the docmeta documents for a paper from a local cache, if available.

    The cache entry is looked up at ``<cache_dir>/<arxiv_id>.json``, with any
    ``/`` in the identifier replaced by ``_``. Each item of the JSON array in
    that file is passed as keyword arguments to :class:`.DocMeta`.

    Parameters
    ----------
    cache_dir : str
        Directory in which cached documents are stored. A falsy value
        (e.g. ``""``) means that caching is disabled.
    arxiv_id : str
        Identifier of the paper to look up.

    Returns
    -------
    list
        One :class:`.DocMeta` per item in the cached JSON array, or an empty
        list if caching is disabled or the document is not in the cache.

    """
    if not cache_dir:
        return []  # caching is disabled

    fname = "%s.json" % arxiv_id.replace("/", "_")
    cache_path = os.path.join(cache_dir, fname)
    if not os.path.exists(cache_path):
        return []  # cache miss

    try:
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(cache_path, encoding="utf-8") as f:
            data: List[dict] = json.load(f)
    except FileNotFoundError:
        # The file vanished between the existence check and the open; treat
        # it like any other cache miss.
        return []
    # See https://github.com/python/mypy/issues/3937
    return [DocMeta(**datum) for datum in data]  # type: ignore
Esempio n. 5
0
    def bulk_retrieve(self, document_ids: List[str]) -> List[DocMeta]:
        """
        Retrieve metadata for one or more arXiv papers in a single request.

        Issues a GET request for ``/docmeta_bulk`` (resolved against
        ``self.endpoint``) with one ``id`` query parameter per document.

        Parameters
        ----------
        document_ids : List[str]
            Identifiers of the papers to retrieve. Must not be empty.

        Returns
        -------
        List[DocMeta]
            One :class:`.DocMeta` per item in the JSON array returned by the
            service.

        Raises
        ------
        ValueError
            If ``document_ids`` is empty.
        SecurityException
            If the request fails with an SSL error.
        ConnectionFailed
            If a connection to the metadata service cannot be established.
        RequestFailed
            If the service responds with a status other than 200 or 206.
        BadResponse
            If the response body cannot be decoded as JSON.
        """
        if not document_ids:    # This could use further elaboration.
            raise ValueError('Invalid value for document_ids')

        query_string = '/docmeta_bulk?' + '&'.join(
            f'id={document_id}' for document_id in document_ids
        )

        try:
            target = urljoin(self.endpoint, query_string)
            logger.debug(
                '%s: retrieve metadata from %s with SSL verify %s',
                document_ids, target, self._verify_cert
            )
            response = self._session.get(target, verify=self._verify_cert)
        except requests.exceptions.SSLError as e:
            # Must precede ConnectionError: SSLError is a subclass of it.
            logger.error('SSLError: %s', e)
            raise SecurityException('SSL failed: %s' % e) from e
        except requests.exceptions.ConnectionError as e:
            logger.error('ConnectionError: %s', e)
            raise ConnectionFailed(
                'Could not connect to metadata service: %s' % e
            ) from e

        if response.status_code not in \
                (status.HTTP_200_OK, status.HTTP_206_PARTIAL_CONTENT):
            logger.error('Request failed: %s', response.content)
            raise RequestFailed(
                '%s: failed with %i: %s' % (
                    document_ids, response.status_code, response.content
                )
            )
        logger.debug('%s: response OK', document_ids)
        try:
            resp = response.json()  # A list with metadata for each paper.
        except ValueError as e:
            # ``ValueError`` is the common base class of the JSON decoding
            # errors raised via ``requests`` (stdlib ``json`` as well as
            # ``simplejson``), so this also covers the non-stdlib backend.
            logger.error('JSONDecodeError: %s', e)
            raise BadResponse(
                '%s: could not decode response: %s' % (document_ids, e)
            ) from e
        # Built outside the ``try`` so that errors raised while constructing
        # DocMeta are not mistaken for a malformed response body.
        data: List[DocMeta]
        data = [DocMeta(**value) for value in resp]     # type: ignore
        logger.debug('%s: response decoded; done!', document_ids)
        return data
Esempio n. 6
0
    def test_transform(self):
        """Paper ID and version fields are set correctly on every document."""
        with open('tests/data/docmeta_bulk.json') as fixture:
            raw = json.load(fixture)

        metas = [DocMeta(**entry) for entry in raw]
        results = [transform.to_search_document(m) for m in metas]

        for result in results:
            # Identifier fields must be present and non-empty; only the
            # versioned identifier may contain a "v".
            self.assertIsNotNone(result.id)
            self.assertGreater(len(result.id), 0)
            self.assertIsNotNone(result.paper_id)
            self.assertGreater(len(result.paper_id), 0)
            self.assertNotIn('v', result.paper_id)
            self.assertIsNotNone(result.paper_id_v)
            self.assertGreater(len(result.paper_id_v), 0)
            self.assertIn('v', result.paper_id_v)
            self.assertIsNotNone(result.version)
            self.assertGreater(result.version, 0)

            # The fixture's latest version is 2; only that one is current.
            if result.version == 2:
                self.assertEqual(result.latest, f"{result.paper_id}v2")
                self.assertTrue(result.is_current)
            else:
                self.assertFalse(result.is_current)
            self.assertEqual(result.id, result.paper_id_v)
            self.assertEqual(result.latest_version, 2)
Esempio n. 7
0
    def test_transform(self):
        """Every transformed document has consistent ID and version fields."""
        with open("tests/data/docmeta_bulk.json") as handle:
            records = json.load(handle)

        metas = [DocMeta(**record) for record in records]
        transformed = [transform.to_search_document(m) for m in metas]

        for item in transformed:
            # Identifier fields must be present and non-empty.
            self.assertIsNotNone(item["id"])
            self.assertGreater(len(item["id"]), 0)
            self.assertIsNotNone(item["paper_id"])
            self.assertGreater(len(item["paper_id"]), 0)
            self.assertNotIn("v", item["paper_id"])
            self.assertIsNotNone(item["paper_id_v"])
            self.assertGreater(len(item["paper_id_v"]), 0)
            self.assertIn("v", item["paper_id_v"])
            self.assertIsNotNone(item["version"])
            self.assertGreater(item["version"], 0)

            # Only version 2 (the latest in the fixture) is marked current.
            if item["version"] != 2:
                self.assertFalse(item["is_current"])
                self.assertEqual(item["id"], item["paper_id_v"])
            else:
                self.assertEqual(item["latest"], f"{item['paper_id']}v2")
                self.assertTrue(item["is_current"])
                self.assertEqual(item["id"], item["paper_id_v"])
            self.assertEqual(item["latest_version"], 2)
Esempio n. 8
0
 def test_submitted_date_all(self):
     """``submitted_date_all`` is copied and yields the first/latest dates."""
     first = "2007-04-25T15:58:28-0400"
     latest = "2007-04-25T16:06:50-0400"
     source = DocMeta(
         paper_id="1234.56789",
         submitted_date_all=[first, latest],
         is_current=True,
     )
     result = transform.to_search_document(source)

     # The list itself is carried over in order.
     self.assertEqual(result["submitted_date_all"][0], first)
     self.assertEqual(result["submitted_date_all"][1], latest)

     # The first and latest dates are taken from the ends of that list.
     derived_msg = "Should be populated from submitted_date_all"
     self.assertEqual(result["submitted_date_first"], first, derived_msg)
     self.assertEqual(result["submitted_date_latest"], latest, derived_msg)
    def test_paper_has_one_version(self, mock_meta, mock_tx, mock_idx,
                                   mock_client_factory):
        """A paper with a single version is indexed as one document."""
        # Stub out the client factory so that no real client is created.
        client = mock.MagicMock()
        waiter = mock.MagicMock()
        client.get_waiter.return_value = waiter
        mock_client_factory.return_value = client
        processor = consumer.MetadataRecordProcessor(*self.args)

        shared_fields = {
            "version": 1,
            "paper_id": "1234.56789",
            "title": "foo",
        }

        # The metadata service returns the same single-version record for
        # both individual and bulk retrieval.
        docmeta = DocMeta(
            **shared_fields,
            submitted_date="2001-03-02T03:04:05-400",
        )
        mock_meta.retrieve.return_value = docmeta
        mock_meta.bulk_retrieve.return_value = [docmeta]

        document = Document(
            **shared_fields,
            submitted_date=["2001-03-02T03:04:05-400"],
        )
        mock_tx.to_search_document.return_value = document

        processor.index_paper("1234.56789")

        mock_idx.bulk_add_documents.assert_called_once_with([document])
Esempio n. 10
0
 def test_report_num(self):
     """``report_num`` on the metadata is carried over to the document."""
     expected = "Physica A, 245 (1997) 181"
     document = transform.to_search_document(
         DocMeta(paper_id="1234.56789", report_num=expected)
     )
     self.assertEqual(document["report_num"], expected)
Esempio n. 11
0
 def test_acm_class(self):
     """``acm_class`` is split into a list of individual classes."""
     source = DocMeta(paper_id="1234.56789", acm_class="F.4.1; D.2.4")
     document = transform.to_search_document(source)
     expected = ["F.4.1", "D.2.4"]
     self.assertEqual(document["acm_class"], expected)
Esempio n. 12
0
 def test_submitted_date(self):
     """``submitted_date`` on the metadata is carried over unchanged."""
     stamp = '2007-04-25T16:06:50-0400'
     result = transform.to_search_document(
         DocMeta(paper_id='1234.56789', submitted_date=stamp)
     )
     self.assertEqual(result.submitted_date, stamp)
Esempio n. 13
0
 def test_doi(self):
     """A single ``doi`` string ends up as a one-element list."""
     doi = '10.1103/PhysRevD.76.104043'
     result = transform.to_search_document(
         DocMeta(paper_id='1234.56789', doi=doi)
     )
     self.assertEqual(result.doi, [doi])
Esempio n. 14
0
 def test_msc_class(self):
     """``msc_class`` is split into a list of individual classes."""
     source = DocMeta(paper_id='1234.56789', msc_class="03B70,68Q60")
     result = transform.to_search_document(source)
     expected = ["03B70", "68Q60"]
     self.assertEqual(result.msc_class, expected)
Esempio n. 15
0
 def test_authors_freeform(self):
     """``authors_utf8`` on the metadata becomes ``authors_freeform``."""
     authors = "authors!"
     document = transform.to_search_document(
         DocMeta(paper_id="1234.56789", authors_utf8=authors)
     )
     self.assertEqual(document["authors_freeform"], authors)
Esempio n. 16
0
 def test_metadata_id2(self):
     """``comments_utf8`` on the metadata becomes ``comments``."""
     # NOTE(review): the method name mentions ``metadata_id`` but the body
     # checks ``comments`` -- confirm whether a rename is wanted.
     text = "comments!"
     document = transform.to_search_document(
         DocMeta(paper_id="1234.56789", comments_utf8=text)
     )
     self.assertEqual(document["comments"], text)
Esempio n. 17
0
 def test_doi(self):
     """A single ``doi`` string ends up as a one-element list."""
     source = DocMeta(paper_id="1234.56789", doi="10.1103/PhysRevD.76.104043")
     document = transform.to_search_document(source)
     expected = ["10.1103/PhysRevD.76.104043"]
     self.assertEqual(document["doi"], expected)
Esempio n. 18
0
 def test_abstract(self):
     """``abstract_utf8`` on the metadata becomes ``abstract``."""
     text = 'abstract!'
     result = transform.to_search_document(
         DocMeta(paper_id='1234.56789', abstract_utf8=text)
     )
     self.assertEqual(result.abstract, text)
Esempio n. 19
0
 def test_report_num(self):
     """``report_num`` on the metadata is carried over to the document."""
     source = DocMeta(
         paper_id='1234.56789',
         report_num="Physica A, 245 (1997) 181",
     )
     result = transform.to_search_document(source)
     self.assertEqual(result.report_num, "Physica A, 245 (1997) 181")
Esempio n. 20
0
 def test_is_withdrawn(self):
     """A false ``is_withdrawn`` flag stays false on the document."""
     source = DocMeta(paper_id='1234.56789', is_withdrawn=False)
     document = transform.to_search_document(source)
     self.assertFalse(document['is_withdrawn'])
Esempio n. 21
0
 def test_acm_class(self):
     """``acm_class`` is split into a list of individual classes."""
     raw_classes = "F.4.1; D.2.4"
     result = transform.to_search_document(
         DocMeta(paper_id='1234.56789', acm_class=raw_classes)
     )
     self.assertEqual(result.acm_class, ["F.4.1", "D.2.4"])
Esempio n. 22
0
 def test_title_utf8(self):
     """``title_utf8`` becomes ``title``, non-ASCII characters included."""
     title = 'foö title'
     document = transform.to_search_document(
         DocMeta(paper_id='1234.56789', title_utf8=title)
     )
     self.assertEqual(document['title'], title)
Esempio n. 23
0
 def test_metadata_id(self):
     """``comments_utf8`` on the metadata becomes ``comments``."""
     # NOTE(review): the method name mentions ``metadata_id`` but the body
     # checks ``comments`` -- confirm whether a rename is wanted.
     source = DocMeta(paper_id='1234.56789', comments_utf8='comments!')
     result = transform.to_search_document(source)
     self.assertEqual(result.comments, 'comments!')
Esempio n. 24
0
 def test_version(self):
     """``version`` on the metadata is carried over to the document."""
     expected_version = 25
     document = transform.to_search_document(
         DocMeta(paper_id='1234.56789', version=expected_version)
     )
     self.assertEqual(document['version'], expected_version)
Esempio n. 25
0
 def test_authors_freeform(self):
     """``authors_utf8`` on the metadata becomes ``authors_freeform``."""
     source = DocMeta(paper_id='1234.56789', authors_utf8='authors!')
     result = transform.to_search_document(source)
     self.assertEqual(result.authors_freeform, 'authors!')
Esempio n. 26
0
 def test_proxy(self):
     """A true ``proxy`` flag stays true on the document."""
     source = DocMeta(paper_id='1234.56789', proxy=True)
     document = transform.to_search_document(source)
     self.assertTrue(document['proxy'])
Esempio n. 27
0
 def test_abstract(self):
     """``abstract_utf8`` on the metadata becomes ``abstract``."""
     source = DocMeta(paper_id="1234.56789", abstract_utf8="abstract!")
     document = transform.to_search_document(source)
     self.assertEqual(document["abstract"], "abstract!")
Esempio n. 28
0
 def test_announced_date_first(self):
     """``announced_date_first`` is carried over to the document."""
     announced = '2007-04'
     result = transform.to_search_document(
         DocMeta(paper_id='1234.56789', announced_date_first=announced)
     )
     self.assertEqual(result.announced_date_first, announced)
Esempio n. 29
0
 def test_metadata_id(self):
     """``metadata_id`` on the metadata is carried over to the document."""
     expected_id = '690776'
     document = transform.to_search_document(
         DocMeta(paper_id='1234.56789', metadata_id=expected_id)
     )
     self.assertEqual(document['metadata_id'], expected_id)
Esempio n. 30
0
 def test_msc_class(self):
     """``msc_class`` is split into a list of individual classes."""
     raw_classes = "03B70,68Q60"
     document = transform.to_search_document(
         DocMeta(paper_id="1234.56789", msc_class=raw_classes)
     )
     self.assertEqual(document["msc_class"], ["03B70", "68Q60"])