Example #1
 def _transform_latest(cls, document: Document) -> Optional[dict]:
     latest = document.get('latest')
     if latest is None:
         return None
     return {
         "paper_id": latest,
         "href": url_for("api.paper", paper_id=document['paper_id'],
                         version=document.get('latest_version'),
                         _external=True),
         "canonical": url_for("abs", paper_id=document['paper_id'],
                              version=document.get('latest_version')),
         "version": document.get('latest_version')
     }
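
The transformer above assumes an active Flask application or request context for ``url_for``. A minimal sketch of that requirement, with a placeholder route standing in for the project's real ``abs`` endpoint:

from flask import Flask, url_for

app = Flask(__name__)
# Placeholder rule; the real application registers "abs" and "api.paper"
# on its own blueprints.
app.add_url_rule(
    "/abs/<paper_id>v<version>",
    endpoint="abs",
    view_func=lambda paper_id, version: "",
)

with app.test_request_context():
    # url_for only resolves inside an application or request context.
    print(url_for("abs", paper_id="1234.56789", version=2))  # /abs/1234.56789v2
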
    def test_paper_has_one_version(self, mock_meta, mock_tx, mock_idx,
                                   mock_client_factory):
        """The arXiv paper has only one version."""
        mock_client = mock.MagicMock()
        mock_waiter = mock.MagicMock()
        mock_client.get_waiter.return_value = mock_waiter
        mock_client_factory.return_value = mock_client
        processor = consumer.MetadataRecordProcessor(*self.args)

        mock_docmeta = DocMeta(
            version=1,
            paper_id="1234.56789",
            title="foo",
            submitted_date="2001-03-02T03:04:05-400",
        )
        mock_meta.retrieve.return_value = mock_docmeta
        mock_meta.bulk_retrieve.return_value = [mock_docmeta]

        mock_doc = Document(
            version=1,
            paper_id="1234.56789",
            title="foo",
            submitted_date=["2001-03-02T03:04:05-400"],
        )
        mock_tx.to_search_document.return_value = mock_doc

        processor.index_paper("1234.56789")

        mock_idx.bulk_add_documents.assert_called_once_with([mock_doc])
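
These test excerpts receive their mocks from stacked ``mock.patch`` decorators that are not shown. A minimal sketch of that wiring, using a stand-in patch target rather than the project's real module paths:

import os
import unittest
from unittest import mock


class ExampleMockWiring(unittest.TestCase):
    """Stacked ``mock.patch`` decorators inject mocks bottom-up as args."""

    # The target below is a stand-in; the real tests patch the consumer
    # module's metadata, transform, and index collaborators.
    @mock.patch("os.getcwd")
    def test_injection(self, mock_getcwd):
        mock_getcwd.return_value = "/tmp"
        self.assertEqual(os.getcwd(), "/tmp")
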
Example #3
 def test_index_returns_result(self, mock_index):
     """Test returns 'OK' + status 200 when index returns results."""
     mock_index.search.return_value = DocumentSet({}, [Document()])
     response, status_code, _ = health_check()
     self.assertEqual(response, 'OK', "Response content should be OK")
     self.assertEqual(status_code, status.HTTP_200_OK,
                      "Should return 200 status code.")
Example #4
    def transform_document(cls, doc: Document,
                           query: Optional[APIQuery] = None) -> dict:
        """Select a subset of :class:`Document` properties for public API."""
        # Only return fields that have been explicitly requested.
        data = {key: value for key, value in doc.items()
                if query is None or key in query.include_fields}
        paper_id = doc['paper_id']
        version = doc['version']
        if 'submitted_date_first' in data:
            data['submitted_date_first'] = \
                doc['submitted_date_first'].isoformat()
        if 'announced_date_first' in data:
            data['announced_date_first'] = \
                doc['announced_date_first'].isoformat()
        if 'formats' in data:
            data['formats'] = [cls._transform_format(fmt, paper_id, version)
                               for fmt in doc['formats']]
        if 'license' in data:
            data['license'] = cls._transform_license(doc['license'])
        if 'latest' in data:
            data['latest'] = cls._transform_latest(doc)

        data['href'] = url_for("api.paper", paper_id=paper_id,
                               version=version, _external=True)
        data['canonical'] = url_for("abs", paper_id=paper_id,
                                    version=version)
        return data
Example #5
    def transform_document(
        self, doc: Document, query: Optional[APIQuery] = None
    ) -> Dict[str, Any]:
        """Select a subset of :class:`Document` properties for public API."""
        # Only return fields that have been explicitly requested.
        data = {
            key: value
            for key, value in doc.items()
            if query is None or key in query.include_fields
        }
        paper_id = doc["paper_id"]
        version = doc["version"]
        if "submitted_date_first" in data:
            data["submitted_date_first"] = doc[
                "submitted_date_first"
            ].isoformat()
        if "announced_date_first" in data:
            data["announced_date_first"] = doc[
                "announced_date_first"
            ].isoformat()
        if "formats" in data:
            data["formats"] = [
                self._transform_format(fmt, paper_id, version)
                for fmt in doc["formats"]
            ]
        if "license" in data:
            data["license"] = self._transform_license(doc["license"])
        if "latest" in data:
            data["latest"] = self._transform_latest(doc)

        data["href"] = url_for(
            "api.paper", paper_id=paper_id, version=version, _external=True
        )
        data["canonical"] = url_for("abs", paper_id=paper_id, version=version)
        return data
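
A minimal sketch of the field-filtering step above in isolation, with a plain dict standing in for ``Document`` and a simple object standing in for ``APIQuery``:

from types import SimpleNamespace

doc = {"paper_id": "1234.56789", "version": 2, "title": "foo", "abstract": "..."}
query = SimpleNamespace(include_fields={"paper_id", "title"})  # stand-in for APIQuery

data = {
    key: value
    for key, value in doc.items()
    if query is None or key in query.include_fields
}
assert data == {"paper_id": "1234.56789", "title": "foo"}
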
Example #6
    def get_document(self, document_id: int) -> Document:
        """
        Retrieve a document from the index by ID.

        Uses ``metadata_id`` as the primary identifier for the document.

        Parameters
        ----------
        document_id : int
            Value of ``metadata_id`` in the original document.

        Returns
        -------
        :class:`.Document`

        Raises
        ------
        IndexConnectionError
            Problem communicating with the search index.
        QueryError
            Invalid query parameters.
        DocumentNotFound
            No document exists with the provided ID.

        """
        with handle_es_exceptions():
            record = self.es.get(index=self.index,
                                 doc_type=self.doc_type,
                                 id=document_id)

        if not record:
            logger.error("No such document: %s", document_id)
            raise DocumentNotFound('No such document')
        return Document(**record['_source'])  # type: ignore
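
A minimal sketch of the response handling above; the dict mimics the shape of an elasticsearch-py ``get`` response, and a plain dict stands in for the ``Document`` class, which is not defined in this excerpt:

record = {
    "_index": "arxiv",
    "_id": "12345",
    "found": True,
    "_source": {"paper_id": "1234.56789", "version": 1, "title": "foo"},
}
if not record:
    raise LookupError("No such document")  # the method above raises DocumentNotFound
document = dict(**record["_source"])       # the method above builds Document(**record['_source'])
assert document["paper_id"] == "1234.56789"
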
Example #7
 def _transform_latest(document: Document) -> Optional[Dict[str, str]]:
     latest = document.get("latest")
     if latest is None:
         return None
     return {  # type:ignore
         "paper_id": latest,
         "href": url_for(
             "api.paper",
             paper_id=document["paper_id"],
             version=document.get("latest_version"),
             _external=True,
         ),
         "canonical": url_for(
             "abs",
             paper_id=document["paper_id"],
             version=document.get("latest_version"),
         ),
         "version": document.get("latest_version"),
     }
    def test_index_raises_unhandled_error(self, mock_index,
                                          mock_client_factory):
        """The index raises an unhandled exception."""
        mock_client = mock.MagicMock()
        mock_waiter = mock.MagicMock()
        mock_client.get_waiter.return_value = mock_waiter
        mock_client_factory.return_value = mock_client
        processor = consumer.MetadataRecordProcessor(*self.args)

        mock_index.bulk_add_documents.side_effect = RuntimeError
        with self.assertRaises(consumer.IndexingFailed):
            processor._bulk_add_to_index([Document()])
    def test_index_raises_index_connection_error(self, mock_index,
                                                 mock_client_factory):
        """The index raises :class:`.index.IndexConnectionError`."""
        mock_client = mock.MagicMock()
        mock_waiter = mock.MagicMock()
        mock_client.get_waiter.return_value = mock_waiter
        mock_client_factory.return_value = mock_client
        processor = consumer.MetadataRecordProcessor(*self.args)

        mock_index.bulk_add_documents.side_effect = index.IndexConnectionError
        with self.assertRaises(consumer.IndexingFailed):
            processor._bulk_add_to_index([Document()])
 def test_add_document_succeeds(self, mock_index, mock_client_factory):
     """The search document is added successfully."""
     mock_client = mock.MagicMock()
     mock_waiter = mock.MagicMock()
     mock_client.get_waiter.return_value = mock_waiter
     mock_client_factory.return_value = mock_client
     processor = consumer.MetadataRecordProcessor(*self.args)
     try:
         processor._add_to_index(Document())
     except Exception as ex:
         self.fail(ex)
     mock_index.add_document.assert_called_once()
Example #11
def to_document(raw: Union[Hit, dict], highlight: bool = True) -> Document:
    """Transform an ES search result back into a :class:`.Document`."""
    result: Dict[str, Any] = {}

    result['match'] = {}  # Hit on field, but no highlighting.
    result['truncated'] = {}  # Preview is truncated.

    for key in Document.fields():
        if type(raw) is Hit:
            if not hasattr(raw, key):
                continue
            value = getattr(raw, key)

        elif type(raw) is dict:
            if key not in raw:
                continue
            value = raw.get(key)
        else:
            continue

        # We want to prevent ES-specific data types from escaping the module
        # API.
        if isinstance(value, AttrList):
            value = value._l_
        elif isinstance(value, AttrDict):
            value = value.to_dict()

        if key == 'primary_classification':
            value = Classification(**value)  # type: ignore
        elif key == 'secondary_classification':
            value = [Classification(**v) for v in value]  # type: ignore
        elif key in ['authors', 'owners']:
            value = [_to_author(au) for au in value]
        elif key == 'submitter':
            value = _to_author(value)

        elif key == 'announced_date_first' and \
                value and isinstance(value, str):
            value = datetime.strptime(value, '%Y-%m').date()
        elif key in [
                'submitted_date', 'submitted_date_first',
                'submitted_date_latest'
        ]:
            try:
                value = datetime.strptime(value, '%Y-%m-%dT%H:%M:%S%z')
            except (ValueError, TypeError):
                logger.warning('Could not parse %s: %s as datetime', key, value)
        elif key in ['acm_class', 'msc_class'] and value:
            value = '; '.join(value)

        result[key] = value

    if type(raw) is Response:
        result['score'] = raw.meta.score  # type: ignore

    if type(result.get('abstract')) is str and highlight:
        if 'preview' not in result:
            result['preview'] = {}
        result['preview']['abstract'] = preview(result['abstract'])
        if result['preview']['abstract'].endswith('…'):
            result['truncated']['abstract'] = True

    if highlight and type(raw) in [Response, Hit]:
        result['highlight'] = {}
        logger.debug('%s: add highlighting to result',
                     raw.paper_id)  # type: ignore
        result = add_highlighting(result, raw)

    return Document(**result)  # type: ignore
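
A minimal sketch of the two date-parsing branches above, showing the formats they expect: ``announced_date_first`` arrives as "YYYY-MM", while the ``submitted_date*`` fields are ISO-8601 strings whose numeric UTC offset ``%z`` accepts.

from datetime import datetime

announced = datetime.strptime("2007-03", "%Y-%m").date()
submitted = datetime.strptime("2001-03-02T03:04:05-0400", "%Y-%m-%dT%H:%M:%S%z")
assert announced.month == 3 and submitted.utcoffset() is not None
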
    def test_paper_has_three_versions(self, mock_meta, mock_tx, mock_idx,
                                      mock_client_factory):
        """The arXiv paper has three versions."""
        mock_client = mock.MagicMock()
        mock_waiter = mock.MagicMock()
        mock_client.get_waiter.return_value = mock_waiter
        mock_client_factory.return_value = mock_client
        processor = consumer.MetadataRecordProcessor(*self.args)

        mock_dm_1 = DocMeta(
            version=1,
            paper_id="1234.56789",
            title="foo",
            submitted_date="2001-03-02T03:04:05-400",
        )
        mock_dm_2 = DocMeta(
            version=2,
            paper_id="1234.56789",
            title="foo",
            submitted_date="2001-03-03T03:04:05-400",
        )
        mock_dm_3 = DocMeta(
            version=3,
            paper_id="1234.56789",
            title="foo",
            submitted_date="2001-03-04T03:04:05-400",
        )
        mock_meta.retrieve.side_effect = [mock_dm_3, mock_dm_1, mock_dm_2]

        mock_meta.bulk_retrieve.return_value = [
            mock_dm_3,
            mock_dm_1,
            mock_dm_2,
            mock_dm_3,
        ]

        mock_doc_1 = Document(
            version=1,
            paper_id="1234.56789",
            title="foo",
            submitted_date=["2001-03-02T03:04:05-400"],
            submitted_date_all=["2001-03-02T03:04:05-400"],
        )
        mock_doc_2 = Document(
            version=2,
            paper_id="1234.56789",
            title="foo",
            submitted_date=["2001-03-03T03:04:05-400"],
            submitted_date_all=[
                "2001-03-02T03:04:05-400",
                "2001-03-03T03:04:05-400",
            ],
        )
        mock_doc_3 = Document(
            version=3,
            paper_id="1234.56789",
            title="foo",
            submitted_date=["2001-03-04T03:04:05-400"],
            submitted_date_all=[
                "2001-03-02T03:04:05-400",
                "2001-03-03T03:04:05-400",
                "2001-03-04T03:04:05-400",
            ],
        )
        mock_tx.to_search_document.side_effect = [
            mock_doc_3,
            mock_doc_1,
            mock_doc_2,
            mock_doc_3,
        ]
        processor.index_paper("1234.56789")
        self.assertEqual(
            mock_meta.bulk_retrieve.call_count,
            1,
            "Metadata should be retrieved for all versions with a"
            " single bulk_retrieve call",
        )
        self.assertEqual(
            mock_meta.retrieve.call_count,
            0,
            "Metadata should not be retrieved separately for each"
            " individual version",
        )

        mock_idx.bulk_add_documents.assert_called_once_with(
            [mock_doc_3, mock_doc_1, mock_doc_2, mock_doc_3])
Example #13
    def transform_document(
        cls,
        fg: FeedGenerator,
        doc: Document,
        query: Optional[ClassicAPIQuery] = None,
    ) -> None:
        """Select a subset of :class:`Document` properties for public API."""
        entry = fg.add_entry()
        entry.id(
            url_for(
                "abs",
                paper_id=doc["paper_id"],
                version=doc["version"],
                _external=True,
            ))
        entry.title(doc["title"])
        entry.summary(doc["abstract"])
        entry.published(to_utc(doc["submitted_date"]))
        entry.updated(to_utc(doc["updated_date"]))
        entry.link({
            "href": url_for(
                "abs",
                paper_id=doc["paper_id"],
                version=doc["version"],
                _external=True,
            ),
            "type": "text/html",
        })

        entry.link({
            "href": url_for(
                "pdf",
                paper_id=doc["paper_id"],
                version=doc["version"],
                _external=True,
            ),
            "type": "application/pdf",
            "rel": "related",
            "title": "pdf",
        })

        if doc.get("comments"):
            entry.arxiv.comment(doc["comments"])

        if doc.get("journal_ref"):
            entry.arxiv.journal_ref(doc["journal_ref"])

        if doc.get("doi"):
            entry.arxiv.doi(doc["doi"])

        if doc["primary_classification"]["category"] is not None:
            entry.arxiv.primary_category(
                doc["primary_classification"]["category"]["id"])
            entry.category(
                term=doc["primary_classification"]["category"]["id"],
                scheme=ARXIV_NS,
            )

        for category in doc["secondary_classification"]:
            entry.category(term=category["category"]["id"], scheme=ARXIV_NS)

        for author in doc["authors"]:
            author_data: Dict[str, Any] = {"name": author["full_name"]}
            if author.get("affiliation"):
                author_data["affiliation"] = author["affiliation"]
            entry.arxiv.author(author_data)
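
A minimal sketch of the ``feedgen`` calls used above, limited to standard Atom fields; the ``entry.arxiv.*`` calls in the method rely on a project-specific feedgen extension that is not reproduced here, and the URLs below are placeholders.

from datetime import datetime, timezone

from feedgen.feed import FeedGenerator

fg = FeedGenerator()
fg.id("https://example.org/api/query")
fg.title("Example query results")
fg.updated(datetime(2001, 3, 2, 3, 4, 5, tzinfo=timezone.utc))

entry = fg.add_entry()
entry.id("https://example.org/abs/1234.56789v1")
entry.title("foo")
entry.summary("An example abstract.")
entry.published(datetime(2001, 3, 2, 3, 4, 5, tzinfo=timezone.utc))
entry.updated(datetime(2001, 3, 2, 3, 4, 5, tzinfo=timezone.utc))
entry.link({"href": "https://example.org/abs/1234.56789v1", "type": "text/html"})

print(fg.atom_str(pretty=True).decode("utf-8"))
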