Ejemplo n.º 1
0
    def test_identifier_fields(self) -> None:
        """Check each derived field on new-style and old-style Identifiers."""
        new_id = Identifier(arxiv_id='0803.1924')
        self.assertIsInstance(new_id, Identifier, 'valid instance')
        self.assertIsNotNone(new_id.id, 'id is not None')
        self.assertIs(new_id.is_old_id, False, 'id is new type')
        for attr, expected, msg in (
                ('archive', 'arxiv', 'archive is arxiv for new ID'),
                ('yymm', '0803', 'yymm matches'),
                ('year', 2008, 'year matches'),
                ('month', 3, 'month matches'),
                ('num', 1924, 'numerical id matches'),
                ('id', '0803.1924', 'id matches'),
                ('ids', '0803.1924', 'id specified matches'),
                ('filename', '0803.1924', 'filename matches'),
                ('squashed', '0803.1924', 'squashed id matches'),
                ('squashedv', '0803.1924', 'squashed idv matches')):
            self.assertEqual(getattr(new_id, attr), expected, msg)
        # new_id_next = new_id.next_id()
        # self.assertIsInstance(new_id_next, Identifier)
        # self.assertEqual(new_id_next.id, '0803.1925', 'next id matches')

        old_id = Identifier(arxiv_id='hep-th/0701051v4')
        self.assertIsInstance(old_id, Identifier, 'valid instance')
        self.assertIsNotNone(old_id.id, 'id is not None')
        self.assertIs(old_id.is_old_id, True, 'id is old type')
        for attr, expected, msg in (
                ('archive', 'hep-th', 'archive is hep-th for old ID'),
                ('yymm', '0701', 'yymm matches'),
                ('year', 2007, 'year matches'),
                ('month', 1, 'month matches'),
                ('num', 51, 'numerical id matches'),
                ('id', 'hep-th/0701051', 'id matches'),
                ('ids', 'hep-th/0701051v4', 'id specified matches'),
                ('filename', '0701051', 'filename matches'),
                ('squashed', 'hep-th0701051', 'squashed id matches'),
                ('squashedv', 'hep-th0701051v4', 'squashed idv matches')):
            self.assertEqual(getattr(old_id, attr), expected, msg)
        # old_id_next = old_id.next_id()
        # self.assertIsInstance(old_id_next, Identifier)
        # self.assertEqual(old_id_next.id, 'hep-th/0701052', 'next id matches')

        low_num_id = Identifier(arxiv_id='1201.0001')
        self.assertIsInstance(low_num_id, Identifier, 'valid instance')
        self.assertIsNotNone(low_num_id.id, 'id is not None')
        self.assertIs(low_num_id.is_old_id, False, 'id is new type')
        for attr, expected, msg in (
                ('yymm', '1201', 'yymm matches'),
                ('year', 2012, 'year matches'),
                ('month', 1, 'month matches'),
                ('num', 1, 'numerical id matches'),
                ('id', '1201.0001', 'id matches'),
                ('ids', '1201.0001', 'id specified matches'),
                ('filename', '1201.0001', 'filename matches'),
                ('squashed', '1201.0001', 'squashed id matches'),
                ('squashedv', '1201.0001', 'squashed idv matches')):
            self.assertEqual(getattr(low_num_id, attr), expected, msg)
Ejemplo n.º 2
0
    def get_previous_id(self, identifier: Identifier) -> Optional[Identifier]:
        """
        Get the previous identifier in sequence if it exists in the repository.

        Under certain conditions this is called to generate the "previous" link
        in the "browse context" portion of the abs page rendering.
        These conditions are dependent on the identifier and context; it
        emulates legacy functionality. It is recommended to deprecate
        this function once the /prevnext route is fixed (or replaced) to
        handle old identifiers correctly.

        Parameters
        ----------
        identifier : :class:`Identifier`

        Returns
        -------
        :class:`Identifier`
            The previous identifier in sequence that exists in the repository.

        """
        previous_id = self._previous_id(identifier)
        if not previous_id:
            return None

        # Same year and month: the simple decrement is assumed valid.
        if identifier.year == previous_id.year \
           and identifier.month == previous_id.month:
            return previous_id

        # Month boundary crossed: the last paper number of the previous
        # month is unknown, so scan the on-disk .abs files for that month.
        path = self._get_parent_path(previous_id)
        if not os.path.exists(path):
            return None

        # NOTE: only the first triple yielded by os.walk is ever inspected;
        # every branch in the loop body returns.
        for _, _, file_list in os.walk(path):
            abs_files = [f[:-4] for f in file_list if f.endswith('.abs')]
            if not abs_files:
                return None
            # Presumably identifiers are zero-padded to a fixed width, so
            # the lexicographic max is the numeric max — TODO confirm.
            max_id = max(abs_files)
            try:
                if previous_id.is_old_id:
                    # Old-style filenames omit the archive; re-attach it.
                    short_id = Identifier(
                        arxiv_id=f'{previous_id.archive}/{max_id}')
                else:
                    short_id = Identifier(arxiv_id=max_id)
                return short_id

            except IdentifierException:
                return None

        return None
Ejemplo n.º 3
0
 def test_sequential_id(self) -> None:
     """Check next/previous sequential ids, including month rollover."""
     # A non-Identifier (empty string) yields no sequential id.
     self.assertEqual(get_sequential_id(''), None)
     next_id = get_sequential_id(Identifier('0906.3421'), is_next=True)
     self.assertEqual(next_id, '0906.4150')
     rolled_forward = get_sequential_id(Identifier('0906.9150'),
                                        is_next=True)
     self.assertTrue(rolled_forward.startswith('0907'))
     prev_id = get_sequential_id(Identifier('0906.3421'), is_next=False)
     self.assertEqual(prev_id, '0906.3336')
     rolled_back = get_sequential_id(Identifier('0907.2020'),
                                     is_next=False)
     self.assertTrue(rolled_back.startswith('0906'))
Ejemplo n.º 4
0
    def _next_yymm_id(self, identifier: Identifier) -> Optional[Identifier]:
        """Get the first identifier for the next month.

        Returns None when the identifier lacks year/month data or when
        the constructed id fails Identifier validation.
        """
        if identifier.year is None or identifier.month is None:
            return None
        year = identifier.year
        month = identifier.month + 1
        if month > 12:
            # December rolls over to January of the next year.
            month = 1
            year += 1
        # Paper number 1 is the first id of any month.
        if identifier.is_old_id:
            candidate = '{}/{:02d}{:02d}{:03d}'.format(
                identifier.archive, year % 100, month, 1)
        elif year >= 2015:
            # New-style ids switched to 5-digit numbers in 2015.
            candidate = '{:02d}{:02d}.{:05d}'.format(year % 100, month, 1)
        else:
            candidate = '{:02d}{:02d}.{:04d}'.format(year % 100, month, 1)
        try:
            return Identifier(arxiv_id=candidate)
        except IdentifierException:
            return None
Ejemplo n.º 5
0
def get_prevnext(id: str, function: str, context: str) -> Response:
    """
    Get the next or previous arXiv ID in the browse context.

    The 'site' parameter from the classic prevnext is no longer supported.

    Parameters
    ----------
    id
        arxiv id
    function
        prev or next
    context
        which archive or category to browse

    Returns
    -------
    dict
        Result response data.
    int
        HTTP status code.
    dict
        Headers to add to the response.

    Raises
    ------
    BadRequest
        Raised when request parameters are missing, invalid, or when an ID
        redirect cannot be returned even when the request parameters are valid.

    """
    # `not id` already covers both None and the empty string; the previous
    # `id is None or not id` forms were redundant.
    if not id:
        raise BadRequest('Missing article identifier')
    if function not in ['prev', 'next']:
        raise BadRequest('Missing or invalid function request')
    if not context:
        raise BadRequest('Missing context')
    if not (context in CATEGORIES_ACTIVE
            or context in ARCHIVES or context == 'all'):
        raise BadRequest('Invalid context')

    try:
        arxiv_id = Identifier(id)
    except IdentifierException:
        # The identifier is user-supplied and echoed back, so escape it.
        raise BadRequest(escape(f"Invalid article identifier {id}"))

    seq_id = get_sequential_id(paper_id=arxiv_id,
                               is_next=function == 'next',
                               context=context)
    if not seq_id:
        raise BadRequest(
            escape(f'No {function} article found for '
                   f'{arxiv_id.id} in {context}'))

    redirect_url = url_for('browse.abstract', arxiv_id=seq_id, context=context)
    return {}, status.HTTP_301_MOVED_PERMANENTLY, {'Location': redirect_url}
Ejemplo n.º 6
0
def get_tb_page(arxiv_id: str) -> Response:
    """Get the data needed to display the trackback page for an arXiv article.

    Parameters
    ----------
    arxiv_id : str
        The arXiv identifier as provided in the request.

    Returns
    -------
    dict
        Response data.
    int
        HTTP status code.
    dict
        Headers to add to the response.

    Raises
    ------
    InternalServerError
        Raised when there was an unexpected problem executing the query.
    TrackbackNotFound
        Raised when trackbacks for an article cannot be found, either because
        the identifier is invalid or the article metadata is not available.

    """
    response_data: Dict[str, Any] = {}
    response_headers: Dict[str, Any] = {}
    if not arxiv_id:
        raise TrackbackNotFound(data={'missing_id': True})
    try:
        arxiv_identifier = Identifier(arxiv_id=arxiv_id)
        # Redirect to the canonical id form when a non-canonical id was given.
        redirect = check_supplied_identifier(arxiv_identifier,
                                             'browse.tb')
        if redirect:
            return redirect
        response_data['arxiv_identifier'] = arxiv_identifier
        abs_meta = metadata.get_abs(arxiv_identifier.id)
        if abs_meta:
            response_data['abs_meta'] = abs_meta
        trackback_pings = get_paper_trackback_pings(arxiv_identifier.id)
        response_data['trackback_pings'] = trackback_pings
        # NOTE(review): abs_meta is used unguarded below even though the
        # assignment above treats it as possibly falsy — presumably get_abs
        # raises rather than returning None; confirm.
        if len(trackback_pings) > 0:
            response_data['author_links'] = \
                split_long_author_list(queries_for_authors(
                    abs_meta.authors.raw), truncate_author_list_size)
        response_status = status.HTTP_200_OK

    except AbsNotFoundException:
        raise TrackbackNotFound(data={'arxiv_id': arxiv_id, 'not_found': True})
    except (AbsException, IdentifierException):
        raise TrackbackNotFound(data={'arxiv_id': arxiv_id})
    except Exception as ex:
        # Catch-all boundary: log and convert to a 500 for the client.
        logger.warning(f'Error getting trackbacks: {ex}')
        raise InternalServerError from ex

    return response_data, response_status, response_headers
Ejemplo n.º 7
0
    def test_include_inspire_link(self, mock_docmeta):
        """Tests for the include_inspire_link function."""
        # (arxiv id, primary category, whether the link should be shown)
        cases = (
            ('1201.0001', 'hep-th', True),
            ('1212.0001', 'astro-ph.CO', False),
            ('1301.0001', 'astro-ph.CO', True),
            ('1806.01234', 'physics.ins-det', True),
            ('1212.0002', 'physics.gen-ph', False),
        )
        for paper_id, category, expect_link in cases:
            mock_docmeta.arxiv_identifier = Identifier(paper_id)
            mock_docmeta.primary_category = Category(category)
            if expect_link:
                self.assertTrue(include_inspire_link(mock_docmeta))
            else:
                self.assertFalse(include_inspire_link(mock_docmeta))
Ejemplo n.º 8
0
    def test_include_dblp_section(self, mock_docmeta):
        """Tests for the include_dblp_section fallback (from DB) function."""
        _SKIP = object()  # sentinel: do not check the listing path
        # (arxiv id, primary archive, include expected, expected path)
        cases = (
            ('1806.00001', 'cs', True,
             'db/journals/corr/corr1806.html#abs-1806-00001'),
            ('cs/0501001', 'cs', True,
             'db/journals/corr/corr0501.html#abs-cs-0501001'),
            ('cs/0412001', 'cs', True, None),
            ('1806.00002', 'math', False, _SKIP),
        )
        for paper_id, archive, expect_include, expect_path in cases:
            mock_docmeta.arxiv_identifier = Identifier(paper_id)
            mock_docmeta.primary_archive = Archive(archive)
            if expect_include:
                self.assertTrue(include_dblp_section(mock_docmeta))
            else:
                self.assertFalse(include_dblp_section(mock_docmeta))
            if expect_path is _SKIP:
                continue
            if expect_path is None:
                self.assertIsNone(get_computed_dblp_listing_path(mock_docmeta))
            else:
                self.assertEqual(get_computed_dblp_listing_path(mock_docmeta),
                                 expect_path)
Ejemplo n.º 9
0
 def test_recent_trackback_pings(self) -> None:
     """Test if recent trackbacks can be retrieved."""
     db = TestBrowseDatabaseService.database_service
     # A non-positive limit yields no trackbacks at all.
     none_requested: List = db.get_recent_trackback_pings(max_trackbacks=-1)
     self.assertEqual(len(none_requested), 0, 'List should be empty')
     recent: List = db.get_recent_trackback_pings(max_trackbacks=25)
     self.assertGreater(len(recent), 0, 'List should be nonempty')
     for tb in recent:
         # Each entry is (ping, paper id string, title string).
         self.assertIsInstance(tb[0], TrackbackPing)
         self.assertIsInstance(tb[1], str)
         self.assertIsInstance(Identifier(arxiv_id=tb[1]), Identifier,
                               'Value looks like an Identifier')
         self.assertIsInstance(tb[2], str)
Ejemplo n.º 10
0
    def _previous_id(self, identifier: Identifier) -> Optional['Identifier']:
        """
        Get previous consecutive Identifier relative to provided Identifier.

        Parameters
        ----------
        identifier : :class:`Identifier`

        Returns
        -------
        :class:`Identifier`
            The previous Identifier in sequence, or None when the input
            lacks year/month/num or the result fails validation.

        """
        if identifier.year is None \
                or identifier.month is None \
                or identifier.num is None:
            return None

        year = identifier.year
        month = identifier.month
        num = identifier.num - 1
        if num == 0:
            # Stepped past the first paper of the month: go back one month
            # (and one year when crossing January).
            month -= 1
            if month == 0:
                month = 12
                year -= 1

        # num == 0 here means "last paper of the previous month"; the width
        # of the sequence number depends on the id style and year.
        if identifier.is_old_id:
            if num == 0:
                num = 999
            candidate = '{}/{:02d}{:02d}{:03d}'.format(
                identifier.archive, year % 100, month, num)
        elif year >= 2015:
            if num == 0:
                num = 99999
            candidate = '{:02d}{:02d}.{:05d}'.format(year % 100, month, num)
        else:
            if num == 0:
                num = 9999
            candidate = '{:02d}{:02d}.{:04d}'.format(year % 100, month, num)

        try:
            return Identifier(arxiv_id=candidate)
        except IdentifierException:
            return None
Ejemplo n.º 11
0
    def _next_id(self, identifier: Identifier) -> Optional['Identifier']:
        """
        Get next consecutive Identifier relative to the provided Identifier.

        Parameters
        ----------
        identifier : :class:`Identifier`

        Returns
        -------
        :class:`Identifier`
            The next Identifier in sequence, or None when the input lacks
            year/month/num or the result fails validation.

        """
        if identifier.year is None \
                or identifier.month is None \
                or identifier.num is None:
            return None

        year = identifier.year
        month = identifier.month
        num = identifier.num + 1

        # Largest sequence number for this id style (old: 3 digits; new:
        # 4 digits before 2015, 5 digits from 2015 on).
        if identifier.is_old_id:
            max_num = 999
        elif identifier.year < 2015:
            max_num = 9999
        else:
            max_num = 99999

        if num > max_num:
            # Overflowed the month: advance to paper 1 of the next month,
            # rolling the year over at December.
            num = 1
            month += 1
            if month > 12:
                month = 1
                year += 1

        if identifier.is_old_id:
            candidate = '{}/{:02d}{:02d}{:03d}'.format(
                identifier.archive, year % 100, month, num)
        elif year >= 2015:
            candidate = '{:02d}{:02d}.{:05d}'.format(year % 100, month, num)
        else:
            candidate = '{:02d}{:02d}.{:04d}'.format(year % 100, month, num)

        try:
            return Identifier(arxiv_id=candidate)
        except IdentifierException:
            return None
Ejemplo n.º 12
0
    def test_bad_identifiers(self) -> None:
        """Test known bad identifiers.

        FIX: two missing commas previously caused implicit string
        concatenation ('0800.0001' + 'quant-ph/9409000' and '0612.1234' +
        '0703.1234'), so those four ids were never tested individually.
        """
        bad_ids = (
            'BAD_ID',
            'hep-th/990100',
            'hep-th/99010011',
            '0703.123',
            '0703.123456',
            '',
            '/',
            '0713.0001',
            '0800.0001',
            # ids ending 000 or .0000+ are not valid
            'quant-ph/9409000',
            '0704.0000',
            # other bad ids with different lengths of numbers
            'quant-ph/940900',
            'quant-ph/94090000',
            'quant-ph/94091',
            'quant-ph/940912',
            # P10k - add 123456 test
            'quant-ph/94091234',
            '0707.000',
            '0707.00000',
            '0707.1',
            '0707.12',
            '0707.123',
            '0707.123456',
            # double numbers (google makes these up?)
            '0705.35950705.3595v1/',
            'arxiv:0705.35950705.3595v1/',
            # non-numeric version
            '0707.2096va',
            '0707.2096va/',
            # pre-new-id
            '0612.1234',
            '0612.1234',
            '0703.1234',
            '0703.1234',
        )

        for bad_id in bad_ids:
            with self.assertRaises(
                    Exception,
                    msg=f'{bad_id} is an invalid identifier') as context:
                Identifier(arxiv_id=bad_id)

            self.assertIn('invalid arXiv identifier', str(context.exception))
Ejemplo n.º 13
0
    def get_abs(self, arxiv_id: str) -> DocMetadata:
        """
        Get the .abs metadata for the specified arXiv paper identifier.

        Parameters
        ----------
        arxiv_id : str
            The arXiv identifier string.

        Returns
        -------
        :class:`DocMetadata`

        Raises
        ------
        AbsDeletedException
            If the paper id appears in DELETED_PAPERS.
        AbsVersionNotFoundException
            If a specific requested version of a new-style id is missing.

        """
        paper_id = Identifier(arxiv_id=arxiv_id)

        # Deleted papers short-circuit with the recorded deletion message.
        if paper_id.id in DELETED_PAPERS:
            raise AbsDeletedException(DELETED_PAPERS[paper_id.id])

        latest_version = self._get_version(identifier=paper_id)
        # No explicit version requested, or the latest was requested:
        # the latest version alone is the definitive record.
        if not paper_id.has_version \
           or paper_id.version == latest_version.version:
            return dataclasses.replace(latest_version,
                                       is_definitive=True,
                                       is_latest=True)

        try:
            this_version = self._get_version(identifier=paper_id,
                                             version=paper_id.version)
        except AbsNotFoundException as e:
            # NOTE(review): old-style ids re-raise the not-found as-is;
            # presumably their version files are stored differently — confirm.
            if paper_id.is_old_id:
                raise
            else:
                raise AbsVersionNotFoundException(e)

        # Several fields need to reflect the latest version's data
        combined_version: DocMetadata = dataclasses.replace(
            this_version,
            version_history=latest_version.version_history,
            categories=latest_version.categories,
            primary_category=latest_version.primary_category,
            secondary_categories=latest_version.secondary_categories,
            primary_archive=latest_version.primary_archive,
            primary_group=latest_version.primary_group,
            is_definitive=True,
            is_latest=False)

        return combined_version
Ejemplo n.º 14
0
 def test_good_identifiers(self) -> None:
     """Test known good identifiers."""
     # Maps the id as supplied by a client to the canonical id the
     # parser should produce (slashes, prefixes and case are normalized).
     good_ids = {
         'hep-th/9901001': 'hep-th/9901001',
         '/hep-th/0702050': 'hep-th/0702050',
         '/hep-th/0702050v1': 'hep-th/0702050',
         '/0708.1234': '0708.1234',
         '/0708.1234v1': '0708.1234',
         'hep-th//9901001': 'hep-th/9901001',
         'hep-th///9901001': 'hep-th/9901001',
         'arxiv:hep-th/9901001': 'hep-th/9901001',
         'hep-th/9901001/extra': 'hep-th/9901001',
         'hep-th/9901001/ExTrA': 'hep-th/9901001',
         'HEP-TH/9901001/extra': 'hep-th/9901001',
         'HEP-TH/9901001/ExTrA': 'hep-th/9901001',
         '0704.0001': '0704.0001'
     }
     for supplied, canonical in good_ids.items():
         parsed = Identifier(arxiv_id=supplied)
         self.assertIsInstance(parsed, Identifier, 'valid instance')
         self.assertEqual(parsed.id, canonical)
         self.assertEqual(parsed.ids, supplied)
Ejemplo n.º 15
0
def get_abs_page(arxiv_id: str) -> Response:
    """Get abs page data from the document metadata service.

    Parameters
    ----------
    arxiv_id : str
        The arXiv identifier as provided in the request.

    Returns
    -------
    dict
        Search result response data.
    int
        HTTP status code.
    dict
        Headers to add to the response.

    Raises
    ------
    :class:`.InternalServerError`
        Raised when there was an unexpected problem executing the query.
    """
    response_data: Dict[str, Any] = {}
    response_headers: Dict[str, Any] = {}
    try:
        # Normalize legacy-style request parameters before parsing the id.
        arxiv_id = _check_legacy_id_params(arxiv_id)
        arxiv_identifier = Identifier(arxiv_id=arxiv_id)

        # Permanently redirect when the supplied id is not canonical.
        redirect = check_supplied_identifier(arxiv_identifier,
                                             'browse.abstract')
        if redirect:
            return redirect

        abs_meta = metadata.get_abs(arxiv_id)
        response_data['requested_id'] = arxiv_identifier.idv \
            if arxiv_identifier.has_version else arxiv_identifier.id
        response_data['abs_meta'] = abs_meta
        response_data['meta_tags'] = meta_tag_metadata(abs_meta)
        response_data['author_links'] = \
            split_long_author_list(queries_for_authors(
                abs_meta.authors.raw), truncate_author_list_size)
        # Late-bound so the template can build per-author search links.
        response_data['url_for_author_search'] = \
            lambda author_query: url_for('search_archive',
                                         searchtype='author',
                                         archive=abs_meta.primary_archive.id,
                                         query=author_query)

        # Dissemination formats for download links
        # Download format preference comes from a legacy cookie.
        download_format_pref = request.cookies.get('xxx-ps-defaults')
        add_sciencewise_ping = _check_sciencewise_ping(abs_meta.arxiv_id_v)
        response_data['formats'] = metadata.get_dissemination_formats(
            abs_meta, download_format_pref, add_sciencewise_ping)

        # Following are less critical and template must display without them
        # try:
        _non_critical_abs_data(abs_meta, arxiv_identifier, response_data)
        # except Exception:
        #    logger.warning("Error getting non-critical abs page data",
        #                   exc_info=app.debug)

    except AbsNotFoundException:
        # Old-style ids in a known archive get a tailored not-found page.
        if arxiv_identifier.is_old_id and arxiv_identifier.archive \
           in taxonomy.definitions.ARCHIVES:
            archive_name = taxonomy.definitions.ARCHIVES[
                arxiv_identifier.archive]['name']
            raise AbsNotFound(
                data={
                    'reason': 'old_id_not_found',
                    'arxiv_id': arxiv_id,
                    'archive_id': arxiv_identifier.archive,
                    'archive_name': archive_name
                })
        raise AbsNotFound(data={'reason': 'not_found', 'arxiv_id': arxiv_id})
    except AbsVersionNotFoundException:
        raise AbsNotFound(
            data={
                'reason': 'version_not_found',
                'arxiv_id': arxiv_identifier.idv,
                'arxiv_id_latest': arxiv_identifier.id
            })
    except AbsDeletedException as e:
        raise AbsNotFound(
            data={
                'reason': 'deleted',
                'arxiv_id_latest': arxiv_identifier.id,
                'message': e
            })
    except IdentifierIsArchiveException as e:
        raise AbsNotFound(data={
            'reason': 'is_archive',
            'arxiv_id': arxiv_id,
            'archive_name': e
        })
    except IdentifierException:
        raise AbsNotFound(data={'arxiv_id': arxiv_id})
    except AbsException as e:
        raise InternalServerError(
            'There was a problem. If this problem persists, please contact '
            '[email protected].') from e

    response_status = status.HTTP_200_OK

    # Honor conditional-request headers (e.g. If-Modified-Since).
    not_modified = _check_request_headers(abs_meta, response_data,
                                          response_headers)
    if not_modified:
        return {}, status.HTTP_304_NOT_MODIFIED, response_headers

    return response_data, response_status, response_headers
Ejemplo n.º 16
0
    def parse_abs_file(filename: str) -> DocMetadata:
        """Parse an arXiv .abs file into a :class:`DocMetadata`.

        Parameters
        ----------
        filename : str
            Path of the .abs file to parse.

        Returns
        -------
        :class:`DocMetadata`

        Raises
        ------
        AbsNotFoundException
            If the file does not exist.
        AbsParsingException
            If the file cannot be decoded or its structure is unexpected.
        AbsException
            If the archive cannot be inferred from the identifier.

        """
        try:
            with open(filename, mode='r', encoding='latin-1') as absf:
                raw = absf.read()
        except FileNotFoundError:
            raise AbsNotFoundException
        except UnicodeDecodeError as e:
            # TODO: log this
            # FIX: include the failing filename in the message (it was a
            # literal placeholder before) so the error can be traced.
            raise AbsParsingException(
                f'Failed to decode .abs file "{filename}": {e}')

        # TODO: clean up
        modified = datetime.fromtimestamp(os.path.getmtime(filename),
                                          tz=gettz('US/Eastern'))
        modified = modified.astimezone(tz=tzutc())

        # there are two main components to an .abs file that contain data,
        # but the split must always return four components
        components = RE_ABS_COMPONENTS.split(raw)
        if len(components) != 4:
            raise AbsParsingException(
                'Unexpected number of components parsed from .abs.')

        # everything else is in the second main component
        prehistory, misc_fields = re.split(r'\n\n', components[1])

        fields: Dict[str, Any] = \
            AbsMetaSession._parse_metadata_fields(key_value_block=misc_fields)

        # abstract is the first main component
        fields['abstract'] = components[2]

        id_match = RE_ARXIV_ID_FROM_PREHISTORY.match(prehistory)

        if not id_match:
            raise AbsParsingException(
                'Could not extract arXiv ID from prehistory component.')

        arxiv_id = id_match.group('arxiv_id')

        # Drop the id line; the remaining lines are version entries.
        prehistory = re.sub(r'^.*\n', '', prehistory)
        parsed_version_entries = re.split(r'\n', prehistory)

        # submitter data
        from_match = RE_FROM_FIELD.match(parsed_version_entries.pop(0))
        if not from_match:
            raise AbsParsingException('Could not extract submitter data.')
        name = from_match.group('name')
        if name is not None:
            name = name.rstrip()
        email = from_match.group('email')

        # get the version history for this particular version of the document
        if not parsed_version_entries:
            raise AbsParsingException('At least one version entry expected.')

        (version, version_history, arxiv_id_v) \
            = AbsMetaSession._parse_version_entries(
                arxiv_id=arxiv_id,
                version_entry_list=parsed_version_entries)

        arxiv_identifier = Identifier(arxiv_id=arxiv_id)

        # named (key-value) fields
        # FIX: name the missing field(s) instead of a bare f-string with no
        # placeholders.
        missing = [rf for rf in REQUIRED_FIELDS if rf not in fields]
        if missing:
            raise AbsParsingException(
                'missing required field(s): {}'.format(', '.join(missing)))

        # some transformations
        category_list: List[str] = []
        primary_category = None

        if 'categories' in fields and fields['categories']:
            category_list = fields['categories'].split()
            if category_list[0] in taxonomy.CATEGORIES:
                primary_category = Category(id=category_list[0])
                primary_archive = \
                    Archive(
                        id=taxonomy.CATEGORIES[primary_category.id]['in_archive'])
            elif arxiv_identifier.is_old_id:
                # Unrecognized category: fall back to the archive embedded
                # in an old-style identifier.
                primary_archive = \
                    Archive(id=arxiv_identifier.archive)  # type: ignore
            # NOTE(review): a new-style id with an unrecognized primary
            # category leaves primary_archive unbound and the constructor
            # below raises NameError — confirm that input cannot occur.
        elif arxiv_identifier.is_old_id:
            primary_archive = \
                Archive(id=arxiv_identifier.archive)  # type: ignore
        else:
            raise AbsException('Cannot infer archive from identifier.')

        doc_license: License = \
            License() if 'license' not in fields else License(
                recorded_uri=fields['license'])
        # Strip the submitter's email address from the raw record so it is
        # safe to display. FIX: `count` as keyword (positional is deprecated).
        raw_safe = re.sub(RE_FROM_FIELD, r'\g<from>\g<name>', raw, count=1)

        return DocMetadata(
            raw_safe=raw_safe,
            arxiv_id=arxiv_id,
            arxiv_id_v=arxiv_id_v,
            arxiv_identifier=Identifier(arxiv_id=arxiv_id),
            title=fields['title'],
            abstract=fields['abstract'],
            authors=AuthorList(fields['authors']),
            submitter=Submitter(name=name, email=email),
            categories=fields['categories']
            if 'categories' in fields else None,
            primary_category=primary_category,
            primary_archive=primary_archive,
            primary_group=Group(
                id=taxonomy.ARCHIVES[primary_archive.id]['in_group']),
            secondary_categories=[
                Category(id=x) for x in category_list[1:]
                if (category_list and len(category_list) > 1)
            ],
            journal_ref=None
            if 'journal_ref' not in fields else fields['journal_ref'],
            report_num=None
            if 'report_num' not in fields else fields['report_num'],
            doi=None if 'doi' not in fields else fields['doi'],
            acm_class=None
            if 'acm_class' not in fields else fields['acm_class'],
            msc_class=None
            if 'msc_class' not in fields else fields['msc_class'],
            proxy=None if 'proxy' not in fields else fields['proxy'],
            comments=fields['comments'] if 'comments' in fields else None,
            version=version,
            license=doc_license,
            version_history=version_history,
            modified=modified
            # private=private  # TODO, not implemented
        )
Ejemplo n.º 17
0
def get_prevnext(request_params: MultiDict) -> Response:
    """
    Get the next or previous arXiv ID in the browse context.

    The 'id', 'function', and 'context' request parameters are required. The
    'site' parameter from the classic prevnext is no longer supported.

    Parameters
    ----------
    request_params : dict

    Returns
    -------
    dict
        Search result response data.
    int
        HTTP status code.
    dict
        Headers to add to the response.

    Raises
    ------
    InternalServerError
        Raised when there was an unexpected problem executing the query.
    BadRequest
        Raised when request parameters are missing, invalid, or when an ID
        redirect cannot be returned even when the request parameters are valid.

    """
    # Validation order matters for which BadRequest message is returned:
    # id presence, id parse, function, then context.
    if 'id' not in request_params:
        raise BadRequest('Missing article identifier')
    try:
        arxiv_id = Identifier(request_params['id'])
    except IdentifierException:
        raise BadRequest(f"Invalid article identifier {request_params['id']}")

    function = request_params.get('function')
    if function not in ['prev', 'next']:
        raise BadRequest('Missing or invalid function request')

    if 'context' not in request_params:
        raise BadRequest('Missing context')
    context = request_params['context']

    context_is_known = (context in CATEGORIES_ACTIVE
                        or context in ARCHIVES
                        or context == 'all')
    if not context_is_known:
        raise BadRequest('Invalid context')

    is_next = function == 'next'
    try:
        seq_id = get_sequential_id(paper_id=arxiv_id,
                                   is_next=is_next,
                                   context=context)
    except Exception as ex:
        logger.warning(f'Error getting sequential ID: {ex}')
        raise InternalServerError from ex

    if not seq_id:
        raise BadRequest(
            f'No {"next" if is_next else "previous"} article found for '
            f'{arxiv_id.id} in {context}')

    redirect_url = url_for('browse.abstract', arxiv_id=seq_id, context=context)
    return {}, status.HTTP_301_MOVED_PERMANENTLY, {'Location': redirect_url}