Example no. 1
    def test_repeating(self, datadir, event_loop):
        set_config({'unify_booktitle': [
            [r'(?P<prefix>.*) remove(?P<suffix>.*)',
                r'{prefix}{suffix}', 'priority:50', 'repeat', 'kind:plain'],
        ]})

        unify_me = make_entry(
            {'booktitle':
             ('Proceedings remove of remove some remove conference')})

        u = Unifier()
        sugg = u.unify_entry(unify_me)

        assert (sugg.data['booktitle'][0] ==
                ('Proceedings of some conference',
                 Suggestion.KIND_PLAIN))

        # Test repeat-unifying suggestion
        sugg = Suggestion('test', unify_me)
        sugg.add_field('booktitle',
                       'Proceedings remove of remove some remove conference')

        u.unify_suggestion(sugg)

        assert (sugg.data['booktitle'][0] ==
                ('Proceedings of some conference', Suggestion.KIND_PLAIN))
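The 'repeat' flag evidently makes a rule re-apply until its pattern no longer matches. A minimal sketch of that fixpoint loop, using only the standard re module; PATTERN, TEMPLATE and apply_repeating are invented names, not part of the code above:

    import re

    # Hypothetical stand-ins for the rule configured in the test above
    PATTERN = re.compile(r'(?P<prefix>.*) remove(?P<suffix>.*)')
    TEMPLATE = '{prefix}{suffix}'

    def apply_repeating(value):
        # Re-apply the rule until it stops matching (fixpoint), which is
        # what the 'repeat' flag appears to request
        while True:
            m = PATTERN.fullmatch(value)
            if not m:
                return value
            value = TEMPLATE.format(**m.groupdict())

    print(apply_repeating('Proceedings remove of remove some remove conference'))
    # -> Proceedings of some conference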
Example no. 2
    def test_chaining(self, datadir, event_loop):
        set_config({'unify_booktitle': [
            [r'(?P<prefix>.*)first(?P<suffix>.*)',
                r'{prefix}1st{suffix}', 'kind:plain', 'priority:50'],
            [r'(?P<prefix>.*) IEEE(?P<suffix>.*)',
                r'{prefix}{suffix}', 'kind:regex']
        ]})

        unify_me = make_entry(
            {'booktitle':
             ('Proceedings of the first IEEE conference on whatever')})

        u = Unifier()
        sugg = u.unify_entry(unify_me)

        assert (sugg.data['booktitle'][0] ==
                ('Proceedings of the 1st conference on whatever',
                 Suggestion.KIND_RE))

        # Test chain-unifying suggestion
        sugg = Suggestion('test', unify_me)
        sugg.add_field('booktitle',
                       ('Proceedings of the first'
                        ' IEEE conference on whatever'))
        u.unify_suggestion(sugg)

        assert (sugg.data['booktitle'][0] ==
                ('Proceedings of the 1st conference on whatever',
                 Suggestion.KIND_RE))
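Chaining, by contrast, runs every configured rule in priority order over the output of the previous one, and the kind of the last matching rule evidently wins (hence KIND_RE above, from the second rule). A rough standalone sketch; RULES and chain_unify are invented names:

    import re

    # Hypothetical representation of the two rules configured above,
    # already sorted by priority (highest first)
    RULES = [
        (re.compile(r'(?P<prefix>.*)first(?P<suffix>.*)'),
         '{prefix}1st{suffix}', 'plain'),
        (re.compile(r'(?P<prefix>.*) IEEE(?P<suffix>.*)'),
         '{prefix}{suffix}', 'regex'),
    ]

    def chain_unify(value):
        kind = None
        for pattern, template, rule_kind in RULES:
            m = pattern.fullmatch(value)
            if m:
                value = template.format(**m.groupdict())
                kind = rule_kind  # the last matching rule determines the kind
        return value, kind

    print(chain_unify('Proceedings of the first IEEE conference on whatever'))
    # -> ('Proceedings of the 1st conference on whatever', 'regex')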
Example no. 3
    def test_unify_suggestion(self, datadir, event_loop):
        set_config({'unify_booktitle': [
            [r'\d{4} IEEE (?P<name>[^\(]*) \((?P<short>[^\)]*)\)',
             r'Proceedings of the \d*(th|st|nd|rd) {name} \({short}.*\)'],
        ]})

        dummy_entry = make_entry({})
        sugg = Suggestion('test', dummy_entry)
        sugg.add_field('booktitle',
                       ('2016 IEEE International Parallel and'
                        ' Distributed Processing Symposium (IPDPS)'))

        u = Unifier()
        u.unify_suggestion(sugg)

        assert (sugg.data['booktitle'][0] ==
                (r'Proceedings of the \d*(th|st|nd|rd)'
                 r' International Parallel and Distributed'
                 r' Processing Symposium \(IPDPS.*\)',
                 Suggestion.KIND_RE))
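Because this rule carries no 'kind:plain' flag, the replacement is stored as a regular expression (KIND_RE), i.e. a pattern meant to match any edition of the same proceedings. A quick standalone check of the produced pattern with the standard re module:

    import re

    produced = (r'Proceedings of the \d*(th|st|nd|rd)'
                r' International Parallel and Distributed'
                r' Processing Symposium \(IPDPS.*\)')

    # Any concrete edition of these proceedings should match the pattern
    assert re.fullmatch(
        produced,
        'Proceedings of the 30th International Parallel and'
        ' Distributed Processing Symposium (IPDPS 2016)')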
Example no. 4
    def _query_blocking(self, entry, provider):
        isbn = entry.data.get('isbn')

        if not isbn:
            self._ui.finish_subtask('ISBNQuery')
            return None

        if notisbn(isbn):
            self._ui.finish_subtask('ISBNQuery')
            return (None, "{} is not a valid ISBN.".format(isbn))

        # Okay, we're actually going to make an HTTP request
        self._ratelimit.get()

        try:
            bibtex_data = self._formatter(meta(isbn, service=provider))
        except ISBNLibException as e:
            self._ui.finish_subtask('ISBNQuery')
            return (None, e)
        except socket.timeout:
            self._ui.finish_subtask('ISBNQuery')
            raise RetrievalProblem("Socket timeout during"
                                   " ISBN metadata retrieval")

        try:
            parsed_data = bibtexparser.loads(bibtex_data)
        except Exception:
            self._ui.finish_subtask('ISBNQuery')
            raise RetrievalProblem("Data from ISBN source could not be parsed")

        if len(parsed_data.entries) != 1:
            self._ui.finish_subtask('ISBNQuery')
            raise RetrievalProblem(
                "ISBN search did not return exactly one result.")

        retrieved = Entry(parsed_data.entries[0], self._ui)
        s = Suggestion("isbn_{}".format(provider), entry)
        for (k, v) in retrieved.data.items():
            if k.lower() == 'id':
                continue
            s.add_field(k, v)

        for (first, last) in s.authors:
            s.add_author(first, last)

        for (first, last) in s.editors:
            s.add_editor(first, last)

        return (s, None)
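For context, meta and notisbn come from isbnlib, and self._formatter is presumably one of isbnlib's BibTeX formatters. A minimal standalone version of the same lookup, assuming the default 'goob' (Google Books) service; fetch_bibtex is an invented name:

    from isbnlib import meta, notisbn
    from isbnlib.registry import bibformatters

    def fetch_bibtex(isbn, provider='goob'):
        # Return a BibTeX string for the given ISBN, or None if invalid
        if not isbn or notisbn(isbn):
            return None
        return bibformatters['bibtex'](meta(isbn, service=provider))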
Example no. 5
    def test_re_suggestion(self, datadir):
        e = make_entry({
            'title':
            'This is some title.',
            'booktitle':
            "Proceedings of the 20th Conference on Something Awesome (CSA'20)"
        })

        s = Suggestion('test', e)
        s.add_field('title', 'This is some title.', kind=Suggestion.KIND_RE)
        s.add_field('booktitle',
                    r'Proceedings of the \d+(th|st|rd|nd) .* \(.*\)',
                    kind=Suggestion.KIND_RE)

        d = Differ(e)
        result = d.diff(s)
        assert result == []

        s = Suggestion('nonmatching_test', e)
        s.add_field('booktitle', r'Nope', kind=Suggestion.KIND_RE)
        result = d.diff(s)
        assert len(result) == 1
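The passing first diff implies that the Differ treats a KIND_RE suggestion as agreeing with the entry whenever the entry's value matches the suggested pattern. A sketch of that comparison; values_agree is an invented name, not the Differ's actual API:

    import re

    def values_agree(entry_value, suggested_value, kind_is_re):
        # Regex suggestions match by pattern, plain ones by equality
        if kind_is_re:
            return re.fullmatch(suggested_value, entry_value) is not None
        return entry_value == suggested_value

    assert values_agree(
        "Proceedings of the 20th Conference on Something Awesome (CSA'20)",
        r'Proceedings of the \d+(th|st|rd|nd) .* \(.*\)',
        kind_is_re=True)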
Example no. 6
    def test_list_ignore_order(self, datadir):
        e = make_entry({
            'title': 'This is some title.',
            'issn': '1234-5678, 2345-6789'
        })

        s = Suggestion('test', e)
        s.add_field('title', 'This is some title.')
        s.add_field('issn', '2345-6789, 1234-5678')

        d = Differ(e)
        result = d.diff(s)
        assert result == []
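Comma-separated list fields such as issn evidently compare as sets, ignoring element order. A one-function sketch of such a comparison; lists_equal_ignoring_order is an invented name:

    def lists_equal_ignoring_order(a, b):
        # Split on commas, strip whitespace, compare as sets
        def split(s):
            return {part.strip() for part in s.split(',')}
        return split(a) == split(b)

    assert lists_equal_ignoring_order('1234-5678, 2345-6789',
                                      '2345-6789, 1234-5678')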
Example no. 7
    async def _execute_query(self, entry, url, retry_number=0):
        if not url:
            self._ui.finish_subtask('MetaQuery')
            return None

        # Okay, we're actually going to make an HTTP request
        await self._ratelimit.get()

        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url,
                                       headers=MetaSource.HEADERS) as resp:
                    status = resp.status
                    if status == 403:
                        try:
                            html = await resp.text()
                        except UnicodeDecodeError:
                            # Undecodable body; fall through to the
                            # retry logic below
                            html = None

                        if html and self._detect_captcha(html):
                            self._ui.finish_subtask('MetaQuery')
                            LOGGER.info(
                                (f"URL {url} requires a captcha to "
                                 "be solved. Giving up."))
                            raise RetrievalProblem(
                                (f"URL {url} requires a "
                                 "captcha to be solved."))

                        if retry_number == self._max_retries:
                            self._ui.finish_subtask('MetaQuery')
                            raise RetrievalProblem(
                                (f"URL {url} still results in 403 "
                                 f"after {self._max_retries} retries."
                                 " Giving up."))
                        LOGGER.debug((f"Got a 403 while accessing {url}."
                                      f" Backing off. "
                                      f"Retry {retry_number+1}..."))
                        await self._ratelimit.backoff()
                        await asyncio.sleep(self._retry_pause)
                        return await self._execute_query(entry, url,
                                                         retry_number+1)

                    if status != 200:
                        self._ui.finish_subtask('MetaQuery')
                        raise RetrievalProblem(
                            "Accessing URL {} returns status {}"
                            .format(url, status))

                    try:
                        html = await resp.text()
                    except UnicodeDecodeError:
                        self._ui.finish_subtask('MetaQuery')
                        raise RetrievalProblem(
                            f"Content at URL {url} could not be interpreted")

                    parser = MetadataHTMLParser(self._ui, str(resp.url))
                    parser.feed(html)

                    sugg = Suggestion("meta", entry)

                    for (k, v) in parser.get_metadata().items():
                        if isinstance(v, list):
                            sugg.add_field(k,
                                           [remove_tags(vi) for vi in v])
                        else:
                            sugg.add_field(k, remove_tags(v))

                    for (first, last) in parser.get_authors():
                        sugg.add_author(first, last)

                    self._ui.finish_subtask('MetaQuery')
                    return sugg
        except asyncio.TimeoutError:
            self._ui.finish_subtask('MetaQuery')
            LOGGER.error(f"Timeout trying to retrieve URL {url}")
            raise RetrievalProblem(
                f"Timeout trying to retrieve URL {url}")
Example no. 8
    def _query_blocking(self, entry):
        doi = entry.get_probable_doi()
        if not doi:
            self._ui.finish_subtask('CrossrefQuery')
            return None

        try:
            data = crossref_commons.retrieval.get_publication_as_json(doi)
        except ValueError as e:
            self._ui.finish_subtask('CrossrefQuery')
            if str(e) == f"DOI {doi} does not exist":
                # This isn't really an error; CrossRef just does not know
                # about it
                pass
            else:
                LOGGER.error((f"Error retrieving data for {entry.get_id()}. "
                              f"{e}"))
            return None
        except ConnectionError as e:
            # TODO retry?
            self._ui.finish_subtask('CrossrefQuery')
            LOGGER.error(
                (f"Connection error retrieving data for {entry.get_id()}. "
                 f"{e}"))
            return None

        s = Suggestion("crossref", entry)

        # Special handling for type
        btype = TYPE_MAPPING.get(data['type'])
        if not btype:
            LOGGER.warning(
                "Type {} not found in crossref source. (Entry {})".format(
                    data['type'], entry.get_id()))
        else:
            s.add_field('entrytype', btype)

        # Special handling for authors
        for author_data in data.get('author', []):
            s.add_author(
                author_data.get('given', "").strip(),
                author_data.get('family', "").strip())

        # Special handling for editors
        for editor_data in data.get('editor', []):
            s.add_editor(
                editor_data.get('given', "").strip(),
                editor_data.get('family', "").strip())

        # Special handling for journal / book title
        if data['type'] in ('journal-article', 'book-chapter'):
            journal = flexistrip(data.get('container-title'))
            if journal:
                s.add_field('journal', journal)

        # Special handling for URL. Only take it if it's not a DOI-Url
        url = flexistrip(data.get('URL'))
        if url and (CrossrefSource.DOI_URL_RE.match(url) is None):
            s.add_field('url', url)

        # All other fields
        for field_from, field_to in FIELD_MAPPING.items():
            if isinstance(field_to, dict):
                if entry.data['entrytype'] in field_to:
                    field_to = field_to[entry.data['entrytype']]
                else:
                    field_to = field_to.get('default')

            if not field_to:
                continue

            if field_from in data:
                s.add_field(field_to, flexistrip(data[field_from]))

        self._ui.finish_subtask('CrossrefQuery')
        return s
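FIELD_MAPPING itself is not shown, but the loop above implies its values are either a target field name or a per-entrytype dict with an optional 'default' key. A hypothetical fragment illustrating the shape the loop expects (the concrete entries are invented):

    # Hypothetical fragment; the real mapping lives in the source module
    FIELD_MAPPING = {
        'publisher': 'publisher',
        'volume': 'volume',
        # A Crossref field may map to different BibTeX fields per entry type
        'container-title': {
            'inproceedings': 'booktitle',
            'default': None,  # falsy: skip the field for other entry types
        },
    }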
Example no. 9
    def _query_blocking(self, entry):
        doi = entry.get_probable_doi()

        if not doi:
            self._ui.finish_subtask('DataCiteQuery')
            return None

        # Okay, we're actually going to make an HTTP request
        self._ratelimit.get()

        url = "https://api.datacite.org/dois/{}".format(
            urllib.parse.quote(doi))
        # Arbitrary timeout so a stalled connection cannot hang forever
        response = requests.get(url, timeout=30)

        if response.status_code != 200:
            self._ui.finish_subtask('DataCiteQuery')
            return None

        try:
            data = response.json()
        except ValueError:
            LOGGER.warning("Response did not contain JSON")
            self._ui.finish_subtask('DataCiteQuery')
            return None

        if 'errors' in data:
            self._ui.finish_subtask('DataCiteQuery')
            return None

        attrs = data['data']['attributes']

        s = Suggestion('datacite', entry)

        # Authors
        for adata in attrs.get('creators', []):
            if 'givenName' in adata and 'familyName' in adata:
                s.add_author(adata['givenName'], adata['familyName'])

        # Editors
        for adata in attrs.get('contributors', []):
            if adata.get('contributorType') == 'Editor':
                if 'givenName' in adata and 'familyName' in adata:
                    s.add_editor(adata['givenName'], adata['familyName'])

        # Title…s?
        # TODO what happens if there are multiple titles?
        if path_exists(attrs, ('titles', 0, 'title')):
            s.add_field('title', attrs['titles'][0]['title'])

        if 'publisher' in attrs:
            s.add_field('publisher', attrs['publisher'])

        if 'publicationYear' in attrs:
            s.add_field('year', attrs['publicationYear'])

        if 'url' in attrs:
            s.add_field('url', attrs['url'])

        cdata = attrs.get('container', {})
        ctype = cdata.get('type')

        if ctype == 'Journal':
            if 'title' in cdata:
                s.add_field('journal', cdata['title'])
        elif ctype == 'Book Series':
            if 'title' in cdata:
                s.add_field('booktitle', cdata['title'])

        if ctype in ('Journal', 'Book Series'):
            if 'volume' in cdata:
                s.add_field('volume', cdata['volume'])
            if 'issue' in cdata:
                s.add_field('issue', cdata['issue'])
            if cdata.get('identifierType') == 'ISSN':
                s.add_field('issn', cdata['identifier'])
            if 'firstPage' in cdata and 'lastPage' in cdata:
                s.add_field(
                    'pages', '{}--{}'.format(cdata['firstPage'],
                                             cdata['lastPage']))

        if path_exists(attrs, ('type', 'bibtex')):
            s.add_field('ENTRYTYPE', attrs['type']['bibtex'])

        self._ui.finish_subtask('DataCiteQuery')
        return s
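path_exists is a small helper for probing nested dicts and lists; it is not shown above, but it presumably behaves like this sketch:

    def path_exists(obj, path):
        # Follow a sequence of keys/indices, treating any miss as absence
        for step in path:
            try:
                obj = obj[step]
            except (KeyError, IndexError, TypeError):
                return False
        return True

    assert path_exists({'titles': [{'title': 'T'}]}, ('titles', 0, 'title'))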