Beispiel #1
0
 def fetch(self):
     """Resolve a DOI for ``self.uri`` and import what that DOI provides.

     The DOI is looked for first in the uri text itself and, only when
     that fails, in the body of the page the uri points to. If neither
     yields a DOI, the raw uri is handed to ``fetch_from_doi`` instead.

     Data and files of the returned context are copied into ``self.ctx``.
     When the context provides files the method stops there; in every
     other case ``self.get_files()`` is called as a fallback.

     Errors raised while opening or decoding the page propagate to the
     caller unchanged.
     """
     doi_str = doi.find_doi_in_text(self.uri)
     if not doi_str:
         # Use the response as a context manager so the underlying
         # connection is closed even if reading or decoding fails.
         with urllib.request.urlopen(self.uri) as response:
             page_text = response.read().decode('utf-8')
         doi_str = doi.find_doi_in_text(page_text) or self.uri
     ctx = self.fetch_from_doi(doi_str)
     if ctx:
         if ctx.data:
             self.ctx.data = ctx.data
         if ctx.files:
             self.ctx.files = ctx.files
             # Files were obtained through the DOI; no fallback needed.
             return
     self.get_files()
Beispiel #2
0
def test_find_doi_in_line() -> None:
    """Check that ``find_doi_in_text`` extracts the expected DOI per sample."""
    # Each entry is (input text, DOI expected to be found in that text).
    samples = [
        ('http://dx.doi.org/10.1063/1.881498', '10.1063/1.881498'),
        ('http://dx.doi.org/10.1063%2F1.881498', '10.1063/1.881498'),
        (2 * 'qer ' + 'var doi = "12345/12345.3"', '12345/12345.3'),
        (2 * 'qer ' + "var doi = '12345/12345.3';fas", '12345/12345.3'),
        (2 * 'qer ' + "var DoI = 12345%2F12345.3", '12345/12345.3'),
        (2 * 'qer ' + "var DoI : 12345%2F12345.3", '12345/12345.3'),
        ('http://scitation.org/doi/10.1063/1.881498', '10.1063/1.881498'),
        ('org/doi(10.1063/1.881498)', '10.1063/1.881498'),
        ('/scitation.org/doi/10.1063/1.881498?234saf=34', '10.1063/1.881498'),
        ('/scitation.org/doi/10.1063/1.88149 8?234saf=34', '10.1063/1.88149'),
        ('/scitation.org/doi/10.1063/1.uniau12?as=234', '10.1063/1.uniau12'),
        ('https://doi.org/10.1093/analys/anw053', '10.1093/analys/anw053'),
        ('http://.scitation.org/doi/10.1063/1.mart(88)1498?asdfwer',
         '10.1063/1.mart(88)1498'),
        ('@ibook{doi:10.1002/9780470125915.ch2,', '10.1002/9780470125915.ch2'),
        ('<rdf:Description rdf:about="" xmlns:dc="http://purl.org/dc/elements'
         '.1/"><dc:format>application/pdf</dc:format><dc:identifier>'
         'doi:10.1063/1.5079474</dc:identifier></rdf:Description>',
         '10.1063/1.5079474'),
        ('<(DOI:10.1002/9780470915.CH2)/S/URI,', '10.1002/9780470915.CH2'),
        ('URL<(DOI:10.1002/9780470125915.CH2,', '10.1002/9780470125915.CH2'),
        (r'A<</S/URI/URI(https://doi.org/10.1016/j.comptc.2018.10.004)>>/'
         r'Border[0 0 0]/M(D:20181022082356+0530)/Rect[147.40158 594.36926'
         r'347.24957 605.36926]/Subtype/Link/Type/A',
         '10.1016/j.comptc.2018.10.004'),
        ('doi(10.1038/s41535-018-0103-6;)', '10.1038/s41535-018-0103-6'),
    ]
    for text, expected in samples:
        found = find_doi_in_text(text)
        assert found == expected
    def _search_direct_url(self):
        """
        Find the direct url of the document embedded in the Sci-Hub page.

        Sci-Hub embeds papers in an iframe. This function finds the actual
        source url which looks something like

            https://moscow.sci-hub.io/.../....pdf.

        As a side effect ``self.doi`` is set to the doi found in the page
        body, or to ``''`` when none is found.

        :returns: The iframe's ``src`` url (a protocol-relative ``//...``
            url gets ``https:`` prepended), or ``None`` when the server does
            not answer with status 200, the page contains no iframe, or the
            iframe carries no ``src`` attribute.
        """

        logger.debug('pinging {0}'.format(self.base_url))
        # NOTE(review): verify=False presumably disables TLS certificate
        # verification (requests-style session) -- kept as in the original,
        # confirm that this is intended.
        ping = self.session.get(self.base_url, timeout=1, verify=False)
        if ping.status_code != 200:
            logger.error('server {0} is down '.format(self.base_url))
            return None

        logger.debug('server {0} is up'.format(self.base_url))
        url = "{0}{1}".format(self.base_url, self.uri)
        logger.debug('scihub url {0}'.format(url))
        res = self.session.get(url, verify=False)
        logger.debug('Scraping scihub site')
        logger.debug('trying to get doi')
        self.doi = doi.find_doi_in_text(res.content.decode('utf8')) or ''
        if self.doi:
            logger.info('found a doi candidate {0}'.format(self.doi))
        s = BeautifulSoup(res.content, 'html.parser')
        iframe = s.find('iframe')
        if not iframe:
            return None

        logger.debug('iframe found in scihub\'s html')
        # Look the attribute up once; an iframe without ``src`` would
        # otherwise fail on ``None.startswith``.
        src = iframe.get('src')
        if src is None:
            return None
        if src.startswith('//'):
            return 'https:' + src
        return src
Beispiel #4
0
 def fetch(self) -> None:
     """Import through the DOI contained in ``self.uri``, if there is one.

     When a DOI is found, an ``Importer`` is created for it, run, and its
     context is adopted as ``self.ctx``. Without a DOI nothing happens.
     """
     found = doi.find_doi_in_text(self.uri)
     if found is not None:
         delegate = Importer(uri=found)
         delegate.fetch()
         self.ctx = delegate.ctx
Beispiel #5
0
 def get_doi(self) -> Optional[str]:
     """Return the DOI for this document, or ``None`` if there is none.

     A ``'doi'`` entry in ``self.ctx.data`` takes precedence: it is
     returned as a string, or ``None`` when the stored value is falsy.
     Otherwise the soup from ``self._get_soup()`` is searched for a DOI.
     """
     metadata = self.ctx.data
     if metadata and 'doi' in metadata:
         stored = metadata['doi']
         if stored:
             return str(stored)
         return None
     page = self._get_soup()
     self.logger.info('Trying to parse doi from url body...')
     if not page:
         return None
     return doi.find_doi_in_text(str(page))
Beispiel #6
0
 def match(cls, uri: str) -> Optional[papis.downloaders.Downloader]:
     """Return a ``Downloader`` for ``uri`` if it contains a DOI, else ``None``."""
     return Downloader(uri) if doi.find_doi_in_text(uri) else None
Beispiel #7
0
 def fetch(self):
     """Import through the DOI contained in ``self.uri``, if there is one.

     When a DOI is found, an ``Importer`` is created for it, run, and its
     context is adopted as ``self.ctx``. When no DOI is found the method
     returns without touching ``self.ctx``, so that ``Importer`` is never
     built with an empty uri.
     """
     found_doi = doi.find_doi_in_text(self.uri)
     if not found_doi:
         # Nothing to import from: the uri carries no DOI.
         return
     importer = Importer(uri=found_doi)
     importer.fetch()
     self.ctx = importer.ctx
Beispiel #8
0
 def match(cls, uri):
     """Return a ``Downloader`` for ``uri`` when a DOI is found in it.

     Returns ``None`` when ``uri`` contains no DOI.
     """
     if not doi.find_doi_in_text(uri):
         return None
     return Downloader(uri)
Beispiel #9
0
 def get_doi(self):
     """Return the DOI for this document, or ``None`` if there is none.

     A ``'doi'`` entry in ``self.ctx.data`` takes precedence and is
     returned unchanged. Otherwise the soup from ``self._get_soup()`` is
     searched for a DOI; when no soup is available ``None`` is returned.
     """
     data = self.ctx.data
     # Guard against missing data so the membership test cannot fail.
     if data and 'doi' in data:
         return data['doi']
     soup = self._get_soup()
     self.logger.info('trying to parse doi...')
     if not soup:
         # Without this check the literal text 'None' would be searched.
         return None
     return doi.find_doi_in_text(str(soup))