Example #1
0
 def resolve(self, citations, document=None):
     """Resolve a direct PDF download link for Wiley Online Library pages.

     Only runs when the citations do not already carry a PDF link from
     'wiley'.  Fetches each candidate PDF-typed link on
     onlinelibrary.wiley.com and extracts the real PDF URL from the
     page's pdfDocument iframe.

     Returns a citation dict with a 'links' list (possibly empty dict if
     nothing was found).
     """
     citation = {}
     if not utopia.citation.has_link(citations, {'mime': 'application/pdf'},
                                     {'whence': 'wiley'}):
         pdf_links = utopia.citation.filter_links(
             citations, {'mime': 'application/pdf'})
         for link in pdf_links:
             url = link['url']
             if 'onlinelibrary.wiley.com' in url:
                 parser = etree.HTMLParser()
                 resource = urllib2.urlopen(url, timeout=12)
                 try:
                     html = resource.read()
                     final_url = resource.geturl()
                 finally:
                     resource.close()  # FIX: response was never closed
                 dom = etree.parse(StringIO(html), parser)
                 # look for the PDF link (Wiley embeds it in an iframe)
                 download_pdf_urls = dom.xpath(
                     '//iframe[@id="pdfDocument"]/@src')
                 for pdf_url in download_pdf_urls:
                     pdf_url = urlparse.urljoin(url, pdf_url)
                     if pdf_url != final_url:  # Check for cyclic references
                         citation.setdefault('links', [])
                         citation['links'].append({
                             'url': pdf_url,
                             'mime': 'application/pdf',
                             'type': 'article',
                             'title': 'Download article from Wiley',
                         })
     return citation
    def resolve(self, citations, document=None):
        """Resolve a direct PDF download link for ACS (pubs.acs.org) pages.

        Only runs when the citations do not already carry a PDF link from
        'acs'.  Fetches each resolved pubs.acs.org URL and extracts the
        "PDF" anchor from the page's bottomViewLinks section.

        Returns a citation dict with a 'links' list (possibly empty dict
        if nothing was found).
        """
        citation = {}
        if not utopia.citation.has_link(citations, {'mime': 'application/pdf'},
                                        {'whence': 'acs'}):
            resolved_links = utopia.citation.filter_links(
                citations, {'resolved_url': None})
            for link in resolved_links:
                url = link['resolved_url']
                if 'pubs.acs.org' in url:
                    parser = etree.HTMLParser()
                    resource = urllib2.urlopen(url, timeout=12)
                    try:
                        html = resource.read()
                        final_url = resource.geturl()
                    finally:
                        resource.close()  # FIX: response was never closed
                    dom = etree.parse(StringIO(html), parser)

                    # look for the PDF link
                    download_pdf_urls = dom.xpath(
                        '//div[@class="bottomViewLinks"]/a[text()="PDF"]/@href'
                    )
                    for pdf_url in download_pdf_urls:
                        pdf_url = urlparse.urljoin(url, pdf_url)
                        if pdf_url != final_url:  # Check for cyclic references
                            citation.setdefault('links', [])
                            citation['links'].append({
                                'url': pdf_url,
                                'mime': 'application/pdf',
                                'type': 'article',
                                'title': 'Download article from ACS',
                            })
        return citation
Example #3
0
 def resolve(self, citations, document=None):
     """Scrape metadata (title, DOI, arXiv ID) directly from the document.

     Only fills in fields the existing citation data does not already
     provide.  Returns a dict with any newly found 'title' and/or
     'identifiers' entries (empty when nothing was scraped or no
     document was supplied).
     """
     citation = {}
     if document is not None:
         # First try to scrape the title
         if 'title' not in citations:
             title = utopia.tools.title.scrape(document)
             if title is not None:
                 # title is known non-None here, so the former
                 # '(title and title.encode(...))' guard was redundant
                 print('scraper: title: ' + title.encode('utf8'))
                 citation['title'] = title
         # Then look for a DOI and ArXiv ID
         ids = utopia.citation.pick_from(citations, 'identifiers', {})
         if 'doi' not in ids:
             doi = utopia.tools.doi.scrape(document)
             if doi is not None:
                 print('scraper: doi:' + doi.encode('utf8'))
                 citation.setdefault('identifiers', {})
                 citation['identifiers']['doi'] = doi
         if 'arxiv' not in ids:
             arxivid = utopia.tools.arxiv.scrape(document)
             if arxivid is not None:
                 print('scraper: arxivid:' + arxivid.encode('utf8'))
                 citation.setdefault('identifiers', {})
                 citation['identifiers']['arxiv'] = arxivid
     return citation
 def resolve(self, citations, document=None):
     """Resolve PDF and supplementary-information links on nature.com.

     Only runs when the citations do not already carry a PDF link from
     'nature'.  Fetches each resolved www.nature.com URL, extracts the
     article PDF from the download-pdf list item, and any supplementary
     PDFs from the supplementary-information section.

     Returns a citation dict with a 'links' list (possibly empty dict if
     nothing was found).
     """
     citation = {}
     if not utopia.citation.has_link(citations, {'mime': 'application/pdf'},
                                     {'whence': 'nature'}):
         resolved_links = utopia.citation.filter_links(
             citations, {'resolved_url': None})
         for link in resolved_links:
             url = link['resolved_url']
             if 'www.nature.com' in url:
                 parser = etree.HTMLParser()
                 resource = urllib2.urlopen(url, timeout=12)
                 try:
                     html = resource.read()
                     final_url = resource.geturl()
                 finally:
                     resource.close()  # FIX: response was never closed
                 dom = etree.parse(StringIO(html), parser)
                 # look for the PDF link
                 download_pdf_urls = dom.xpath(
                     '//li[@class="download-pdf"]/a/@href')
                 for pdf_url in download_pdf_urls:
                     pdf_url = urlparse.urljoin(url, pdf_url)
                     if pdf_url != final_url:  # Check for cyclic references
                         citation.setdefault('links', [])
                         citation['links'].append({
                             'url': pdf_url,
                             'mime': 'application/pdf',
                             'type': 'article',
                             'title': 'Download article from Nature',
                         })
                 # look for the supplementary PDF link(s)
                 for supp in dom.xpath(
                         '//div[@id="supplementary-information"]//dl'):
                     # FIX: use a relative XPath ('.//') so only anchors
                     # inside this <dl> match; the absolute '//dt/a/@href'
                     # re-scanned the whole document on every iteration
                     download_supp_pdf_urls = supp.xpath('.//dt/a/@href')
                     for pdf_url in download_supp_pdf_urls:
                         pdf_url = urlparse.urljoin(url, pdf_url)
                         if pdf_url != final_url:  # cyclic reference check
                             citation.setdefault('links', [])
                             citation['links'].append({
                                 'url': pdf_url,
                                 'mime': 'application/pdf',
                                 'type': 'supplementary',
                                 'title': 'Download supplementary information from Nature',
                             })
     return citation
    def resolve(self, citations, document=None):
        """Resolve a direct PDF download link for IEEEXplore pages.

        Only runs when the citations do not already carry a PDF link from
        'ieee'.  IEEEXplore serves the PDF inside a frameset, so this
        follows the "full-text-pdf" anchor first and then extracts the
        actual PDF URL from the viewer page's frame src.

        Returns a citation dict with a 'links' list (possibly empty dict
        if nothing was found).
        """
        citation = {}
        if not utopia.citation.has_link(citations, {'mime': 'application/pdf'},
                                        {'whence': 'ieee'}):
            resolved_links = utopia.citation.filter_links(
                citations, {'resolved_url': None})
            for link in resolved_links:
                url = link['resolved_url']
                if 'ieeexplore.ieee.org' in url:
                    parser = etree.HTMLParser()
                    resource = urllib2.urlopen(url, timeout=12)
                    try:
                        html = resource.read()
                        final_url = resource.geturl()
                    finally:
                        resource.close()  # FIX: response was never closed
                    dom = etree.parse(StringIO(html), parser)

                    # look for the PDF link
                    download_pdf_urls = dom.xpath(
                        '//a[@id="full-text-pdf"]/@href')
                    for pdf_url in download_pdf_urls:
                        pdf_url = urlparse.urljoin(url, pdf_url)
                        if pdf_url != final_url:  # Check for cyclic references
                            # follow the link and find the frame src;
                            # distinct names avoid shadowing the outer
                            # loop's variables
                            frame_resource = urllib2.urlopen(pdf_url,
                                                             timeout=12)
                            try:
                                frame_html = frame_resource.read()
                            finally:
                                frame_resource.close()  # FIX: was leaked
                            frame_dom = etree.parse(StringIO(frame_html),
                                                    parser)

                            # the viewer page embeds the real PDF in a frame
                            frame_pdf_urls = frame_dom.xpath(
                                "//frame[contains(@src, 'pdf')]/@src")
                            for frame_pdf_url in frame_pdf_urls:
                                frame_pdf_url = urlparse.urljoin(
                                    url, frame_pdf_url)
                                citation.setdefault('links', [])
                                citation['links'].append({
                                    'url': frame_pdf_url,
                                    'mime': 'application/pdf',
                                    'type': 'article',
                                    'title': 'Download article from IEEEXplore',
                                })
        return citation
    def resolve(self, citations, document=None):
        """Generic fallback: scrape a citation_pdf_url meta tag from HTML.

        Only runs when the citations do not already carry a PDF link from
        'html'.  Fetches each article-typed text/html link, records its
        resolved URL, and looks for the Highwire Press / Google Scholar
        PDF meta tag in the page head.

        Returns a citation dict with a 'links' list (possibly empty dict
        if nothing was found).  Re-raises urllib2.HTTPError for any HTTP
        error other than 401.
        """
        citation = {}
        if not utopia.citation.has_link(citations, {'mime': 'application/pdf'},
                                        {'whence': 'html'}):
            article_links = utopia.citation.filter_links(
                citations, {
                    'type': 'article',
                    'mime': 'text/html'
                })
            for article_link in article_links:
                url = article_link['url']
                parser = etree.HTMLParser()
                try:
                    # NOTE(review): 'Accept-Content' is not a standard HTTP
                    # header (the standard one is 'Accept-Encoding'), but
                    # "correcting" it would require gunzipping the body
                    # below — left unchanged on purpose; confirm intent.
                    request = urllib2.Request(
                        url, headers={'Accept-Content': 'gzip'})
                    resource = urllib2.urlopen(request, timeout=12)
                except urllib2.HTTPError as e:
                    if e.getcode() == 401:
                        # Unauthorized (paywall): the error page body may
                        # still carry useful meta tags; HTTPError is
                        # file-like and can be read like a response
                        resource = e
                    else:
                        raise

                try:
                    html = resource.read()
                    final_url = resource.geturl()
                finally:
                    resource.close()  # FIX: response was never closed
                article_link['resolved_url'] = final_url  # FIXME should modification of previous citations be allowed?
                dom = etree.parse(StringIO(html), parser)

                # look for the PDF link
                # FIX: the standard meta tag name is 'citation_pdf_url'
                # (singular); 'citations_pdf_url' matched nothing
                citation_pdf_urls = dom.xpath(
                    '/html/head/meta[@name="citation_pdf_url"]/@content')
                for pdf_url in citation_pdf_urls:
                    if pdf_url != final_url:  # Check for cyclic references
                        citation.setdefault('links', [])
                        citation['links'].append({
                            'url': pdf_url,
                            'mime': 'application/pdf',
                            'type': 'article',
                            'title': 'Download article',
                        })
        return citation