# Shared imports for the resolvers below (Python 2 modules: urllib2,
# urlparse, StringIO; lxml for HTML parsing; gzip for compressed pages).
import gzip
import urllib2
import urlparse
from StringIO import StringIO

from lxml import etree

import utopia.citation
import utopia.tools.arxiv
import utopia.tools.doi
import utopia.tools.title


def resolve(self, citations, document=None):
    citation = {}
    if not utopia.citation.has_link(citations, {'mime': 'application/pdf'},
                                    {'whence': 'wiley'}):
        pdf_links = utopia.citation.filter_links(
            citations, {'mime': 'application/pdf'})
        for link in pdf_links:
            url = link['url']
            if 'onlinelibrary.wiley.com' in url:
                parser = etree.HTMLParser()
                resource = urllib2.urlopen(url, timeout=12)
                html = resource.read()
                dom = etree.parse(StringIO(html), parser)
                # look for the PDF link
                download_pdf_urls = dom.xpath(
                    '//iframe[@id="pdfDocument"]/@src')
                for pdf_url in download_pdf_urls:
                    pdf_url = urlparse.urljoin(url, pdf_url)
                    # Check for cyclic references
                    if pdf_url != resource.geturl():
                        citation.setdefault('links', [])
                        citation['links'].append({
                            'url': pdf_url,
                            'mime': 'application/pdf',
                            'type': 'article',
                            'title': 'Download article from Wiley',
                        })
    return citation
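# For context: a sketch of how a resolve() method like the one above would
# be packaged as a plugin. The class name and the Resolver base class with
# a provenance() hook are assumptions about the Utopia Documents plugin API,
# not shown in this file; the point is that provenance() is what supplies
# the {'whence': 'wiley'} tag tested by the has_link() guard above.
class WileyResolver(utopia.citation.Resolver):  # base class is an assumption
    """Hypothetical wrapper for the Wiley resolve() method above."""

    # resolve() as defined above would live here

    def provenance(self):
        # matches the {'whence': 'wiley'} spec checked by has_link()
        return {'whence': 'wiley'}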
def resolve(self, citations, document=None):
    citation = {}
    if not utopia.citation.has_link(citations, {'mime': 'application/pdf'},
                                    {'whence': 'acs'}):
        resolved_links = utopia.citation.filter_links(
            citations, {'resolved_url': None})
        for link in resolved_links:
            url = link['resolved_url']
            if 'pubs.acs.org' in url:
                parser = etree.HTMLParser()
                resource = urllib2.urlopen(url, timeout=12)
                html = resource.read()
                dom = etree.parse(StringIO(html), parser)
                # look for the PDF link
                download_pdf_urls = dom.xpath(
                    '//div[@class="bottomViewLinks"]/a[text()="PDF"]/@href')
                for pdf_url in download_pdf_urls:
                    pdf_url = urlparse.urljoin(url, pdf_url)
                    # Check for cyclic references
                    if pdf_url != resource.geturl():
                        citation.setdefault('links', [])
                        citation['links'].append({
                            'url': pdf_url,
                            'mime': 'application/pdf',
                            'type': 'article',
                            'title': 'Download article from ACS',
                        })
    return citation
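# Every resolver here passes the scraped @href/@src values through
# urlparse.urljoin before storing them, since pages routinely emit relative
# links. A standalone illustration (the URLs are made up):
page_url = 'http://pubs.acs.org/doi/abs/example'
print(urlparse.urljoin(page_url, '/doi/pdf/example'))
# http://pubs.acs.org/doi/pdf/example        (root-relative href)
print(urlparse.urljoin(page_url, 'pdf/example'))
# http://pubs.acs.org/doi/abs/pdf/example    (document-relative href)
print(urlparse.urljoin(page_url, 'http://elsewhere.org/x.pdf'))
# http://elsewhere.org/x.pdf                 (absolute href passes through)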
def resolve(self, citations, document=None):
    citation = {}
    if document is not None:
        # First try to scrape the title
        if 'title' not in citations:
            title = utopia.tools.title.scrape(document)
            if title is not None:
                print('scraper: title: ' + title.encode('utf8'))
                citation['title'] = title
        # Then look for a DOI and an arXiv ID
        ids = utopia.citation.pick_from(citations, 'identifiers', {})
        if 'doi' not in ids:
            doi = utopia.tools.doi.scrape(document)
            if doi is not None:
                print('scraper: doi: ' + doi.encode('utf8'))
                citation.setdefault('identifiers', {})
                citation['identifiers']['doi'] = doi
        if 'arxiv' not in ids:
            arxivid = utopia.tools.arxiv.scrape(document)
            if arxivid is not None:
                print('scraper: arxivid: ' + arxivid.encode('utf8'))
                citation.setdefault('identifiers', {})
                citation['identifiers']['arxiv'] = arxivid
    return citation
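# Shape of the dict the scraper above returns when everything is found;
# the values here are illustrative only, not real identifiers:
example_scraped_citation = {
    'title': 'An Example Article Title',
    'identifiers': {
        'doi': '10.1000/example.123',
        'arxiv': '1234.56789',
    },
}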
def resolve(self, citations, document=None):
    citation = {}
    if not utopia.citation.has_link(citations, {'mime': 'application/pdf'},
                                    {'whence': 'nature'}):
        resolved_links = utopia.citation.filter_links(
            citations, {'resolved_url': None})
        for link in resolved_links:
            url = link['resolved_url']
            if 'www.nature.com' in url:
                parser = etree.HTMLParser()
                resource = urllib2.urlopen(url, timeout=12)
                html = resource.read()
                dom = etree.parse(StringIO(html), parser)
                # look for the PDF link
                download_pdf_urls = dom.xpath(
                    '//li[@class="download-pdf"]/a/@href')
                for pdf_url in download_pdf_urls:
                    pdf_url = urlparse.urljoin(url, pdf_url)
                    # Check for cyclic references
                    if pdf_url != resource.geturl():
                        citation.setdefault('links', [])
                        citation['links'].append({
                            'url': pdf_url,
                            'mime': 'application/pdf',
                            'type': 'article',
                            'title': 'Download article from Nature',
                        })
                # look for the supplementary PDF link(s)
                for supp in dom.xpath(
                        '//div[@id="supplementary-information"]//dl'):
                    # search relative to this <dl> ('.//'), not the whole
                    # document ('//'), so each block yields only its own links
                    download_supp_pdf_urls = supp.xpath('.//dt/a/@href')
                    for pdf_url in download_supp_pdf_urls:
                        pdf_url = urlparse.urljoin(url, pdf_url)
                        # Check for cyclic references
                        if pdf_url != resource.geturl():
                            citation.setdefault('links', [])
                            citation['links'].append({
                                'url': pdf_url,
                                'mime': 'application/pdf',
                                'type': 'supplementary',
                                'title': 'Download supplementary information from Nature',
                            })
    return citation
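# The supplementary-information loop above relies on the './/' prefix: in
# lxml, an XPath starting with '//' searches the whole document even when
# evaluated on an element. A minimal demonstration with made-up markup:
supp_root = etree.fromstring(
    '<div><dl><dt><a href="a.pdf"/></dt></dl>'
    '<dl><dt><a href="b.pdf"/></dt></dl></div>')
first_dl = supp_root.xpath('//dl')[0]
print(first_dl.xpath('//dt/a/@href'))   # ['a.pdf', 'b.pdf'] - whole document
print(first_dl.xpath('.//dt/a/@href'))  # ['a.pdf'] - this <dl> only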
def resolve(self, citations, document=None):
    citation = {}
    if not utopia.citation.has_link(citations, {'mime': 'application/pdf'},
                                    {'whence': 'ieee'}):
        resolved_links = utopia.citation.filter_links(
            citations, {'resolved_url': None})
        for link in resolved_links:
            url = link['resolved_url']
            if 'ieeexplore.ieee.org' in url:
                parser = etree.HTMLParser()
                resource = urllib2.urlopen(url, timeout=12)
                html = resource.read()
                dom = etree.parse(StringIO(html), parser)
                # look for the PDF link
                download_pdf_urls = dom.xpath(
                    '//a[@id="full-text-pdf"]/@href')
                for pdf_url in download_pdf_urls:
                    pdf_url = urlparse.urljoin(url, pdf_url)
                    # Check for cyclic references
                    if pdf_url != resource.geturl():
                        # follow the link and find the frame that actually
                        # holds the PDF
                        resource = urllib2.urlopen(pdf_url, timeout=12)
                        html = resource.read()
                        dom = etree.parse(StringIO(html), parser)
                        download_pdf_urls = dom.xpath(
                            "//frame[contains(@src, 'pdf')]/@src")
                        for pdf_url in download_pdf_urls:
                            pdf_url = urlparse.urljoin(url, pdf_url)
                            citation.setdefault('links', [])
                            citation['links'].append({
                                'url': pdf_url,
                                'mime': 'application/pdf',
                                'type': 'article',
                                'title': 'Download article from IEEEXplore',
                            })
    return citation
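# The Wiley and IEEE resolvers share the same two-step shape: fetch an HTML
# landing page, then pull the real PDF URL out of a frame or iframe. A
# possible refactoring of that shared pattern into one helper; the function
# name and its use here are suggestions, not part of the plugins above:
def scrape_pdf_urls(url, xpath, timeout=12):
    """Fetch url and return the absolute URLs matched by xpath,
    skipping any that merely point back at the fetched page."""
    resource = urllib2.urlopen(url, timeout=timeout)
    dom = etree.parse(StringIO(resource.read()), etree.HTMLParser())
    return [absolute for absolute in
            (urlparse.urljoin(url, src) for src in dom.xpath(xpath))
            if absolute != resource.geturl()]

# e.g. scrape_pdf_urls(url, '//iframe[@id="pdfDocument"]/@src') for Wiley,
# or scrape_pdf_urls(url, "//frame[contains(@src, 'pdf')]/@src") for the
# second IEEE request.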
def resolve(self, citations, document=None):
    citation = {}
    if not utopia.citation.has_link(citations, {'mime': 'application/pdf'},
                                    {'whence': 'html'}):
        article_links = utopia.citation.filter_links(
            citations, {'type': 'article', 'mime': 'text/html'})
        for article_link in article_links:
            url = article_link['url']
            parser = etree.HTMLParser()
            try:
                # advertise gzip support; decompressed below if the server
                # actually uses it
                request = urllib2.Request(
                    url, headers={'Accept-Encoding': 'gzip'})
                resource = urllib2.urlopen(request, timeout=12)
            except urllib2.HTTPError as e:
                # paywalled pages often respond 401 but still carry useful
                # metadata in the error page body
                if e.getcode() == 401:
                    resource = e
                else:
                    raise
            html = resource.read()
            if resource.info().get('Content-Encoding') == 'gzip':
                html = gzip.GzipFile(fileobj=StringIO(html)).read()
            # FIXME should modification of previous citations be allowed?
            article_link['resolved_url'] = resource.geturl()
            dom = etree.parse(StringIO(html), parser)
            # look for the Highwire/Google Scholar PDF meta tag
            citation_pdf_urls = dom.xpath(
                '/html/head/meta[@name="citation_pdf_url"]/@content')
            for pdf_url in citation_pdf_urls:
                # Check for cyclic references
                if pdf_url != resource.geturl():
                    citation.setdefault('links', [])
                    citation['links'].append({
                        'url': pdf_url,
                        'mime': 'application/pdf',
                        'type': 'article',
                        'title': 'Download article',
                    })
    return citation
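# What the XPath in the resolver above matches: the Highwire-style
# citation_pdf_url meta tag that many publishers embed for indexers such as
# Google Scholar. A self-contained check against a made-up page:
meta_html = ('<html><head><meta name="citation_pdf_url" '
             'content="http://example.org/article.pdf"/></head>'
             '<body/></html>')
meta_dom = etree.parse(StringIO(meta_html), etree.HTMLParser())
print(meta_dom.xpath('/html/head/meta[@name="citation_pdf_url"]/@content'))
# ['http://example.org/article.pdf']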