def after_load_event(self, document):
    '''
    Provide formatted citations for the current document. This should be done
    quite late in the process, in case previous handlers have enriched the
    metadata.
    '''
    # Start by getting all the best-trusted metadata for this document
    metadata = {}
    for key in self.properties + self.identifiers:
        value = common.utils.metadata(document, key)
        if value is not None:
            if key[-2:] == '[]':
                key = key[:-2]
            metadata[key] = value

    # Only if there's some metadata to display FIXME
    if len(metadata) > 0:
        # Now create a citation formatter annotation for the sidebar
        annotation = spineapi.Annotation()
        annotation['concept'] = 'CitationFormatter'
        annotation['property:json'] = json.dumps(metadata)
        annotation['property:name'] = 'Formatted Citation'
        annotation['property:description'] = "How to cite this document"
        annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/csl.png', 'image/png')
        annotation['property:sourceDescription'] = '''
            Uses <a href="https://bitbucket.org/fbennett/citeproc-js/wiki/Home">citeproc-js</a>
            to format the citation.
        '''
        annotation['session:weight'] = '10'
        annotation['session:default'] = '1'
        annotation['session:volatile'] = '1'
        document.addAnnotation(annotation)

def on_load_event(self, document):
    outline = {}
    for a in document.annotations():
        if a.get('concept') == 'OutlineItem':
            outline[tuple([int(x) for x in a.get('property:outlinePosition').split('.')])] = a
    if len(outline):
        # html = '<div><style media="screen" type="text/css">ul { list-style-type: none; }</style><ul>'
        html = '<div><ul>'
        plen = 1
        for item in sorted(outline.keys()):
            if len(item) > plen:
                html += '<ul><li>'
            elif len(item) < plen:
                html += '</li></ul></li><li>'
            else:
                html += '</li><li>'
            plen = len(item)
            html += '<a href="#" title="{0}" target="pdf; anchor={0}">{1}</a>'.format(
                outline[item].get('property:destinationAnchorName'),
                cgi.escape(outline[item].get('property:outlineTitle'), quote=True).encode('ascii', 'xmlcharrefreplace'))
        html += '</ul></div>'
        a = spineapi.Annotation()
        a['concept'] = 'Collated'
        a['property:name'] = 'Outline'
        a['property:description'] = 'Document Structure'
        a['session:weight'] = '10000'
        a['property:html'] = html
        document.addAnnotation(a)

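# Worked example (illustrative only, not part of the plugin): for OutlineItem
# positions "1", "1.1", "1.2", "2", the loop above visits the keys
# (1,), (1, 1), (1, 2), (2,) in sorted order and, with the <a> markup elided and
# placeholder titles A, A.1, A.2, B, builds:
#
#     <div><ul></li><li>A<ul><li>A.1</li><li>A.2</li></ul></li><li>B</ul></div>
#
# i.e. nesting depth follows the length of each position tuple; browsers tolerate
# the stray leading </li> emitted on the first iteration.
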
def on_ready_event(self, document):
    doi = utopia.tools.utils.metadata(document, 'identifiers[doi]')
    if doi is not None:
        try:
            # Check to see if the DOI is known
            url = 'http://api.altmetric.com/{0}/doi/{2}?key={1}'.format(self.api_version, self.key, doi)
            data = urllib2.urlopen(url, timeout=8).read()
            json.loads(data)  # Just check this is possible - throws exception otherwise
            a = spineapi.Annotation()
            a['concept'] = 'Altmetric'
            a['property:doi'] = doi
            a['property:json'] = data
            a['property:name'] = 'Altmetric'
            a['property:description'] = 'Who is talking about this article?'
            a['property:sourceDatabase'] = 'altmetric'
            a['property:sourceDescription'] = '<p>Discover, track and analyse online activity related to this article with <a href="http://www.altmetric.com/">Altmetric</a>.</p>'
            a['session:weight'] = '1'
            a['session:default'] = '1'
            document.addAnnotation(a)
        except (urllib2.URLError, socket.timeout):
            pass

def on_activate_event(self, document):
    if len(document.annotations('GPCRDB cache')) == 0:
        print 'annotating stuff . . .'
        pubmedId = utopialib.utils.metadata(document, 'identifiers[pubmed]')
        if pubmedId is not None:
            print 'found pubmed id: ' + pubmedId
        else:
            print 'did not find pubmed id'
        ns = {'r': 'GPCR'}
        textMentions = self.getMentions(document.text(), pubmedId)
        objectlist = []
        mention_cache = {}
        for mention in textMentions:
            if mention.mentionType != 'SPECIES':
                mention_cache.setdefault(mention.html, [])
                mention_cache[mention.html].append(mention)
        for html, mentions in mention_cache.iteritems():
            annotation = self.createAnnotation(document, html, mentions)
            annotation['displayRelevance'] = '2000'
            annotation['displayRank'] = '2000'
            document.addAnnotation(annotation)
        document.addAnnotation(spineapi.Annotation(), 'GPCRDB cache')

def on_load_event(self, document):
    # Email links
    for match in document.search(self.email, spineapi.IgnoreCase + spineapi.WholeWordsOnly + spineapi.RegExp):
        if not areas_intersect(match.areas(), self.existing_areas):
            annotation = spineapi.Annotation()
            annotation['concept'] = 'Hyperlink'
            annotation['property:webpageUrl'] = 'mailto:%s' % match.text()
            annotation['session:volatile'] = '1'
            annotation.addExtent(match)
            document.addAnnotation(annotation)
        else:
            print 'ignoring clashing email link text:', match.text().encode('utf8')

    # HTTP(S) links
    for match in document.search(self.http, spineapi.IgnoreCase + spineapi.WholeWordsOnly + spineapi.RegExp):
        if not areas_intersect(match.areas(), self.existing_areas):
            if match.begin().lineArea()[1] == 0:  # Only while vertical links are rendered wrongly FIXME
                url = match.text()
                if not url.startswith('http'):
                    url = 'http://' + url
                annotation = spineapi.Annotation()
                annotation['concept'] = 'Hyperlink'
                annotation['property:webpageUrl'] = '%s' % url
                annotation['session:volatile'] = '1'
                annotation.addExtent(match)
                document.addAnnotation(annotation)
        else:
            print 'ignoring clashing http link text:', match.text().encode('utf8')

def after_ready_event(self, document):
    outline = {}
    for a in document.annotations():
        if a.get('concept') == 'OutlineItem':
            outline[tuple([int(x) for x in a.get('property:outlinePosition').split('.')])] = a
    if len(outline):
        # html = '<div><style media="screen" type="text/css">ul { list-style-type: none; }</style><ul>'
        html = '<div><ul>'
        plen = 1
        for item in sorted(outline.keys()):
            if len(item) > plen:
                html += '<ul><li>'
            elif len(item) < plen:
                html += '</li></ul></li><li>'
            else:
                html += '</li><li>'
            plen = len(item)
            html += '<a href="#" title="{0}" target="pdf; anchor={0}">{1}</a>'.format(
                outline[item].get('property:destinationAnchorName'),
                cgi.escape(outline[item].get('property:outlineTitle'), quote=True).encode('ascii', 'xmlcharrefreplace'))
        html += '</ul></div>'
        a = spineapi.Annotation()
        a['concept'] = 'Collated'
        a['property:name'] = 'Outline'
        a['property:description'] = 'Document Structure'
        a['session:weight'] = '999'
        a['property:html'] = html
        document.addAnnotation(a)

def on_load_event(self, document):
    # Email links
    for match in document.search(self.email, spineapi.IgnoreCase + spineapi.WholeWordsOnly + spineapi.RegExp):
        if not areas_intersect(match.areas(), self.existing_areas):
            annotation = spineapi.Annotation()
            annotation['concept'] = 'Hyperlink'
            annotation['property:webpageUrl'] = 'mailto:%s' % match.text()
            annotation['session:volatile'] = '1'
            annotation.addExtent(match)
            document.addAnnotation(annotation)
        else:
            print('ignoring clashing email link text:', match.text().encode('utf8'))

    # HTTP(S) links
    for match in document.search(self.http, spineapi.IgnoreCase + spineapi.WholeWordsOnly + spineapi.RegExp):
        if not areas_intersect(match.areas(), self.existing_areas):
            if match.begin().lineArea()[1] == 0:  # Only while vertical links are rendered wrongly FIXME
                url = match.text()
                if not url.startswith('http'):
                    url = 'http://' + url
                annotation = spineapi.Annotation()
                annotation['concept'] = 'Hyperlink'
                annotation['property:webpageUrl'] = '%s' % url
                annotation['session:volatile'] = '1'
                annotation.addExtent(match)
                document.addAnnotation(annotation)
        else:
            print('ignoring clashing http link text:', match.text().encode('utf8'))

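# Both hyperlink handlers above rely on an areas_intersect() helper that is not shown
# in this listing. The sketch below is a minimal, hypothetical version: it assumes each
# area is a (page, x1, y1, x2, y2) tuple, which is an assumption about the spineapi
# area structure rather than a documented fact.
def areas_intersect(areas, existing_areas):
    '''Return True if any rectangle in `areas` overlaps any rectangle in `existing_areas`.'''
    def overlaps(a, b):
        # Same page, and the rectangles overlap in both axes
        return (a[0] == b[0] and
                a[1] < b[3] and b[1] < a[3] and
                a[2] < b[4] and b[2] < a[4])
    return any(overlaps(a, b) for a in areas for b in existing_areas)
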
def add_success(component, method):
    # `document` is expected to be available from the enclosing scope (this helper
    # appears to be nested inside an event handler, like add_error below).
    success = spineapi.Annotation()
    success["concept"] = "Success"
    success["property:component"] = component
    success["property:method"] = method
    success["property:category"] = "success"
    document.addAnnotation(success, "errors.metadata")

def on_activate_event(self, document):
    if len(document.annotations('NucleaRDB cache')) == 0:
        print 'annotating stuff . . .'
        pubmedId = common.utils.metadata(document, 'pmid')
        if pubmedId is not None:
            print 'found pubmed id: ' + pubmedId
        else:
            print 'did not find pubmed id'
        ns = {'r': 'GPCR'}
        textMentions = self.getMentions(document.text(), pubmedId)
        objectlist = []
        mention_cache = {}
        for mention in textMentions:
            if mention.mentionType != 'SPECIES':
                mention_cache.setdefault(mention.html, [])
                mention_cache[mention.html].append(mention)
        for html, mentions in mention_cache.iteritems():
            annotation = self.createAnnotation(document, html, mentions)
            annotation['displayRelevance'] = '2000'
            annotation['displayRank'] = '2000'
            document.addAnnotation(annotation)
        document.addAnnotation(spineapi.Annotation(), 'NucleaRDB cache')

def on_ready_event(self, document):
    # See if there is any CrossMark information available for this document
    # Firstly find the document's DOI
    doi = common.utils.metadata(document, 'doi')
    if doi is not None:
        # Then attempt to access CrossMark API
        try:
            url = 'http://crossmark.crossref.org/crossmark/?doi={0}'.format(doi)
            headers = {'Accept': 'application/json'}
            request = urllib2.Request(url, None, headers)
            cm = json.loads(urllib2.urlopen(request, timeout=8).read())
        # Not found
        except urllib2.HTTPError as e:
            if e.code == 404:  # just ignore 404
                return
            raise

        # If successful, create an annotation to be visualised
        annotation = spineapi.Annotation()
        annotation['concept'] = 'CrossMarkNotice'
        annotation['property:doi'] = doi
        annotation['property:name'] = 'CrossMark'
        annotation['property:description'] = 'Information on updates, corrections and retractions'
        annotation['property:sourceDatabase'] = 'crossmark'
        annotation['property:sourceDescription'] = '<div><a href="http://www.crossref.org/crossmark/">CrossMark</a> gives scholars the information they need to verify that they are using the most recent and reliable versions of a document.</div>'
        document.addAnnotation(annotation)

def on_ready_event(self, document):
    # See if there is any CrossMark information available for this document
    # Firstly find the document's DOI
    doi = utopia.tools.utils.metadata(document, 'identifiers[doi]')
    if doi is not None:
        # Then attempt to access CrossMark API
        try:
            url = 'http://crossmark.crossref.org/crossmark/?doi={0}'.format(doi)
            headers = {'Accept': 'application/json'}
            request = urllib2.Request(url, None, headers)
            cm = json.loads(urllib2.urlopen(request, timeout=8).read())
        # Not found
        except urllib2.HTTPError as e:
            if e.code == 404:  # just ignore 404
                return
            raise

        # If successful, create an annotation to be visualised
        annotation = spineapi.Annotation()
        annotation['concept'] = 'CrossMarkNotice'
        annotation['property:doi'] = doi
        annotation['property:name'] = 'CrossMark'
        annotation['property:description'] = 'Information on updates, corrections and retractions'
        annotation['property:sourceDatabase'] = 'crossmark'
        annotation['property:sourceDescription'] = '<div><a href="http://www.crossref.org/crossmark/">CrossMark</a> gives scholars the information they need to verify that they are using the most recent and reliable versions of a document.</div>'
        document.addAnnotation(annotation)

def after_load_event(self, document):
    '''
    Provide formatted citations for the current document. This should be done
    quite late in the process, in case previous handlers have enriched the
    metadata.
    '''
    # Start by getting all the best-trusted metadata for this document
    metadata = {}
    for key in self.properties:
        value = utopia.tools.utils.metadata(document, key)
        if value is not None:
            if key[-2:] == '[]':
                key = key[:-2]
            metadata[key] = value

    # Only if there's some metadata to display FIXME
    if len(metadata) > 0:
        # Now create a citation formatter annotation for the sidebar
        annotation = spineapi.Annotation()
        annotation['concept'] = 'CitationFormatter'
        annotation['property:json'] = json.dumps(metadata)
        annotation['property:name'] = 'Formatted Citation'
        annotation['property:description'] = 'How to cite this document'
        annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/csl.png', 'image/png')
        annotation['property:sourceDescription'] = '''
            Uses <a href="https://bitbucket.org/fbennett/citeproc-js/wiki/Home">citeproc-js</a>
            to format the citation.
        '''
        annotation['session:weight'] = '10'
        annotation['session:default'] = '1'
        annotation['session:volatile'] = '1'
        document.addAnnotation(annotation)

def on_ready_event(self, document):
    logger.debug('calling citeproc populate')
    doi = common.utils.metadata(document, 'doi')
    crossref_unixref = common.utils.metadata(document, 'raw_crossref_unixref')

    # Only bother for those documents that returned a crossref document
    if doi is not None and crossref_unixref is not None:
        # load styles and locales here
        stylesJson = urllib2.urlopen(self.stylesUrl, timeout=8).read()
        logger.debug(stylesJson)
        localesJson = urllib2.urlopen(self.localesUrl, timeout=8).read()
        logger.debug(localesJson)

        a = spineapi.Annotation()
        a['concept'] = 'CiteProc'
        a['property:doi'] = doi
        a['property:text'] = self.loadingMsg
        a['property:styles'] = stylesJson
        a['property:locales'] = localesJson
        a['property:name'] = 'CrossRef'
        a['property:description'] = 'Formatted citation for this article'
        a['property:sourceDatabase'] = 'crossref'
        a['property:sourceDescription'] = '<p><a href="http://www.crossref.org/">CrossRef</a> is the official DOI link registration agency for scholarly and professional publications.</p>'
        a['session:weight'] = '10'
        a['session:default'] = '1'
        document.addAnnotation(a)

def on_ready_event(self, document):
    # Place a link on the document to test the Javascript messaging functionality
    # self.postToBus('bioprodict', 'prepare')
    username = self.get_config("username")
    password = self.get_config("password")
    if self.validUsernameAndPassword(username, password):
        try:
            databases = self.getAvailableDatabases(username, password)
            databaseIds = []
            databaseDescriptions = []
            for database in databases:
                databaseIds.append(database["databaseId"])
                databaseDescriptions.append(database["databaseDescription"])
            annotation = Annotation()
            annotation["concept"] = "Bio3DMInformation"
            annotation["property:name"] = "Bio-Prodict 3DM"
            annotation["property:html"] = "html"
            annotation["property:description"] = """Annotate this document with one of your 3DM systems"""
            annotation["property:databaseIds"] = "|".join(databaseIds)
            annotation["property:databaseDescriptions"] = "|".join(databaseDescriptions)
            annotation["property:sourceDatabase"] = "bioprodict"
            annotation["property:sourceDescription"] = '<p><a href="http://www.bio-prodict.nl">Bio-Prodict\'s</a> 3DM information systems provide protein family-specific annotations for this article</p>'
            # a.addExtent(document.substr(100, 300))
            document.addAnnotation(annotation)
        except WebFault as detail:
            print "Exception:", detail

def on_ready_event(self, document):
    username = self.get_config('username')
    password = self.get_config('password')
    if self.validUsernameAndPassword(username, password):
        # Get a new bearer token
        basic = 'Basic dXRvcGlhLXBsdWdpbjo='  # base64.encodestring('utopia-plugin:').replace('\n', '')
        data = dict(username=username, password=password, grant_type='password')
        content = post_for_json(self.tokenurl, basic, data)
        self.bearer = 'Bearer ' + content['access_token']
        self.proteinJs = self.proteinJs.replace('#TOKEN#', self.bearer)
        self.commonJs = self.commonJs.replace('#TOKEN#', self.bearer)

        # Get available databases for user
        databases = post_for_json(self.databasesurl, self.bearer)
        sorted_databases = sorted(databases.items(), key=lambda item: item[1])
        databaseIds = [item[0] for item in sorted_databases]
        databaseDescriptions = [item[1] for item in sorted_databases]

        annotation = Annotation()
        annotation['concept'] = 'Bio3DMInformation'
        annotation['property:name'] = 'Bio-Prodict 3DM'
        annotation['property:html'] = 'html'
        annotation['session:overlay'] = 'hyperlink'
        annotation['session:color'] = '#336611'
        annotation['property:description'] = '''Annotate using one of your 3DM systems'''
        annotation['property:databaseIds'] = '|'.join(databaseIds)
        annotation['property:databaseDescriptions'] = '|'.join(databaseDescriptions)
        annotation['property:sourceDatabase'] = 'bioprodict'
        annotation['property:sourceDescription'] = '<p><a href="http://www.bio-prodict.nl">Bio-Prodict\'s</a> 3DM information systems provide protein family-specific annotations for this article</p>'
        document.addAnnotation(annotation)

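# post_for_json() is used above but not defined in this listing. The sketch below is a
# minimal, hypothetical version matching the urllib2-based Python 2 idiom used
# throughout; the real helper's signature and error handling may differ.
def post_for_json(url, authorization, data=None):
    '''POST form-encoded `data` to `url` with an Authorization header and decode the JSON reply.'''
    body = urllib.urlencode(data) if data is not None else ''
    request = urllib2.Request(url, body, {'Authorization': authorization})
    return json.loads(urllib2.urlopen(request, timeout=8).read())
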
def on_activate_event(self, document, data={}):
    action = data.get('action')
    domain = data.get('domain')
    if self.annotatedDomains is None:
        self.annotatedDomains = []
    if action == 'annotate':
        print 'starting 3DM annotation . . .'
        pubmedId = utopia.tools.utils.metadata(document, 'identifiers[pubmed]')
        if pubmedId is None:
            pubmedId = '0'
        print 'sending text to remote server (' + pubmedId + '). . .'
        textMentions = self.getMentions(domain, document.text())
        print 'received response, adding annotations for domain ' + domain + ' . . .'
        mention_cache = {}
        for mention in textMentions:
            if mention['mentionType'] != 'SPECIES' and mention['mentionType'] != 'PDB':
                html, css, js = self.buildHtml(domain, mention)
                mention['html'] = html.encode('utf-8')
                mention['css'] = css.encode('utf-8')
                mention['js'] = js.encode('utf-8')
                mention_cache.setdefault(mention['html'], [])
                mention_cache[mention['html']].append(mention)
        for html, mentions in mention_cache.iteritems():
            annotation = self.createAnnotation(domain, document, html, mentions)
            annotation['displayRelevance'] = '2000'
            annotation['displayRank'] = '2000'
            document.addAnnotation(annotation)
        document.addAnnotation(Annotation(), domain)
        print 'done adding annotations.'

def on_ready_event(self, document):
    logger.debug('calling citeproc populate')
    doi = utopialib.utils.metadata(document, 'identifiers[doi]')
    crossref_unixref = utopialib.utils.metadata(document, 'raw_crossref_unixref')

    # Only bother for those documents that returned a crossref document
    if doi is not None and crossref_unixref is not None:
        # load styles and locales here
        stylesJson = urllib2.urlopen(self.stylesUrl, timeout=8).read()
        logger.debug(stylesJson)
        localesJson = urllib2.urlopen(self.localesUrl, timeout=8).read()
        logger.debug(localesJson)

        a = spineapi.Annotation()
        a['concept'] = 'CiteProc'
        a['property:doi'] = doi
        a['property:text'] = self.loadingMsg
        a['property:styles'] = stylesJson
        a['property:locales'] = localesJson
        a['property:name'] = 'CrossRef'
        a['property:description'] = 'Formatted citation for this article'
        a['property:sourceDatabase'] = 'crossref'
        a['property:sourceDescription'] = '<p><a href="http://www.crossref.org/">CrossRef</a> is the official DOI link registration agency for scholarly and professional publications.</p>'
        a['session:weight'] = '10'
        a['session:default'] = '1'
        document.addAnnotation(a)

def on_ready_event(self, document):
    # Find distinguishing ID
    pmid = common.utils.metadata(document, 'pmid')
    if pmid:
        print "Found pmid:", pmid
        for annotation in self.on_explore_event(phrase=pmid, document=document):
            annotation['property:description'] = 'Human genomic information related to this article'
            document.addAnnotation(annotation)

def on_ready_event(self, document):
    # Find distinguishing ID
    pmid = utopia.tools.utils.metadata(document, 'identifiers[pubmed]')
    if pmid:
        print "Found pmid:", pmid
        for annotation in self.on_explore_event(phrase=pmid, document=document):
            annotation['property:description'] = 'Human genomic information related to this article'
            document.addAnnotation(annotation)

def on_filter_event(self, document, data=None):
    for annotation in document.annotations():
        if annotation.get('concept') != 'DemoLogoOverlay' and annotation.get('property:demo_logo') == '1':
            annotation.removePropertyAll('property:demo_logo')
            overlay = spineapi.Annotation()
            overlay['concept'] = 'DemoLogoOverlay'
            overlay['property:demo_logo'] = '1'
            overlay.addExtents(annotation.extents())
            overlay.addAreas(annotation.areas())
            document.addAnnotation(overlay)

def on_persist_event(self, document):
    client = kend.client.Client()
    document_id = utopia.tools.utils.metadata(document, 'identifiers[utopia]')
    if document_id is not None:
        for annotation in document.annotations('PersistQueue'):
            if 'session:volatile' not in annotation:
                try:
                    ka = kend.converter.Annotation.spineapi2kend(annotation, document_id)
                    ka.context = self._context_
                    updated = client.persistAnnotation(ka, context=self._context_)
                    if isinstance(updated, kend.model.Annotation):
                        for key in ('id', 'created', 'author', 'revision', 'edit', 'media_edit'):
                            annotation[key] = getattr(updated, key)
                        annotation.removePropertyAll('session:media')
                        for media in updated.media:
                            mediaDict = {}
                            for k in ['name', 'src', 'type', 'revision', 'size', 'edit']:
                                if hasattr(media, k):
                                    mediaDict[k] = getattr(media, k)
                            annotation.insertProperty('session:media', urllib.urlencode(mediaDict))
                        document.removeAnnotation(annotation, 'PersistQueue')
                        document.addAnnotation(annotation)
                except:
                    raise
                    pass
        for annotation in document.annotations(document.deletedItemsScratchId()):
            try:
                if 'session:volatile' not in annotation:
                    ka = kend.converter.Annotation.spineapi2kend(annotation, document_id)
                    client.deleteAnnotation(ka)
                    document.removeAnnotation(annotation, document.deletedItemsScratchId())
                    document.removeAnnotation(annotation)
            except:
                raise
                pass

def on_ready_event(self, document):
    # Get resolved DOI
    doi = common.utils.metadata(document, 'doi', '')

    # Only for PLOS DOIs should this plugin do anything
    if doi.startswith('10.1371/'):
        # Record the publisher identity information
        annotation = spineapi.Annotation()
        annotation['concept'] = 'PublisherIdentity'
        annotation['property:logo'] = utopia.get_plugin_data_as_url('images/large_logo.jpg', 'image/jpg')
        annotation['property:title'] = 'PLOS'
        annotation['property:webpageUrl'] = 'http://www.plos.org/'
        document.addAnnotation(annotation, 'PublisherMetadata')

        # Attempt to get ALMs from PLOS API
        url = 'http://alm.plos.org/articles/{0}.json?{{0}}'.format(doi)
        query = {'api_key': self.api_key, 'events': '1', 'source': 'counter,pmc'}
        url = url.format(urllib.urlencode(query))
        try:
            alm_events = json.loads(urllib2.urlopen(url, timeout=8).read())
        # Not found
        except urllib2.HTTPError as e:
            if e.code == 404:  # just ignore 404
                return
            raise

        plos_pdf_views = 0
        plos_html_views = 0
        pmc_pdf_views = 0
        pmc_html_views = 0
        for source in alm_events.get('article', {}).get('source', []):
            if source.get('source') == 'Counter':
                events = source.get('events', [])
                plos_pdf_views, plos_html_views = reduce(
                    lambda accum, event: (accum[0] + int(event.get('pdf_views', 0)),
                                          accum[1] + int(event.get('html_views', 0))),
                    events, (0, 0))
            elif source.get('source') == 'PubMed Central Usage Stats':
                events = source.get('events', [])
                pmc_pdf_views, pmc_html_views = reduce(
                    lambda accum, event: (accum[0] + int(event.get('pdf', 0)),
                                          accum[1] + int(event.get('full-text', 0))),
                    events, (0, 0))

        annotation = spineapi.Annotation()
        annotation['concept'] = 'PLOSALMRecord'
        annotation['property:doi'] = doi
        annotation['property:name'] = 'PLOS'
        annotation['property:description'] = 'Download statistics'
        annotation['property:plos_pdf_views'] = plos_pdf_views
        annotation['property:plos_html_views'] = plos_html_views
        annotation['property:pmc_pdf_views'] = pmc_pdf_views
        annotation['property:pmc_html_views'] = pmc_html_views
        annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/small_logo.png', 'image/png')
        annotation['property:sourceDescription'] = '<p><a href="http://www.plos.org/">PLOS</a> article level metrics for downloads.</p>'
        document.addAnnotation(annotation)

def on_filter_event(self, document, data=None):
    for annotation in document.annotations():
        if annotation.get('concept') != 'DemoLogoOverlay' and annotation.get('property:demo_logo') == '1':
            annotation.removePropertyAll('property:demo_logo')
            overlay = spineapi.Annotation()
            overlay['concept'] = 'DemoLogoOverlay'
            overlay['property:demo_logo'] = '1'
            overlay.addExtents(annotation.extents())
            overlay.addAreas(annotation.areas())
            document.addAnnotation(overlay)

def on_ready_event(self, document):
    pmid = common.utils.metadata(document, 'pmid')
    if pmid is not None:
        xhtml = ''

        params = {
            'app_id': self.app_id,
            'app_key': self.app_key,
            'i': pmid,
        }
        url = '{0}/DocumentEntitiesService?{1}'.format(self.app_uri, urllib.urlencode(params))
        response = urllib2.urlopen(url, timeout=15).read()
        results = json.loads(response.decode('latin1'))
        if results['RESP_SYS_STATUS'] == 'STAT_OK' and 'RESP_PAYLOAD' in results and len(results['RESP_PAYLOAD']) > 0:
            xhtml += '<h2>Related entities</h2>'
            for entity in results['RESP_PAYLOAD']:
                xhtml += '<p><strong><a href="{0}">{1}</a></strong> ({2})</p>'.format(
                    entity['bestLink'], entity['entityName'], entity['entityTypeDisplay'])

        params = {
            'app_id': self.app_id,
            'app_key': self.app_key,
            'i': pmid,
            'n': '10',
        }
        url = '{0}/DocumentToNewsService?{1}'.format(self.app_uri, urllib.urlencode(params))
        response = urllib2.urlopen(url, timeout=15).read()
        results = json.loads(response.decode('latin1'))
        if results['RESP_SYS_STATUS'] == 'STAT_OK' and 'RESP_PAYLOAD' in results and len(results['RESP_PAYLOAD']) > 0:
            xhtml += '<h2>Related news</h2>'
            for bite in results['RESP_PAYLOAD']:
                xhtml += self.renderBite(bite)

        if len(xhtml) > 0:
            a = spineapi.Annotation()
            a['concept'] = 'SciBite'
            a['property:pmid'] = pmid
            a['property:name'] = 'SciBite'
            a['property:sourceDatabase'] = 'scibite'
            a['property:xhtml'] = xhtml
            a['property:description'] = 'Biomedical News & Intelligence'
            a['property:sourceDescription'] = '<p><a href="http://scibite.com/">SciBite</a> scans 1000s of papers, patents, blogs, newsfeeds and more to bring you daily alerts on critical topics in biomedicine.</p>'
            document.addAnnotation(a)

def on_ready_event(self, document):
    doi = utopialib.utils.metadata(document, 'identifiers[doi]')

    # Find and aggregate AGI instances in the document
    matches_by_agi = {}
    for match in document.search(self.agiRegex, spineapi.RegExp + spineapi.WholeWordsOnly):
        agi = match.text()
        matches_by_agi.setdefault(agi, [])
        matches_by_agi[agi].append(match)

    # For each AGI add a new bit of HTML
    if len(matches_by_agi) > 0:
        for agi, matches in matches_by_agi.iteritems():
            html = '''
                <p style="overflow: auto; width: 100%">
                  <strong>{0}</strong>
                  <span style="float: right">{1}</span>
                </p>
            '''
            annotation = spineapi.Annotation()
            annotation['concept'] = 'AGI'
            annotation['property:agi'] = agi
            annotation['property:name'] = 'Plant gene databases'
            annotation['property:description'] = 'American Society of Plant Biologists'
            annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/aspb_logo.png', 'image/png')
            if doi is not None:
                if doi.startswith('10.1104/'):
                    annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/pp_logo.png', 'image/png')
                    annotation['property:description'] = 'From Plant Physiology'
                elif doi.startswith('10.1105/'):
                    annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/tpc_logo.png', 'image/png')
                    annotation['property:description'] = 'From Plant Cell'
            annotation['property:sourceDescription'] = '''
                <p>
                  The <a href="http://www.aspb.org/">American Society of Plant Biologists</a>
                  have deemed these linked databases important sources of information.
                </p>
            '''
            annotation.addExtents(matches)
            document.addAnnotation(annotation)

def on_ready_event(self, document):
    issn = utopia.tools.utils.metadata(document, 'publication-issn')
    doi = utopia.tools.utils.metadata(document, 'identifiers[doi]')
    if issn is not None:
        params = {'versions': 'all', 'issn': issn, 'ak': self.apiKey}
        url = 'http://www.sherpa.ac.uk/romeo/api29.php?' + urllib.urlencode(params)
        srResponse = urllib2.urlopen(url, timeout=8)
        srData = srResponse.read()
        root = etree.fromstring(srData)
        #print etree.tostring(root, pretty_print=True, encoding='utf8')
        colour = root.find('publishers/publisher/romeocolour')
        if colour is not None:
            a = spineapi.Annotation()
            a['concept'] = 'SherpaRomeo'
            a['property:doi'] = doi
            a['property:name'] = 'Sherpa/RoMEO'
            a['property:sourceDatabase'] = 'sherparomeo'
            a['property:sourceDescription'] = '<p><a href="http://www.sherpa.ac.uk/romeo/">SHERPA/RoMEO</a> provides information about publisher copyright policies for this article.</p>'
            a['property:description'] = "Archiving status is '" + colour.text + "'."

            explanation = {}
            explanation['green'] = "the author can archive pre-print <em>and</em> post-print or publisher's version/PDF"
            explanation['blue'] = "the author can archive post-print (i.e. final draft post-refereeing) or publisher's version/PDF"
            explanation['yellow'] = "the author can archive pre-print (i.e. pre-refereeing)"
            explanation['white'] = "archiving of this article not formally supported"

            journalTitle = root.find('journals/journal/jtitle')
            publisherName = root.find('publishers/publisher/name')
            publisherURL = root.find('publishers/publisher/homeurl')

            xhtml = "<p>"
            xhtml = xhtml + 'This ' + journalTitle.text + ' article, published by <a href="' + publisherURL.text + '">' + publisherName.text + '</a>, is classified as being <a href="http://www.sherpa.ac.uk/romeo/definitions.php">RoMEO ' + colour.text + '</a>. '
            xhtml = xhtml + 'This means that ' + explanation[colour.text] + '.</p>'
            xhtml = xhtml + '<p>Other <a href="http://www.sherpa.ac.uk/romeo/issn/%s/">details and conditions</a> apply.</p>' % issn
            a['property:xhtml'] = xhtml
            document.addAnnotation(a)

def on_ready_event(self, document):
    issn = common.utils.metadata(document, 'issn')
    doi = common.utils.metadata(document, 'doi')
    if issn is not None:
        params = {'versions': 'all', 'issn': issn, 'ak': self.apiKey}
        url = 'http://www.sherpa.ac.uk/romeo/api29.php?' + urllib.urlencode(params)
        srResponse = urllib2.urlopen(url, timeout=8)
        srData = srResponse.read()
        root = etree.fromstring(srData)
        #print etree.tostring(root, pretty_print=True, encoding='utf8')
        colour = root.find('publishers/publisher/romeocolour')
        if colour is not None:
            a = spineapi.Annotation()
            a['concept'] = 'SherpaRomeo'
            a['property:doi'] = doi
            a['property:name'] = 'Sherpa/RoMEO'
            a['property:sourceDatabase'] = 'sherparomeo'
            a['property:sourceDescription'] = '<p><a href="http://www.sherpa.ac.uk/romeo/">SHERPA/RoMEO</a> provides information about publisher copyright policies for this article.</p>'
            a['property:description'] = "Archiving status is '" + colour.text + "'."

            explanation = {}
            explanation['green'] = "the author can archive pre-print <em>and</em> post-print or publisher's version/PDF"
            explanation['blue'] = "the author can archive post-print (i.e. final draft post-refereeing) or publisher's version/PDF"
            explanation['yellow'] = "the author can archive pre-print (i.e. pre-refereeing)"
            explanation['white'] = "archiving of this article not formally supported"

            journalTitle = root.find('journals/journal/jtitle')
            publisherName = root.find('publishers/publisher/name')
            publisherURL = root.find('publishers/publisher/homeurl')

            xhtml = "<p>"
            xhtml = xhtml + 'This ' + journalTitle.text + ' article, published by <a href="' + publisherURL.text + '">' + publisherName.text + '</a>, is classified as being <a href="http://www.sherpa.ac.uk/romeo/definitions.php">RoMEO ' + colour.text + '</a>. '
            xhtml = xhtml + 'This means that ' + explanation[colour.text] + '.</p>'
            xhtml = xhtml + '<p>Other <a href="http://www.sherpa.ac.uk/romeo/issn/%s/">details and conditions</a> apply.</p>' % issn
            a['property:xhtml'] = xhtml
            document.addAnnotation(a)

def on_ready_event(self, document):
    pmid = common.utils.metadata(document, 'pmid')
    if pmid is not None:
        xhtml = ''

        params = {
            'app_id': self.app_id,
            'app_key': self.app_key,
            'i': pmid,
        }
        url = '{0}/DocumentEntitiesService?{1}'.format(self.app_uri, urllib.urlencode(params))
        response = urllib2.urlopen(url, timeout=15).read()
        results = json.loads(response.decode('latin1'))
        if results['RESP_SYS_STATUS'] == 'STAT_OK' and 'RESP_PAYLOAD' in results and len(results['RESP_PAYLOAD']) > 0:
            xhtml += '<h2>Related entities</h2>'
            for entity in results['RESP_PAYLOAD']:
                xhtml += '<p><strong><a href="{0}">{1}</a></strong> ({2})</p>'.format(
                    entity['bestLink'], entity['entityName'], entity['entityTypeDisplay'])

        params = {
            'app_id': self.app_id,
            'app_key': self.app_key,
            'i': pmid,
            'n': '10',
        }
        url = '{0}/DocumentToNewsService?{1}'.format(self.app_uri, urllib.urlencode(params))
        response = urllib2.urlopen(url, timeout=15).read()
        results = json.loads(response.decode('latin1'))
        if results['RESP_SYS_STATUS'] == 'STAT_OK' and 'RESP_PAYLOAD' in results and len(results['RESP_PAYLOAD']) > 0:
            xhtml += '<h2>Related news</h2>'
            for bite in results['RESP_PAYLOAD']:
                xhtml += self.renderBite(bite)

        if len(xhtml) > 0:
            a = spineapi.Annotation()
            a['concept'] = 'SciBite'
            a['property:pmid'] = pmid
            a['property:name'] = 'SciBite'
            a['property:sourceDatabase'] = 'scibite'
            a['property:xhtml'] = xhtml
            a['property:description'] = 'Biomedical News & Intelligence'
            a['property:sourceDescription'] = '<p><a href="http://scibite.com/">SciBite</a> scans 1000s of papers, patents, blogs, newsfeeds and more to bring you daily alerts on critical topics in biomedicine.</p>'
            document.addAnnotation(a)

def on_ready_event(self, document):
    doi = common.utils.metadata(document, 'doi', '')

    # Find and aggregate AGI instances in the document
    matches_by_agi = {}
    for match in document.search(self.agiRegex, spineapi.RegExp + spineapi.WholeWordsOnly):
        agi = match.text()
        matches_by_agi.setdefault(agi, [])
        matches_by_agi[agi].append(match)

    # For each AGI add a new bit of HTML
    if len(matches_by_agi) > 0:
        for agi, matches in matches_by_agi.iteritems():
            html = '''
                <p style="overflow: auto; width: 100%">
                  <strong>{0}</strong>
                  <span style="float: right">{1}</span>
                </p>
            '''
            annotation = spineapi.Annotation()
            annotation['concept'] = 'AGI'
            annotation['property:agi'] = agi
            annotation['property:name'] = 'Plant gene databases'
            annotation['property:description'] = 'American Society of Plant Biologists'
            annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/aspb_logo.png', 'image/png')
            if doi.startswith('10.1104/'):
                annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/pp_logo.png', 'image/png')
                annotation['property:description'] = 'From Plant Physiology'
            elif doi.startswith('10.1105/'):
                annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/tpc_logo.png', 'image/png')
                annotation['property:description'] = 'From Plant Cell'
            annotation['property:sourceDescription'] = '''
                <p>
                  The <a href="http://www.aspb.org/">American Society of Plant Biologists</a>
                  have deemed these linked databases important sources of information.
                </p>
            '''
            annotation.addExtents(matches)
            document.addAnnotation(annotation)

def on_ready_event(self, document):
    doi = common.utils.metadata(document, 'doi')
    if doi is not None:
        try:
            # Check to see if the DOI is known
            url = 'http://api.altmetric.com/{0}/doi/{2}?key={1}'.format(self.api_version, self.key, doi)
            data = urllib2.urlopen(url, timeout=8).read()
            json.loads(data)  # Just check this is possible - throws exception otherwise
            a = spineapi.Annotation()
            a['concept'] = 'Altmetric'
            a['property:doi'] = doi
            a['property:json'] = data
            a['property:name'] = 'Altmetric'
            a['property:description'] = 'Who is talking about this article?'
            a['property:sourceDatabase'] = 'altmetric'
            a['property:sourceDescription'] = '<p>Discover, track and analyse online activity related to this article with <a href="http://www.altmetric.com/">Altmetric</a>.</p>'
            a['session:weight'] = '1'
            a['session:default'] = '1'
            document.addAnnotation(a)
        except (urllib2.URLError, socket.timeout):
            pass

def add_error(component, method, category=None, message=None, exception=None):
    if exception is not None:
        if isinstance(exception, urllib2.URLError) and isinstance(exception.reason, socket.timeout):
            exception = exception.reason
        if isinstance(exception, socket.timeout):
            category = "timeout"
            message = "The server did not respond"
        elif isinstance(exception, urllib2.HTTPError):
            category = "server"
            message = unicode(getattr(exception, "reason", "The server did not respond as expected"))
        elif isinstance(exception, urllib2.URLError):
            category = "connection"
            message = unicode(getattr(exception, "reason", "The server could not be found"))
    error = spineapi.Annotation()
    error["concept"] = "Error"
    error["property:component"] = component
    error["property:method"] = method
    error["property:category"] = category
    if message is not None:
        error["property:message"] = message
    document.addAnnotation(error, "errors.metadata")

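# Illustrative only: how the add_success()/add_error() helpers above might be used
# around a metadata fetch. fetch_metadata() and the component name are hypothetical,
# and `document` is assumed to be in the enclosing scope, as it is for the helpers
# themselves.
try:
    data = fetch_metadata(document)  # hypothetical network call
except Exception as e:
    add_error('example_component', 'fetch_metadata', exception=e)
else:
    add_success('example_component', 'fetch_metadata')
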
def on_ready_event(self, document):
    doi = common.utils.metadata(document, 'doi')
    if doi is None:
        return None
    collection = self._get_collection_by_doi(doi)
    if not collection:
        return None
    image_ids = self._get_collection_images(collection)
    htmls = [
        """<p><a href="{}" title="View collection on NeuroVault">View collection on NeuroVault</a></p>""".format(
            self.COLLECTIONS_URL % collection['id'])
    ]
    for image_id in image_ids:
        info = self._get_image_metainfo(image_id)
        if not info:
            continue
        info['url'] = self.IMAGES_URL % image_id
        info['image_id'] = image_id
        html = u'''
            <div id="{image_id}" class="box">
              <p>
                <span class="name">{name}</span> /
                <span class="map_type">{map_type}</span> /
                <span class="title"><a href="{url}" title="View in NeuroVault">{description}</a></span>
              </p>
            </div>'''.format(**info)
        htmls.append(html)
    if len(htmls) > 0:
        annotation = spineapi.Annotation()
        annotation['concept'] = 'NeuroVaultReference'
        annotation['property:html'] = ''.join(htmls)
        annotation['property:name'] = 'NeuroVault'
        annotation['property:description'] = 'Publicly available supplementary data'
        annotation['property:sourceDatabase'] = 'neurovault'
        annotation['property:sourceDescription'] = '<p><a href="http://neurovault.org/">Neuro<strong>Vault</strong></a> allows neuroimaging researchers to publish their full resultant statistical maps to supplement their publications.</p>'
        document.addAnnotation(annotation)

def on_persist_event(self, document):
    client = kend.client.Client()
    document_id, doi = self._resolve(document)
    if document_id is not None:
        for annotation in document.annotations('PersistQueue'):
            if 'session:volatile' not in annotation:
                try:
                    ka = kend.converter.Annotation.spineapi2kend(annotation, document_id)
                    ka.context = self._context_
                    updated = client.persistAnnotation(ka, context=self._context_)
                    if isinstance(updated, kend.model.Annotation):
                        for key in ('id', 'created', 'author', 'revision', 'edit', 'media_edit'):
                            annotation[key] = getattr(updated, key)
                        annotation.removePropertyAll('session:media')
                        for media in updated.media:
                            mediaDict = {}
                            for k in ['name', 'src', 'type', 'revision', 'size', 'edit']:
                                if hasattr(media, k):
                                    mediaDict[k] = getattr(media, k)
                            annotation.insertProperty('session:media', urllib.urlencode(mediaDict))
                        document.removeAnnotation(annotation, 'PersistQueue')
                        document.addAnnotation(annotation)
                except:
                    raise
                    pass
        for annotation in document.annotations(document.deletedItemsScratchId()):
            try:
                if 'session:volatile' not in annotation:
                    ka = kend.converter.Annotation.spineapi2kend(annotation, document_id)
                    client.deleteAnnotation(ka)
                    document.removeAnnotation(annotation, document.deletedItemsScratchId())
            except:
                raise
                pass

def on_activate_event(self, document, data={}):
    action = data.get("action")
    domain = data.get("domain")
    if self.annotatedDomains is None:
        self.annotatedDomains = []
    if action == "annotate":
        print "starting 3DM annotation . . ."
        ns = {"r": "GPCR"}
        pubmedId = common.utils.metadata(document, "pmid")
        if pubmedId is None:
            pubmedId = "0"
        print "sending text to remote server (" + pubmedId + "). . ."
        textMentions = self.getMentions(domain, document.text(), pubmedId)
        print "received response, adding annotations for domain " + domain + " . . ."
        objectlist = []
        mention_cache = {}
        for mention in textMentions:
            if mention.mentionType != "SPECIES" and mention.mentionType != "PDB":
                newData = self.rewriteData(mention)
                mention.data = newData
                html, css, js = self.buildHtml(domain, mention)
                mention.html = html.encode("utf-8")
                mention.css = css.encode("utf-8")
                mention.js = js.encode("utf-8")
                mention_cache.setdefault(mention.html, [])
                mention_cache[mention.html].append(mention)
        for html, mentions in mention_cache.iteritems():
            annotation = self.createAnnotation(domain, document, html, mentions)
            annotation["displayRelevance"] = "2000"
            annotation["displayRank"] = "2000"
            document.addAnnotation(annotation)
        document.addAnnotation(Annotation(), domain)
        print "done adding annotations."

def on_ready_event(self, document):
    # Scrape title and DOI from document
    title = utopia.tools.utils.metadata(document, 'title')
    doi = utopia.tools.utils.metadata(document, 'identifiers[doi]')
    if title is not None or doi is not None:
        # Make metadata link
        link = spineapi.Annotation()
        link['session:volatile'] = '1'
        link['concept'] = 'MetadataSource'
        link['rank'] = '1000'
        link['source'] = 'Content'
        link['listName'] = 'ContentMetadata'
        document.addAnnotation(link)

        # Store actual metadata
        annotation = spineapi.Annotation()
        annotation['session:volatile'] = '1'
        annotation['concept'] = 'DocumentMetadata'
        annotation['property:source'] = 'Content'
        if title is not None:
            annotation['property:title'] = title
        if doi is not None:
            annotation['property:doi'] = doi
        document.addAnnotation(annotation, link['listName'])

def on_ready_event(self, document):
    doi = common.utils.metadata(document, 'doi')
    if doi is None:
        return None
    collection = self._get_collection_by_doi(doi)
    if not collection:
        return None
    image_ids = self._get_collection_images(collection)
    htmls = [
        """<p><a href="{}" title="View collection on NeuroVault">View collection on NeuroVault</a></p>""".format(
            self.COLLECTIONS_URL % collection['id'])
    ]
    for image_id in image_ids:
        info = self._get_image_metainfo(image_id)
        if not info:
            continue
        info['url'] = self.IMAGES_URL % image_id
        info['image_id'] = image_id
        html = u'''
            <div id="{image_id}" class="box">
              <p>
                <span class="name">{name}</span> /
                <span class="map_type">{map_type}</span> /
                <span class="title"><a href="{url}" title="View in NeuroVault">{description}</a></span>
              </p>
            </div>'''.format(**info)
        htmls.append(html)
    if len(htmls) > 0:
        annotation = spineapi.Annotation()
        annotation['concept'] = 'NeuroVaultReference'
        annotation['property:html'] = ''.join(htmls)
        annotation['property:name'] = 'NeuroVault'
        annotation['property:description'] = 'Publicly available supplementary data'
        annotation['property:sourceDatabase'] = 'neurovault'
        annotation['property:sourceDescription'] = '<p><a href="http://neurovault.org/">Neuro<strong>Vault</strong></a> allows neuroimaging researchers to publish their full resultant statistical maps to supplement their publications.</p>'
        document.addAnnotation(annotation)

def on_ready_event(self, document):
    document_id = utopia.tools.utils.metadata(document, 'identifiers[utopia]')
    if document_id is not None:
        kwargs = {'document': document_id, 'context': self._context_}
        doi = utopia.tools.utils.metadata(document, 'identifiers[doi]')
        if doi is not None:
            kwargs['doi'] = doi
        annotations = kend.client.Client().annotations(**kwargs)
        link = None
        if annotations is not None:
            for group in annotations:
                for ann in group.annotations:
                    a = kend.converter.Annotation.kend2spineapi(ann, document)
                    if a.get('author') == 'http://utopia.cs.manchester.ac.uk/users/11679':
                        if a.get('concept') in ("DocumentMetadata", "AuthorAffiliation", "DocumentReference"):
                            if link is None:
                                link = document.newAccList('metadata', 100)
                                link['property:sourceDatabase'] = 'biochemj'
                                link['property:sourceTitle'] = 'The Semantic Biochemical Journal'
                                link['property:sourceDescription'] = '<p>Made available by <a href="http://www.portlandpress.com/">Portland Press Limited</a> as part of the <a href="http://www.biochemj.org/bj/semantic_faq.htm">Semantic Biochemical Journal</a>.'
                            # Modify Bibliography Entries
                            if a.get('concept') == 'Citation':
                                for keyTo, keyFrom in {
                                        'property:title': 'property:articleTitle',
                                        'property:authors': 'property:articleAuthors',
                                        'property:year': 'property:articleYear',
                                        'property:volume': 'property:articleVolume',
                                        'property:source': 'property:journalTitle',
                                }.iteritems():
                                    if keyFrom in a:
                                        a[keyTo] = a[keyFrom]
                            a['property:sourceDatabase'] = 'biochemj'
                            a['property:sourceDescription'] = 'Thingy'
                            document.addAnnotation(a, link['scratch'])
                        else:
                            document.addAnnotation(a)
                    else:
                        document.addAnnotation(a)

def on_ready_event(self, document):
    document_id, doi = self._resolve(document)
    if document_id is not None:
        kwargs = {'document': document_id, 'context': self._context_}
        if doi is not None:
            kwargs['doi'] = doi
        annotations = kend.client.Client().annotations(**kwargs)
        link = None
        if annotations is not None:
            for group in annotations:
                for ann in group.annotations:
                    a = kend.converter.Annotation.kend2spineapi(ann, document)
                    if a.get('author') == 'http://utopia.cs.manchester.ac.uk/users/11679':
                        if a.get('concept') in ("DocumentMetadata", "AuthorAffiliation", "DocumentReference"):
                            if link is None:
                                link = document.newAccList('metadata', 100)
                                link['property:sourceDatabase'] = 'biochemj'
                                link['property:sourceTitle'] = 'The Semantic Biochemical Journal'
                                link['property:sourceDescription'] = '<p>Made available by <a href="http://www.portlandpress.com/">Portland Press Limited</a> as part of the <a href="http://www.biochemj.org/bj/semantic_faq.htm">Semantic Biochemical Journal</a>.'
                            # Modify Bibliography Entries
                            if a.get('concept') == 'DocumentReference':
                                for keyTo, keyFrom in {
                                        'property:title': 'property:articleTitle',
                                        'property:authors': 'property:articleAuthors',
                                        'property:year': 'property:articleYear',
                                        'property:volume': 'property:articleVolume',
                                        'property:source': 'property:journalTitle',
                                }.iteritems():
                                    if keyFrom in a:
                                        a[keyTo] = a[keyFrom]
                            a['property:sourceDatabase'] = 'biochemj'
                            a['property:sourceDescription'] = 'Thingy'
                            document.addAnnotation(a, link['scratch'])
                        else:
                            document.addAnnotation(a)
                    else:
                        document.addAnnotation(a)

def on_ready_event(self, document):
    doi = common.utils.metadata(document, 'doi')
    if doi is not None:
        page = 1
        items_retrieved = 0
        api_search_url = 'http://api.figshare.com/v1/articles/search?'
        query = {'search_for': doi, 'has_link': doi}
        htmls = []
        while True:
            query['page'] = page
            #handler = oauth_auth.HTTPOauthAuthHandler()
            #consumer = oauth.KeySecret('yeiB61W0PYaUGPhhi8pBhA', 'LTOVWR94y8YZwscJhrFg0w')
            #token = oauth.KeySecret('jsWfvZBLPgNMRMjFaQOMbgAst4Rh5LzWmTMDD4HkHOpAjsWfvZXLPgNMRMjFaQOMbg', 'm8iqNc7AQH9Yrqa6e0H5AA')
            #handler.add_password(None, 'api.figshare.com', consumer, token)
            #opener = urllib2.build_opener(handler)
            #print api_search_url + urllib.urlencode(query)
            #response = opener.open(api_search_url + urllib.urlencode(query)).read()
            response = urllib2.urlopen(api_search_url + urllib.urlencode(query), timeout=8).read()
            data = json.loads(response)
            items = data.get('items', [])
            items_found = int(data.get('items_found', 0))

            # Bail if no items found
            if len(items) == 0 or items_found <= 0:
                break
            # Bail after ten pages of stuff
            if page > 10:
                break

            items_retrieved += len(items)
            for item in items:
                title = flatten(item.get('title'))
                description = flatten(item.get('description'))
                links = item.get('links', [])
                url = item.get('url')
                item_doi = item.get('DOI')
                article_id = item.get('article_id')
                authors = item.get('authors', [])
                published_date = item.get('published_date')
                type = item.get('type')
                html = u'''
                    <div id="{article_id}" class="box">
                      <p>
                        <span class="title">{title}</span>
                        <span class="authors">{authors}</span>
                        <a href="{url}" title="Explore FigShare">[Link]</a>
                      </p>
                      <p class="readmore">
                        {description}
                      </p>
                    </div>
                '''.format(**{
                    'article_id': article_id,
                    'title': title,
                    'url': item_doi,
                    'description': description,
                    'authors': u', '.join((author['author_name'] for author in authors)),
                })
                htmls.append(html)

            # Stop if we've retrieved the number of items expected
            if items_retrieved >= items_found:
                break
            page += 1

        if len(htmls) > 0:
            annotation = spineapi.Annotation()
            annotation['concept'] = 'FigShareReference'
            annotation['property:html'] = ''.join(htmls)
            annotation['property:name'] = 'FigShare'
            annotation['property:description'] = 'Publicly available supplementary material'
            annotation['property:sourceDatabase'] = 'figshare'
            annotation['property:sourceDescription'] = '<p><a href="http://figshare.com/">fig<strong>share</strong></a> allows researchers to publish all of their research outputs in seconds in an easily citable, sharable and discoverable manner.</p>'
            document.addAnnotation(annotation)

def on_ready_event(self, document):
    doi = utopialib.utils.metadata(document, 'identifiers[doi]')
    if doi is not None:
        info = {}

        # Resolve the DOI to find the publisher's website
        response = urllib2.urlopen('http://dx.doi.org/{0}'.format(doi), timeout=8)

        # Parse page to find (if there) the full text URL
        parser = etree.HTMLParser()
        html = etree.parse(response, parser)

        # Only continue if this is a highwire HTML page
        if len(html.xpath("/html/head/meta[@name='HW.identifier']")) == 0:
            return

        # Now make sure we have the full text XHTML
        citation_fulltext_html_url = html.xpath("/html/head/meta[@name='citation_fulltext_html_url']/@content")
        if len(citation_fulltext_html_url) > 0:
            citation_fulltext_html_url = citation_fulltext_html_url[0]

            # Fetch that full text page (if different to the current one)
            if citation_fulltext_html_url != response.geturl():
                response = urllib2.urlopen(citation_fulltext_html_url, timeout=8)
                html = etree.parse(response, parser)
                #print etree.tostring(html, pretty_print=True, encoding='utf8')

            # Now parse out the bibliography
            info['citations'] = []
            info['citations_by_id'] = {}
            for bibitem in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' ref-list ')]//ol[contains(concat(' ', normalize-space(@class), ' '), ' cit-list ')]/li"):
                citation = query(bibitem, {
                    'id': 'a/@id',
                    'label': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' ref-label ')]/text()",
                    'title': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-article-title ')]/text()",
                    'year': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-date ')]/text()",
                    'publication-title': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-jnl-abbrev ')]/text()",
                    'volume': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-vol ')]/text()",
                    'issue': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-issue ')]/text()",
                    'pagefrom': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-fpage ')]/text()",
                    'pageto': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-lpage ')]/text()",
                    'pmid': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-pmid ')]/text()",
                    'doi': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-doi ')]/text()",
                    'etree': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-metadata ')]",
                })
                authors = []
                for a in bibitem.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-auth ')]"):
                    surname = a.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-surname ')]/text()")
                    given_names = a.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-given-names ')]/text()")
                    if len(surname) > 0 and len(given_names) > 0:
                        authors.append(u'{0}, {1}'.format(surname[0], given_names[0]).strip(', '))
                if len(authors) > 0:
                    citation['authors'] = authors
                citation['contexts'] = []
                citation['displayText'] = utopia.citation.format(citation)
                info['citations'].append(citation)
                info['citations_by_id'][citation['id']] = citation
                #print citation

            #######################################################################################
            # Parse in-text citations if present

            min_length = 10
            max_length = 20
            for paragraph in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' article ')]//p"):
                text_stack = [paragraph.text or '']
                xref_stack = [None]
                for elem in paragraph:
                    if len(elem.xpath('self::a[@class="xref-bibr"]')) > 0:
                        text_stack.append(etree.tostring(elem, method='text', encoding=unicode, with_tail=False))
                        text_stack.append(elem.tail or '')
                        xref = info['citations_by_id'].get(elem.get('href', '')[1:])
                        if xref is not None:
                            xref_stack += [[xref], None]
                        else:
                            xref_stack += [[], None]
                    elif isinstance(elem, etree._Entity):
                        points = entities.get(elem.text[1:-1])
                        if points is not None:
                            text_stack[-1] += ''.join((unichr(p) for p in points))
                        else:
                            text_stack[-1] += etree.tostring(elem, encoding=unicode)
                    else:
                        if elem.get('position') == 'float':
                            text_stack[-1] += elem.tail or ''
                        else:
                            text_stack[-1] += etree.tostring(elem, method='text', encoding=unicode)

                # Find and collapse ranges in the text
                for i in xrange(len(xref_stack) - 3, 1, -2):
                    text = text_stack[i].strip()
                    #print len(text), (text in u'-\u2010\u2011\u2012\u2013\u2014\u2015'), u''.join(text_stack[i-1:i+2]).encode('utf8')
                    # if this text is a dash, we need to coalesce the text fragments
                    if len(text) == 1 and text in u'-\u2010\u2011\u2012\u2013\u2014\u2015':
                        text_stack[i-1:i+2] = [u''.join(text_stack[i-1:i+2])]
                        xref_stack[i-1:i+2] = [xref_stack[i-1] + xref_stack[i+1]]

                #for text in text_stack:
                #    print text.encode('utf8')

                # Then make sure we resolve the implied citations
                for i in xrange(1, len(xref_stack), 2):
                    # Get actual cross references
                    xrefs = xref_stack[i]

                    # Expand cross references
                    try:
                        if len(xrefs) == 2:
                            labelfrom = int(xrefs[0].get('label'))
                            labelto = int(xrefs[1].get('label'))
                            candidates = {}
                            midlabels = [unicode(midlabel) for midlabel in xrange(labelfrom + 1, labelto)]
                            for candidate in info['citations']:
                                if candidate.get('label') in midlabels:
                                    candidates[int(candidate.get('label'))] = candidate
                            xrefs[1:-1] = candidates.values()
                    except:
                        raise

                # Find and collapse lists in the text
                for i in xrange(len(xref_stack) - 3, 1, -2):
                    text = text_stack[i].strip()
                    # if this text is a comma, we need to coalesce the text fragments
                    if len(text) == 1 and text == ',':
                        text_stack[i-1:i+2] = [u''.join(text_stack[i-1:i+2])]
                        xref_stack[i-1:i+2] = [xref_stack[i-1] + xref_stack[i+1]]

                # Expand citations to include brackets (on both sides)
                for i in xrange(len(xref_stack) - 2, 0, -2):
                    before = text_stack[i-1].strip()[-1:]
                    text = text_stack[i].strip()
                    after = text_stack[i+1].strip()[:1]
                    # if this text is a comma, we need to coalesce the text fragments
                    #print before.encode('utf'), after.encode('utf')
                    if len(before) > 0 and before in '({[' and len(after) > 0 and after in ')}]':
                        text_stack[i-1] = re.sub(r'[({[](\s*)$', r'\1', text_stack[i-1])
                        text_stack[i+1] = re.sub(r'^(\s*)[)}\]]', r'\1', text_stack[i+1])
                        text_stack[i] = before + text_stack[i] + after

                #print repr(text_stack)

                for i in xrange(1, len(xref_stack), 2):
                    # Get context
                    before = u' '.join(text_stack[:i]).strip()
                    label = text_stack[i].strip()
                    after = u' '.join(text_stack[i+1:]).strip()

                    # Strip out extraneous brackets
                    if len(xref_stack[i]) > 1:
                        # Hack to differentiate single / multiple citations
                        # as multiple numbers tend not to have spaces between them
                        label = re.sub(ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,])\s?[({[]?', r'\1', label)
                    else:
                        label = re.sub(ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,]\s?)[({[]?', r'\1', label)

                    # Normalise context
                    before = re.sub(r'\s+', ' ', before)[-max_length:].strip()
                    label = re.sub(r'\s+', ' ', label)
                    after = re.sub(r'\s+', ' ', after)[:max_length].strip()
                    #print (before.encode('utf8'), label.encode('utf8'), after.encode('utf8'))

                    if len(before + after) > min_length:
                        for xref in xref_stack[i]:
                            xref['contexts'].append((before, label, after))
                    #print xref_stack[i]

            #######################################################################################
            # Parse tables if present

            info['tables'] = {}
            for table_url in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' table-inline ')]/div[contains(concat(' ', normalize-space(@class), ' '), ' callout ')]//li[1]/a/@href"):
                table_url = urlparse.urljoin(citation_fulltext_html_url, table_url)
                #print table_url
                response = urllib2.urlopen(table_url, timeout=8)
                table_html = etree.parse(response, parser)
                for table_expansion in table_html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' table-expansion ')]"):
                    id = table_expansion.get('id')
                    table = {}
                    table['xml'] = table_expansion.xpath('.//table[1]')[0]
                    table['caption_raw'] = table_expansion.xpath(".//span[contains(concat(' ', normalize-space(@class), ' '), ' caption-title ')][1]")[0]
                    if 'caption' not in table and 'caption_raw' in table:
                        table['caption'] = table['caption_raw']
                    if 'caption' in table:
                        table['caption'] = re.sub(r'\s+', ' ', etree.tostring(table['caption'], method='text', encoding=unicode).strip())
                    if 'xml' in table:
                        table['xml'] = etree.tostring(table['xml'], encoding='utf8')
                    info['tables'][id] = table
                    #print table

        #print info

        if info is not None and len(info) > 0:
            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8')
            pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation))
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(utopialib.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                    #print etree.tostring(idList)
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info['citations_by_id'][pmids[pmid]]
                        for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                            id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id
                                #print 'KEY', key_name, id

            # Generate sensible titles / descriptions / icons?
journalTitle = info.get('publication-title', '') journalTitleSuffix = '' publisher = info.get('publisher', 'the publisher') if len(journalTitle) > 0: journalTitleSuffix = ' ({0})'.format(journalTitle) # Create Metadata link annotation link = document.newAccList('metadata', 90) link['property:sourceIcon'] = '' link['property:sourceTitle'] = publisher link['property:sourceDescription'] = ''' <p>This information was provided by {0}{1}.</p> '''.format(publisher, journalTitleSuffix) # Create Metadata annotation annotation = utopialib.utils.citation_to_annotation( info.get('self', {}), 'DocumentMetadata') document.addAnnotation(annotation, link['scratch']) # Create Bibliography annotations for citation in info.get('citations', []): annotation = utopialib.utils.citation_to_annotation( citation) document.addAnnotation(annotation, link['scratch']) ####################################################################################### # Apply parsed data to document # Citations for citation in info['citations']: # Find cross refs for pre, label, post in citation.get('contexts', []): #print (pre, label, post) matches = document.findInContext(pre, label, post) #print matches if len(matches) > 0: try: annotation = spineapi.Annotation() annotation = utopialib.utils.citation_to_annotation( citation, concept='ForwardCitation') if 'doi' in citation and citation[ 'doi'].startswith('10.1371/'): citation[ 'pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format( 'info:doi/{0}'.format( citation['doi'])) if 'pmcid' in citation: citation[ 'pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format( citation['pmcid']) for match in matches: annotation.addExtent(match) document.addAnnotation(annotation, link['scratch']) except: raise for id, table in info.get('tables', {}).iteritems(): if 'caption' in table and 'xml' in table: regex = fuzz(table['caption'], strict=True) #print regex # convert oasis tables ns = { 'oasis': 'http://docs.oasis-open.org/ns/oasis-exchange/table' } xml = etree.fromstring(table['xml']) if xml.tag == '{{{0}}}table'.format(ns['oasis']): for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns): columns = {} for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns): columns[colspec.get('colname')] = int( colspec.get('colnum')) for section in tgroup.xpath( './oasis:thead|./oasis:tbody', namespaces=ns): isHead = ( section.tag == '{{{0}}}thead'.format( ns['oasis'])) for row in section.xpath('./oasis:row', namespaces=ns): for entry in row.xpath('./oasis:entry', namespaces=ns): colname = entry.get('colname') colst = entry.get('namest') colend = entry.get('nameend') if colst is not None and colend is not None: colspan = columns[ colend] - columns[colst] + 1 else: colspan = 1 if colspan > 1: entry.set( 'colspan', unicode(colspan)) morerows = entry.get('morerows') if morerows is not None: rowspan = int(morerows) + 1 else: rowspan = 1 if rowspan > 1: entry.set( 'rowspan', unicode(rowspan)) entry.tag = 'td' row.tag = 'tr' if isHead: section.tag = 'thead' else: section.tag = 'tbody' xml.append(section) xml.tag = 'table' #print etree.tostring(xml, pretty_print=True, encoding='utf8') table['xml'] = etree.tostring(xml, encoding='utf8') matches = document.search( regex, spineapi.RegExp + spineapi.IgnoreCase) if len(matches) == 1: annotation = spineapi.Annotation() annotation['concept'] = 'Table' annotation[ 'session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode( table['xml']) 
annotation['session:volatile'] = '1' annotation.addExtent(matches[0]) document.addAnnotation(annotation, link['scratch']) else: print '*********** failed to match table:', id
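# NOTE: fuzz() is called above but defined elsewhere; the following is only a
# plausible sketch (an assumption, not the real implementation) of a helper
# that turns a table caption into a whitespace-tolerant regular expression
# suitable for document.search(..., spineapi.RegExp + spineapi.IgnoreCase).
import re

def fuzz(text, strict=False):
    words = [re.escape(word) for word in text.split()]
    if strict:
        # require every word, in order, separated by arbitrary whitespace
        return r'\s+'.join(words)
    # relaxed: also tolerate intervening punctuation between words
    return r'\W+'.join(words)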
def after_load_event(self, document): # Put errors together in a sensible way errors = {} failures = 0 successes = 0 for error in document.annotations('errors.metadata'): if error.get('concept') == 'Success': successes += 1 elif error.get('concept') == 'Error': failures += 1 component = error.get('property:component') errors.setdefault(component, {}) category = error.get('property:category') errors[component].setdefault(category, []) method = error.get('property:method') message = error.get('property:message', '') errors[component][category].append((method, message)) categories = {} for component, details in errors.iteritems(): for category in details.keys(): categories.setdefault(category, 0) categories[category] += 1 # If there are errors, provide feedback to the user if failures > 0: # Check for likely client problems if categories.get('connection', 0) == failures and successes == 0: summary = ''' Utopia could not reach any of the online services it would normally use to identify this document, meaning you are likely to see limited or no information below. You might wish to check your Internet connection and reload the document. ''' elif categories.get('timeout', 0) > 1: if categories.get('timeout', 0) == failures and successes == 0: many = 'all of' else: many = 'some of' summary = ''' Utopia gave up contacting {0} the online services it would normally use to identify this document because they were taking too long to respond. You are likely to see limited or no information below. You might wish to check your Internet connection and reload the document. '''.format(many) else: if failures == 1: noun = 'An error' else: noun = 'Errors' summary = ''' {0} occurred when trying to discover the identity of this document. You are likely to see limited or no information below. '''.format(noun) html = ''' <div class="box error"> <strong>Warning</strong> <p> {0} </p> <div class="expandable" title="Details..."> <ul> '''.format(summary) for component, details in errors.iteritems(): for category, methods in details.iteritems(): if category != 'success': summary = { 'timeout': '{0} did not respond', 'connection': 'Could not connect to {0}', 'server': '{0} behaved unexpectedly', }.get(category, 'An error occurred accessing {0}') methods_html = ', '.join( ('<span title="{1}">{0}</span>'.format( method, message) for method, message in methods)) html += '<li>{0} (when accessing: {1}).</li>'.format( summary.format('<strong>' + component + '</strong>'), methods_html) html += ''' </ul> </div> </div> ''' annotation = spineapi.Annotation() annotation['concept'] = 'Collated' annotation['property:html'] = html annotation['property:name'] = 'Error' annotation['session:weight'] = '1000' annotation['session:default'] = '1' annotation['session:headless'] = '1' document.addAnnotation(annotation) print errors
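# NOTE: the handler above consumes 'errors.metadata' annotations produced by
# the metadata resolvers. As a hypothetical illustration only (not the actual
# resolver code), an upstream component could record a failure like this:
def record_metadata_error(document, component, category, method, message=''):
    error = spineapi.Annotation()
    error['concept'] = 'Error'                 # or 'Success' when the lookup worked
    error['property:component'] = component    # e.g. the service name
    error['property:category'] = category      # 'connection', 'timeout' or 'server'
    error['property:method'] = method          # which call failed
    error['property:message'] = message        # human-readable detail
    document.addAnnotation(error, 'errors.metadata')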
def _populate(self, document): # Start by seeing what is already known about this document nlm = common.utils.metadata(document, 'raw_pmc_nlm') doi = common.utils.metadata(document, 'doi') if nlm is not None: info = self.JournalXMLParser(nlm) try: nlmdoi = info.articleDOI().lower() except: # FIXME which exception(s)? nlmdoi = None print "PMC returned no DOI for this article" if nlmdoi is None or doi is None or doi.lower() != nlmdoi: print "PMC returned wrong article:", nlmdoi else: print "PMC returned information about article:", info.articleTitle() link = document.newAccList('metadata') link['property:sourceDatabase'] = 'pubmed' link['property:sourceTitle'] = 'PubMed' link['property:sourceDescription'] = '<p><a href="http://www.ncbi.nlm.nih.gov/pubmed/">PubMed</a> comprises more than 21 million citations for biomedical literature from MEDLINE, life science journals, and online books.</p>' annotation = spineapi.Annotation() annotation['concept'] = 'DocumentMetadata' # print nlm.articlePublicationDate('epub') # print nlm.articlePublicationDate('epreprint') # print nlm.journalISSN('epub') annotation["property:identifier"] = 'info:doi/%s' % info.articleDOI() annotation["property:source"] = 'Publisher/NLM' annotation["property:curatedBy"] = "PMC" annotation["property:journalTitle"] = info.journalTitle() annotation["property:journalPublisher"] = info.journalPublisher() annotation["property:journalISSN"] = info.journalISSN() annotation["property:articleAuthors"] = info.articleAuthors() annotation["property:articleTitle"] = info.articleTitle() annotation["property:articleDOI"] = info.articleDOI() annotation["property:articlePMID"] = info.articlePMID() annotation["property:articlePublisherID"] = info.articlePublisherID() annotation["property:articlePublicationDate"] = info.articlePublicationDate() annotation["property:articleVolume"] = info.articleVolume() annotation["property:articleIssue"] = info.articleIssue() if info.articlePages() is not None: annotation["property:articlePages"] = "%s-%s" % info.articlePages() annotation["property:articleAbstract"] = info.articleAbstract() annotation["property:articleKeywords"] = info.articleKeywords() annotation["property:articleAbbreviations"] = info.articleAbbreviations() document.addAnnotation(annotation, link['scratch']) # FIXME: Annotation properties need to be lists for surname, forename, aff in info.articleAuthorAffiliationList(): annotation = spineapi.Annotation() annotation['concept'] = "AuthorAffiliation" annotation["property:curatedBy"] = "PMC" annotation["property:authorSurname"] = surname annotation["property:authorForename"] = forename annotation["property:articleAuthor"] = "%s, %s" % (surname, forename) annotation["property:affiliation"] = aff document.addAnnotation(annotation, link['scratch']) for ref in info.articleReferenceList(): annotation = spineapi.Annotation() annotation['concept'] = "DocumentReference" if 'doi' in ref: annotation["property:doi"] = ref['doi'] if 'pmid' in ref: annotation["property:pmid"] = ref['pmid'] if 'title' in ref: annotation["property:title"] = ref['title'] if 'label' in ref: annotation["property:label"] = ref['label'] if 'authors' in ref: annotation["property:authors"] = ref['authors'] if 'editors' in ref: annotation["property:articleEditors"] = ref['editors'] if 'publication-title' in ref: annotation["property:publication-title"] = ref['publication-title'] if 'type' in ref: annotation["property:publicationType"] = ref['type'] if 'volume' in ref: annotation["property:volume"] = ref['volume'] if 'issue' in ref: annotation["property:issue"] = ref['issue'] if 'publisher' in ref: annotation["property:publisher"] = ref['publisher'] if 'fpage' in ref and 'lpage' in ref: annotation["property:pages"] = "%s-%s" % (ref['fpage'], ref['lpage']) if 'year' in ref: annotation["property:year"] = ref['year'] document.addAnnotation(annotation, link['scratch'])
def on_ready_event(self, document): # Find distinguishing ID pmid = common.utils.metadata(document, 'pmid') # Compile distinct GEO IDs in the text matches = {} for match in document.search(r'GSE\d+', spineapi.IgnoreCase + spineapi.WholeWordsOnly + spineapi.RegExp): matches.setdefault(match.text(), []) matches[match.text()].append(match) # Create annotations for each GEO ID for gse, extents in matches.iteritems(): entry = self._fetchGEO(gse) dates = u'Submitted {0}'.format(entry['submission_date']) if 'last_update_date' in entry: dates += u'; last updated {0}'.format(entry['last_update_date']) dates += '.' dataCitation=u'''<p>{0}. <strong>{1}</strong>.</p><p>{2}<br>({3})</p><p>{4}</p>'''.format( entry['contributors'].decode('utf8'), entry['title'].decode('utf8'), entry['overall_design'].decode('utf8'), entry['type'].decode('utf8'), dates) xhtml = u'<div class="box">{0}{{0}}<p>GEO Accession: <a href="{1}">{2}</a></p></div>'.format( dataCitation, entry['GEO_url'].decode('utf8'), gse) xhtml += u'<p><a href="{0}">Explore in InSilico DB...</a></p>'.format(entry['InSilicoDB_url']) srcdesc='''<p>The <a href="http://www.ncbi.nlm.nih.gov/geo">Gene Expression Omnibus (GEO)</a> is a public repository that archives and freely distributes microarray, next-generation sequencing, and other forms of high-throughput functional genomic data submitted by the scientific community.</p>''' if entry.get('pubmed_id') == pmid: # add a global annotation annotation = spineapi.Annotation() annotation['concept'] = 'GEO' annotation['property:name'] = 'Gene Expression Omnibus' annotation['property:sourceDatabase'] = 'geo' annotation['property:description'] = '{0} (Data associated with this article)'.format(gse) annotation['property:sourceDescription'] = srcdesc annotation['property:xhtml'] = xhtml.format('') # Keep summary blank document.addAnnotation(annotation) # Generate summary summary = entry.get('summary', '') if len(summary) > 0: summary_words = summary.split(' ') summary = u'<p><em>Summary:</em> ' summary += u'{0}'.format(' '.join(summary_words[:32])) if len(summary_words) > 32: summary += u' <span class="readmore">{0}</span>'.format(' '.join(summary_words[32:])) summary += u'</p>' # local annotation annotation = spineapi.Annotation() annotation['concept'] = 'GEO' annotation['property:name'] = 'Gene Expression Omnibus' annotation['property:sourceDatabase'] = 'geo' annotation['property:description'] = gse annotation['property:sourceDescription'] = srcdesc annotation['property:xhtml'] = xhtml.format(summary) for extent in extents: annotation.addExtent(extent) document.addAnnotation(annotation)
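# NOTE: _fetchGEO() is referenced above but not shown in this section. The
# sketch below is an assumption about how such a helper might work (fetching
# the SOFT 'brief' text record for an accession and splitting '!Series_*'
# fields); the endpoint layout and field names are assumptions, not a
# confirmed API, and the name is deliberately different from the real method.
def _fetchGEO_sketch(gse):
    import urllib2
    base = 'http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi'
    entry = {'GEO_url': '{0}?acc={1}'.format(base, gse)}
    contributors = []
    url = '{0}?acc={1}&targ=self&form=text&view=brief'.format(base, gse)
    for line in urllib2.urlopen(url, timeout=8).read().splitlines():
        if line.startswith('!Series_') and ' = ' in line:
            key, value = line[len('!Series_'):].split(' = ', 1)
            if key == 'contributor':
                contributors.append(value.replace(',,', ' '))
            else:
                entry.setdefault(key, value)
    entry['contributors'] = ', '.join(contributors)
    return entry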
def after_ready_event(self, document): print 'Formatting metadata' # Find highest matching metadata accumulation list for references source = None for accListLink in document.getAccLists('metadata'): matches = document.annotationsIf({'concept': 'Citation'}, accListLink['scratch']) if len(matches) > 0: print 'Selected for [Citation] list %s with rank %s' % ( accListLink['scratch'], repr(accListLink.get('rank', 0))) source = accListLink bibliography = list(matches) bibliography.sort(key=sortfn) rt = '' for annotation in bibliography: citation = utopia.tools.utils.citation_from_annotation( annotation) rt += utopia.citation.render(citation, links=True) if len(bibliography) > 0: # Create Metadata link annotation link = document.newAccList('citation_list') link['property:list_name'] = 'Bibliography' document.addAnnotations(bibliography, link['scratch']) if len(rt) > 0: references = spineapi.Annotation() references['displayBibliography'] = rt references['concept'] = 'BibliographyMetadata' references['property:identifier'] = '#bibliography' references['property:name'] = 'Bibliography' references['displayName'] = 'Bibliography' references['displayRelevance'] = '800' if accListLink is not None: for i in ('sourceIcon', 'sourceTitle', 'sourceDescription', 'sourceDatabase'): k = 'property:{0}'.format(i) if k in accListLink: references[k] = accListLink[k] references[ 'property:description'] = 'From ' + accListLink[ 'property:sourceTitle'] document.addAnnotation(references) break if source is None: print 'No metadata found' # Find highest matching metadata accumulation list for in-text citations for accListLink in document.getAccLists('metadata'): matches = document.annotationsIf({'concept': 'ForwardCitation'}, accListLink['scratch']) if len(matches) > 0: print 'Selected for [ForwardCitation] list %s with rank %s' % ( accListLink['scratch'], repr(accListLink.get('rank', 0))) document.addAnnotations(matches) break # Find highest matching metadata accumulation list for in-text citations for accListLink in document.getAccLists('metadata'): matches = document.annotationsIf({'concept': 'Table'}, accListLink['scratch']) if len(matches) > 0: print 'Selected for [Table] list %s with rank %s' % ( accListLink['scratch'], repr(accListLink.get('rank', 0))) document.addAnnotations(matches) break metadata = None if source is not None: for annotation in document.annotations(source['scratch']): if annotation.get('concept') == 'DocumentMetadata': metadata = annotation if metadata: metadata['displayName'] = 'Document Information' metadata['displayRelevance'] = '1000' document.addAnnotation(metadata, 'Document Metadata')
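# NOTE: sortfn is used above to order the bibliography but is defined
# elsewhere; a reasonable sketch (an assumption, not the original helper) is
# to sort citation annotations numerically by their label, falling back to
# the raw label text when it is not a number:
def sortfn(annotation):
    label = annotation.get('property:label', '')
    try:
        return (0, int(label))
    except (TypeError, ValueError):
        return (1, label)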
def on_ready_event(self, document): # Find distinguishing ID pmid = utopia.tools.utils.metadata(document, 'identifiers[pubmed]') # Compile distinct GEO IDs in the text matches = {} for match in document.search( r'GSE\d+', spineapi.IgnoreCase + spineapi.WholeWordsOnly + spineapi.RegExp): matches.setdefault(match.text(), []) matches[match.text()].append(match) # Create annotations for each GEO ID for gse, extents in matches.iteritems(): entry = self._fetchGEO(gse) print entry dates = u'Submitted {0}'.format(entry['submission_date']) if 'last_update_date' in entry: dates += u'; last updated {0}'.format( entry['last_update_date']) dates += '.' dataCitation = u'''<p>{0}. <strong>{1}</strong>.</p><p>{2}<br>({3})</p><p>{4}</p>'''.format( entry['contributors'], entry['title'], entry.get('overall_design', ''), entry['type'], dates) xhtml = u'<div class="box">{0}{{0}}<p>GEO Accession: <a href="{1}">{2}</a></p></div>'.format( dataCitation, entry['GEO_url'], gse) # Removed broken InSilicoDB link #xhtml += u'<p><a href="{0}">Explore in InSilico DB...</a></p>'.format(entry['InSilicoDB_url']) srcdesc = '''<p>The <a href="http://www.ncbi.nlm.nih.gov/geo">Gene Expression Omnibus (GEO)</a> is a public repository that archives and freely distributes microarray, next-generation sequencing, and other forms of high-throughput functional genomic data submitted by the scientific community.</p>''' if entry.get('pubmed_id') == pmid: # add a global annotation annotation = spineapi.Annotation() annotation['concept'] = 'GEO' annotation['property:name'] = 'Gene Expression Omnibus' annotation['property:sourceDatabase'] = 'geo' annotation[ 'property:description'] = '{0} (Data associated with this article)'.format( gse) annotation['property:sourceDescription'] = srcdesc annotation['property:xhtml'] = xhtml.format( '') # Keep summary blank document.addAnnotation(annotation) # Generate summary summary = entry.get('summary', '') if len(summary) > 0: summary_words = summary.split(' ') summary = u'<p><em>Summary:</em> ' summary += u'{0}'.format(' '.join(summary_words[:32])) if len(summary_words) > 32: summary += u' <span class="readmore">{0}</span>'.format( ' '.join(summary_words[32:])) summary += u'</p>' # local annotation annotation = spineapi.Annotation() annotation['concept'] = 'GEO' annotation['property:name'] = 'Gene Expression Omnibus' annotation['property:sourceDatabase'] = 'geo' annotation['property:description'] = gse annotation['property:sourceDescription'] = srcdesc annotation['property:xhtml'] = xhtml.format(summary) for extent in extents: annotation.addExtent(extent) document.addAnnotation(annotation)
def on_ready_event(self, document): volume, page = None, None # Only send if the DOI has a Portland prefix doi = common.utils.metadata(document, 'doi') if doi is not None and doi[:7] in registrants: crossref_unixref = common.utils.metadata(document, 'raw_crossref_unixref') if crossref_unixref is not None: # Parse CrossRef redirect URL dom = etree.fromstring(crossref_unixref.encode('utf8')) resource = dom.findtext('doi_record/crossref/journal/journal_article/doi_data/resource') if resource is not None: match = self.resourceRegExp.match(resource) if match is not None: volume, page = match.groups() ### FIXME What information should be shown? Portland? BJ? #annotation = spineapi.Annotation() #annotation['concept'] = 'PublisherIdentity' #annotation['property:logo'] = utopia.get_plugin_data_as_url('images/logo.png', 'image/png') #annotation['property:title'] = 'Portland Press Limited' #annotation['property:webpageUrl'] = 'http://www.portlandpress.com/' #document.addAnnotation(annotation, 'PublisherMetadata') # If this document was resolved, off we go to fetch the NLM if None not in (volume, page): # Make a request to the utopia ext web service url = 'https://utopia.cs.manchester.ac.uk/ext/portland/nlm?{0}' url = url.format(urllib.urlencode({'volume': volume, 'page': page})) try: nlm = urllib2.urlopen(url, timeout=8).read() except: raise return info = common.nlm.parse(nlm) if info is not None and len(info) > 0: # Enrich citation information with identifiers from PMC parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8') pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation)) if len(pmids) > 0: pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser) for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'): #print etree.tostring(idList) pmid = idList.findtext('ArticleId[@IdType="pubmed"]') if pmid in pmids: citation = info['citations_by_id'][pmids[pmid]] for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')): id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name)) if key_name not in citation and id is not None: citation[key_name] = id #print 'KEY', key_name, id # Create Metadata link annotation link = document.newAccList('metadata', 100) link['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/biochemj.png', 'image/png') link['property:sourceTitle'] = 'Portland' link['property:sourceDescription'] = ''' <p><a href="http://www.portlandpress.com/">Portland Press Limited</a>.</p> ''' # Create Metadata annotation annotation = spineapi.Annotation() annotation['concept'] = 'DocumentMetadata' for k in self.keys: v = info.get(k) if v is not None: annotation['property:{0}'.format(k)] = v document.addAnnotation(annotation, link['scratch']) # Create Bibliography annotations for citation in info.get('citations', []): annotation = spineapi.Annotation() annotation['concept'] = 'DocumentReference' for k in self.keys: v = citation.get(k) if v is not None: annotation['property:{0}'.format(k)] = v document.addAnnotation(annotation, link['scratch']) ####################################################################################### # Apply parsed data to document # Citations for citation in info['citations']: # Find cross refs for pre, label, post in citation.get('contexts', []): matches = document.findInContext(pre, label, post) #print matches if len(matches) > 0: try: 
annotation = spineapi.Annotation() annotation['concept'] = 'ForwardCitation' annotation['property:state'] = 'found' if 'title' in citation: annotation['property:title'] = citation['title'] if 'id' in citation: annotation['property:bibid'] = citation['id'] if 'doi' in citation and citation['doi'].startswith('10.1371/'): citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi'])) if 'pmcid' in citation: citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid']) for k in ('displayText', 'label', 'pdf', 'pmid', 'pmc', 'pii', 'doi', 'first_author_surname', 'year', 'journal', 'volume', 'page_from'): if k in citation: annotation['property:{0}'.format(k)] = citation[k] #print annotation.get('property:label'), annotation.get('property:pdf') for match in matches: annotation.addExtent(match) document.addAnnotation(annotation, link['scratch']) #print citation except: raise pass # FIXME for id, table in info.get('tables', {}).iteritems(): if 'caption' in table and 'xml' in table: regex = fuzz(table['caption'], strict = True) #print regex matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase) if len(matches) == 1: annotation = spineapi.Annotation() annotation['concept'] = 'Table' annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml']) annotation['session:volatile'] = '1' annotation.addExtent(matches[0]) document.addAnnotation(annotation, link['scratch']) else: print '*********** failed to match table:', id
def on_ready_event(self, document): info = utopialib.nlm.parse( utopialib.utils.metadata(document, 'raw_pmc_nlm')) if info is not None and len(info) > 0: # Enrich citation information with identifiers from PMC parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8') pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation)) if len(pmids) > 0: pubmed_abstracts = etree.fromstring( utopialib.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser) for idList in pubmed_abstracts.xpath( 'PubmedArticle/PubmedData/ArticleIdList'): #print etree.tostring(idList) pmid = idList.findtext('ArticleId[@IdType="pubmed"]') if pmid in pmids: citation = info['citations_by_id'][pmids[pmid]] for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')): id = idList.findtext( 'ArticleId[@IdType="{0}"]'.format(id_name)) if key_name not in citation and id is not None: citation[key_name] = id # Create Metadata link annotation link = document.newAccList('metadata', 50) link['property:sourceDatabase'] = 'pmc' link['property:sourceTitle'] = 'PubMed Central' link[ 'property:sourceDescription'] = '<p><a href="http://www.ncbi.nlm.nih.gov/pmc/">PubMed Central</a> is the U.S. National Institutes of Health (NIH) digital archive of biomedical and life sciences journal literature.</p>' # Create Metadata annotation annotation = utopialib.utils.citation_to_annotation( info.get('self', {}), 'DocumentMetadata') document.addAnnotation(annotation, link['scratch']) # Create Bibliography annotations for citation in info.get('citations', []): annotation = utopialib.utils.citation_to_annotation(citation) document.addAnnotation(annotation, link['scratch']) # Citations for citation in info['citations']: # Find cross refs for pre, label, post in citation.get('contexts', []): matches = document.findInContext(pre, label, post) #print matches if len(matches) > 0: try: annotation = utopialib.utils.citation_to_annotation( citation, concept='ForwardCitation') if 'doi' in citation and citation[ 'doi'].startswith('10.1371/'): citation[ 'pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format( 'info:doi/{0}'.format(citation['doi'])) if 'pmcid' in citation: citation[ 'pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format( citation['pmcid']) for match in matches: annotation.addExtent(match) document.addAnnotation(annotation, link['scratch']) except: raise # Tables for id, table in info.get('tables', {}).iteritems(): if 'caption' in table and 'xml' in table: regex = fuzz(table['caption'], strict=True) matches = document.search( regex, spineapi.RegExp + spineapi.IgnoreCase) if len(matches) == 1: annotation = spineapi.Annotation() annotation['concept'] = 'Table' annotation[ 'session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode( table['xml']) annotation.addExtent(matches[0]) document.addAnnotation(annotation, link['scratch']) else: print '*********** failed to match table:', id
def on_ready_event(self, document): volume, page = None, None # Only send if the DOI has a Portland prefix doi = utopialib.utils.metadata(document, 'identifiers[doi]') if doi is not None and doi[:7] in registrants: crossref_unixref = utopialib.utils.metadata( document, 'raw_crossref_unixref') if crossref_unixref is not None: # Parse CrossRef redirect URL dom = etree.fromstring(crossref_unixref.encode('utf8')) resource = dom.findtext( 'doi_record/crossref/journal/journal_article/doi_data/resource' ) if resource is not None: match = self.resourceRegExp.match(resource) if match is not None: volume, page = match.groups() ### FIXME What information should be shown? Portland? BJ? #annotation = spineapi.Annotation() #annotation['concept'] = 'PublisherIdentity' #annotation['property:logo'] = utopia.get_plugin_data_as_url('images/logo.png', 'image/png') #annotation['property:title'] = 'Portland Press Limited' #annotation['property:webpageUrl'] = 'http://www.portlandpress.com/' #document.addAnnotation(annotation, 'PublisherMetadata') # If this document was resolved, off we go to fetch the NLM if None not in (volume, page): # Make a request to the utopia ext web service url = 'https://utopia.cs.manchester.ac.uk/ext/portland/nlm?{0}' url = url.format(urllib.urlencode({ 'volume': volume, 'page': page })) try: nlm = urllib2.urlopen(url, timeout=8).read() except: raise return info = utopialib.nlm.parse(nlm) if info is not None and len(info) > 0: # Enrich citation information with identifiers from PMC parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8') pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation)) if len(pmids) > 0: pubmed_abstracts = etree.fromstring( utopialib.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser) for idList in pubmed_abstracts.xpath( 'PubmedArticle/PubmedData/ArticleIdList'): #print etree.tostring(idList) pmid = idList.findtext('ArticleId[@IdType="pubmed"]') if pmid in pmids: citation = info['citations_by_id'][pmids[pmid]] for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')): id = idList.findtext( 'ArticleId[@IdType="{0}"]'.format(id_name)) if key_name not in citation and id is not None: citation[key_name] = id #print 'KEY', key_name, id # Create Metadata link annotation link = document.newAccList('metadata', 100) link['property:sourceIcon'] = utopia.get_plugin_data_as_url( 'images/biochemj.png', 'image/png') link['property:sourceTitle'] = 'Portland' link['property:sourceDescription'] = ''' <p><a href="http://www.portlandpress.com/">Portland Press Limited</a>.</p> ''' # Create Metadata annotation annotation = utopialib.utils.citation_to_annotation( info.get('self', {}), 'DocumentMetadata') document.addAnnotation(annotation, link['scratch']) # Create Bibliography annotations for citation in info.get('citations', []): annotation = utopialib.utils.citation_to_annotation( citation) document.addAnnotation(annotation, link['scratch']) ####################################################################################### # Apply parsed data to document # Citations for citation in info['citations']: # Find cross refs for pre, label, post in citation.get('contexts', []): matches = document.findInContext(pre, label, post) #print matches if len(matches) > 0: try: annotation = utopialib.utils.citation_to_annotation( citation, concept='ForwardCitation') if 'doi' in citation and citation[ 'doi'].startswith('10.1371/'): 
citation[ 'pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format( 'info:doi/{0}'.format( citation['doi'])) if 'pmcid' in citation: citation[ 'pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format( citation['pmcid']) for match in matches: annotation.addExtent(match) document.addAnnotation(annotation, link['scratch']) except: raise for id, table in info.get('tables', {}).iteritems(): if 'caption' in table and 'xml' in table: regex = fuzz(table['caption'], strict=True) #print regex matches = document.search( regex, spineapi.RegExp + spineapi.IgnoreCase) if len(matches) == 1: annotation = spineapi.Annotation() annotation['concept'] = 'Table' annotation[ 'session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode( table['xml']) annotation['session:volatile'] = '1' annotation.addExtent(matches[0]) document.addAnnotation(annotation, link['scratch']) else: print '*********** failed to match table:', id
def on_ready_event(self, document): info = common.nlm.parse(common.utils.metadata(document, "raw_pmc_nlm")) if info is not None and len(info) > 0: # Enrich citation information with identifiers from PMC parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding="utf8") pmids = dict( ( (citation["pmid"], citation["id"]) for citation in info["citations"] if "pmid" in citation and "id" in citation ) ) if len(pmids) > 0: pubmed_abstracts = etree.fromstring( common.eutils.efetch(id=",".join(pmids.keys()), retmode="xml", rettype="abstract"), parser ) for idList in pubmed_abstracts.xpath("PubmedArticle/PubmedData/ArticleIdList"): # print etree.tostring(idList) pmid = idList.findtext('ArticleId[@IdType="pubmed"]') if pmid in pmids: citation = info["citations_by_id"][pmids[pmid]] for key_name, id_name in (("doi", "doi"), ("pmcid", "pmc"), ("pii", "pii")): id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name)) if key_name not in citation and id is not None: citation[key_name] = id # Create Metadata link annotation link = document.newAccList("metadata", 50) link["property:sourceDatabase"] = "pmc" link["property:sourceTitle"] = "PubMed Central" link[ "property:sourceDescription" ] = '<p><a href="http://www.ncbi.nlm.nih.gov/pmc/">PubMed Central</a> is the U.S. National Institutes of Health (NIH) digital archive of biomedical and life sciences journal literature.</p>' # Create Metadata annotation annotation = spineapi.Annotation() annotation["concept"] = "DocumentMetadata" for k in self.keys: v = info.get(k) if v is not None: annotation["property:{0}".format(k)] = v document.addAnnotation(annotation, link["scratch"]) # Create Bibliography annotations for citation in info.get("citations", []): annotation = spineapi.Annotation() annotation["concept"] = "DocumentReference" for k in self.keys: v = citation.get(k) if v is not None: annotation["property:{0}".format(k)] = v document.addAnnotation(annotation, link["scratch"]) # Citations for citation in info["citations"]: # Find cross refs for pre, label, post in citation.get("contexts", []): matches = document.findInContext(pre, label, post) # print matches if len(matches) > 0: try: annotation = spineapi.Annotation() annotation["concept"] = "ForwardCitation" annotation["property:state"] = "found" if "title" in citation: annotation["property:title"] = citation["title"] if "id" in citation: annotation["property:bibid"] = citation["id"] if "doi" in citation and citation["doi"].startswith("10.1371/"): citation[ "pdf" ] = "http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF".format( "info:doi/{0}".format(citation["doi"]) ) if "pmcid" in citation: citation["pdf"] = "http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/".format( citation["pmcid"] ) # print citation for k in self.keys + ("authors", "pdf", "first_author_surname"): if k in citation: annotation["property:{0}".format(k)] = citation[k] # print annotation.get('property:label'), annotation.get('property:pdf') for match in matches: annotation.addExtent(match) document.addAnnotation(annotation, link["scratch"]) # print citation except: raise pass # FIXME # Tables for id, table in info.get("tables", {}).iteritems(): if "caption" in table and "xml" in table: regex = fuzz(table["caption"], strict=True) matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase) if len(matches) == 1: annotation = spineapi.Annotation() annotation["concept"] = "Table" annotation[ "session:upload_files" ] = 
"data:application/xml;name=data.xml;base64,%s" % base64.standard_b64encode(table["xml"]) annotation.addExtent(matches[0]) document.addAnnotation(annotation, link["scratch"]) else: print "*********** failed to match table:", id
def after_ready_event(self, document): # Make an annotation for all these metadata ids = { 'doi': ('DOI', u'<a href="http://dx.doi.org/{0}">{0}</a>'), 'issn': ('ISSN', u'<strong>{0}</strong>'), 'pii': ('PII', u'<strong>{0}</strong>'), 'pubmed': ('Pubmed', u'<a href="http://www.ncbi.nlm.nih.gov/pubmed/{0}">{0}</a>'), 'pmc': ('PMC', u'<a href="http://www.ncbi.nlm.nih.gov/pmc/articles/{0}">{0}</a>' ), 'arxiv': ('arXiv', u'<a href="http://arxiv.org/abs/{0}">{0}</a>'), } # Build list of fragments fragments = [] pub_icon = '' html = ''' <style> .fancy_quotes { position: relative; } .fancy_quotes:before { content: "\\201C"; } .fancy_quotes:after { content: "\\201D"; } </style> ''' for key, (name, format) in ids.iteritems(): id = utopialib.utils.metadata(document, 'identifiers[{0}]'.format(key)) if id is not None: fragments.append( u'<td style="text-align: right; opacity: 0.7">{0}:</td><td>{1}</td>' .format(name, format.format(id))) issn = utopialib.utils.metadata(document, 'publication-issn') if issn is not None: fragments.append( u'<td style="text-align: right; opacity: 0.7">{0}:</td><td><strong>{1}</strong></td>' .format('ISSN', issn)) # Resolve publisher info for annotation in document.annotations('PublisherMetadata'): if annotation.get('concept') == 'PublisherIdentity': logo = annotation.get('property:logo') title = annotation.get('property:title') webpageUrl = annotation.get('property:webpageUrl') if None not in (logo, title, webpageUrl): pub_icon = u'<a href="{0}" title="{2}"><img src="{1}" alt="{2}" /></a></td>'.format( webpageUrl, logo, title) break # Compile fragments title = utopialib.utils.metadata(document, 'title') if title is not None or len(pub_icon) > 0: html += u'<table style="border: none; margin: 0 0 1em 0;">' html += u'<tr>' if title is not None: html += u'<td style="text-align:left; vertical-align: middle;"><strong class="nohyphenate fancy_quotes">{0}</strong></td>'.format( title.strip()) if len(pub_icon) > 0: html += u'<td style="text-align:right; vertical-align: middle; width: 80px;">{0}</td>'.format( pub_icon) html += u'</tr>' html += u'</table>' if len(fragments) > 0: html += u'<div class="box">' html += u'<table style="border: none">' html += u'<tr>' html += u'</tr><tr>'.join(fragments) html += u'</tr>' html += u'</table>' html += u'</div>' annotation = spineapi.Annotation() annotation['concept'] = 'Collated' annotation['property:html'] = html annotation['property:name'] = 'About this article' annotation['session:weight'] = '1000' annotation['session:default'] = '1' annotation['session:headless'] = '1' document.addAnnotation(annotation)
def on_ready_event(self, document): # Get resolved DOI doi = utopia.tools.utils.metadata(document, 'identifiers[doi]') # Only for PLOS DOIs should this plugin do anything if doi is not None and doi.startswith('10.1371/'): # Record the publisher identity information annotation = spineapi.Annotation() annotation['concept'] = 'PublisherIdentity' annotation['property:logo'] = utopia.get_plugin_data_as_url( 'images/large_logo.jpg', 'image/jpg') annotation['property:title'] = 'PLOS' annotation['property:webpageUrl'] = 'http://www.plos.org/' document.addAnnotation(annotation, 'PublisherMetadata') # Attempt to get ALMs from PLOS API query = { 'api_key': self.api_key, 'info': 'detail', 'ids': doi, 'type': 'doi' } url = 'http://alm.plos.org/api/v5/articles?{0}'.format( urllib.urlencode(query)) request = urllib2.Request(url, headers={'Accept': 'application/json'}) try: data = urllib2.urlopen(request, timeout=8).read() alm = json.loads(data) # Not found except urllib2.HTTPError as e: if e.code == 404: # just ignore 404 return raise articles = alm.get('data', []) if len(articles) > 0: article = articles[0] metrics = dict(((source.get('name'), source.get('metrics')) for source in article.get('sources', []))) plos_pdf_views = (metrics.get('counter') or {}).get('pdf') or 0 plos_html_views = (metrics.get('counter') or {}).get('html') or 0 pmc_pdf_views = (metrics.get('pmc') or {}).get('pdf') or 0 pmc_html_views = (metrics.get('pmc') or {}).get('html') or 0 annotation = spineapi.Annotation() annotation['concept'] = 'PLOSALMRecord' annotation['property:doi'] = doi annotation['property:name'] = 'PLOS' annotation['property:description'] = 'Download statistics' annotation['property:plos_pdf_views'] = plos_pdf_views annotation['property:plos_html_views'] = plos_html_views annotation['property:pmc_pdf_views'] = pmc_pdf_views annotation['property:pmc_html_views'] = pmc_html_views annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url( 'images/small_logo.png', 'image/png') annotation['property:sourceDescription'] = '<p><a href="http://www.plos.org/">PLOS</a> article level metrics for downloads.</p>' document.addAnnotation(annotation)
def on_ready_event(self, document): # See if we have any publishers' NLM hosted for this DOI doi = common.utils.metadata(document, 'doi') #print '----- DOI', doi if doi is not None: info = None try: url = 'https://utopia.cs.manchester.ac.uk/ext/hosted/nlm?' url += urllib.urlencode({'doi': doi.encode('utf8')}) nlm = urllib2.urlopen(url, timeout=8).read() info = common.nlm.parse(nlm) except (urllib2.URLError, socket.timeout): # info will remain None pass #print info if info is not None and len(info) > 0: # Enrich citation information with identifiers from PMC parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8') pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation)) if len(pmids) > 0: pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser) for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'): #print etree.tostring(idList) pmid = idList.findtext('ArticleId[@IdType="pubmed"]') if pmid in pmids: citation = info['citations_by_id'][pmids[pmid]] for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')): id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name)) if key_name not in citation and id is not None: citation[key_name] = id #print 'KEY', key_name, id # Generate sensible titles / descriptions / icons? journalTitle = info.get('publication-title', '') journalTitleSuffix = '' publisher = info.get('publisher', 'the publisher') if len(journalTitle) > 0: journalTitleSuffix = ' ({0})'.format(journalTitle) # Create Metadata link annotation link = document.newAccList('metadata', 100) link['property:sourceIcon'] = '' link['property:sourceTitle'] = publisher link['property:sourceDescription'] = ''' <p>This information was provided by {0}{1}.</p> '''.format(publisher, journalTitleSuffix) # Publisher identity if doi[:8] in ('10.1104/', '10.1105/'): annotation = spineapi.Annotation() annotation['concept'] = 'PublisherIdentity' logo = utopia.get_plugin_data_as_url('images/aspb_logo.png', 'image/png') webpageUrl = 'http://www.aspb.org/' title = publisher #print '====', publisher, '---', journalTitle, '---', webpageUrl if doi.startswith('10.1104/'): logo = utopia.get_plugin_data_as_url('images/pp_logo.png', 'image/png') title = journalTitle webpageUrl = 'http://www.plantphysiol.org/' elif doi.startswith('10.1105/'): logo = utopia.get_plugin_data_as_url('images/tpc_logo.png', 'image/png') title = journalTitle webpageUrl = 'http://www.plantcell.org/' annotation['property:logo'] = logo annotation['property:title'] = title annotation['property:webpageUrl'] = webpageUrl document.addAnnotation(annotation, 'PublisherMetadata') link['property:sourceIcon'] = logo link['property:sourceTitle'] = title # Create Metadata annotation annotation = spineapi.Annotation() annotation['concept'] = 'DocumentMetadata' for k in self.keys: v = info.get(k) if v is not None: annotation['property:{0}'.format(k)] = v document.addAnnotation(annotation, link['scratch']) # Create Bibliography annotations for citation in info.get('citations', []): annotation = spineapi.Annotation() annotation['concept'] = 'DocumentReference' for k in self.keys: v = citation.get(k) if v is not None: annotation['property:{0}'.format(k)] = v document.addAnnotation(annotation, link['scratch']) ####################################################################################### # Apply parsed data to document # Citations for 
citation in info['citations']: # Find cross refs for pre, label, post in citation.get('contexts', []): matches = document.findInContext(pre, label, post) #print matches if len(matches) > 0: try: annotation = spineapi.Annotation() annotation['concept'] = 'ForwardCitation' annotation['property:state'] = 'found' if 'title' in citation: annotation['property:title'] = citation['title'] if 'id' in citation: annotation['property:bibid'] = citation['id'] if 'doi' in citation and citation['doi'].startswith('10.1371/'): citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi'])) if 'pmcid' in citation: citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid']) for k in ('displayText', 'label', 'pdf', 'pmid', 'pmc', 'pii', 'doi', 'first_author_surname', 'year', 'journal', 'volume', 'page_from'): if k in citation: annotation['property:{0}'.format(k)] = citation[k] #print annotation.get('property:label'), annotation.get('property:pdf') for match in matches: annotation.addExtent(match) document.addAnnotation(annotation, link['scratch']) #print citation except: raise pass # FIXME for id, table in info.get('tables', {}).iteritems(): if 'caption' in table and 'xml' in table: regex = fuzz(table['caption'], strict = True) #print regex # convert oasis tables ns = {'oasis': 'http://docs.oasis-open.org/ns/oasis-exchange/table'} xml = etree.fromstring(table['xml']) if xml.tag == '{{{0}}}table'.format(ns['oasis']): for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns): columns = {} for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns): columns[colspec.get('colname')] = int(colspec.get('colnum')) for section in tgroup.xpath('./oasis:thead|./oasis:tbody', namespaces=ns): isHead = (section.tag == '{{{0}}}thead'.format(ns['oasis'])) for row in section.xpath('./oasis:row', namespaces=ns): for entry in row.xpath('./oasis:entry', namespaces=ns): colname = entry.get('colname') colst = entry.get('namest') colend = entry.get('nameend') if colst is not None and colend is not None: colspan = columns[colend] - columns[colst] + 1 else: colspan = 1 if colspan > 1: entry.set('colspan', unicode(colspan)) morerows = entry.get('morerows') if morerows is not None: rowspan = int(morerows) + 1 else: rowspan = 1 if rowspan > 1: entry.set('rowspan', unicode(rowspan)) entry.tag = 'td' row.tag = 'tr' if isHead: section.tag = 'thead' else: section.tag = 'tbody' xml.append(section) xml.tag = 'table' #print etree.tostring(xml, pretty_print=True, encoding='utf8') table['xml'] = etree.tostring(xml, encoding='utf8') matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase) if len(matches) == 1: annotation = spineapi.Annotation() annotation['concept'] = 'Table' annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml']) annotation['session:volatile'] = '1' annotation.addExtent(matches[0]) document.addAnnotation(annotation, link['scratch']) else: print '*********** failed to match table:', id
def on_ready_event(self, document): css = ''' <style> .discussion .author { font-weight: bold; } .discussion .timestamp { font-size: 0.9em; font-style: italic; } .discussion .author a { color: black !important; } .discussion .timestamp a { color: inherit !important; } </style> ''' identifiers = utopia.tools.utils.metadata(document, 'identifiers') if identifiers is None or len(identifiers) == 0: return discussions = self._get_discussion(identifiers) if discussions is None or not isinstance(discussions, dict): return for discussion_source in discussions.get('discussions', []): comments = [] mentions = {} for comment in sorted(discussion_source.get('comments', []), key=lambda c: c['timestamp']): comments.append( u'<div class="box commnt limited-height">{}<p></p>{}</div>' .format(self._format_header(comment), comment['content'])) for mention in discussion_source.get('mentions', []): key = self._identifiers_to_key(mention['identifiers']) mentions[key] = mention['identifiers'] if not mentions and not comments: continue html = u'<div class="discussion">' a = spineapi.Annotation() a['concept'] = 'Collated' a['property:name'] = discussion_source['source']['title'] a['property:sourceDatabase'] = discussion_source['source']['title'] a['property:sourceDescription'] = '<p>{}</p>'.format( discussion_source['source']['description']) a['property:sourceIcon'] = discussion_source['source']['icon'] a['property:description'] = 'Comments related to this article' if comments: html += u'' html += u'\n\n'.join(comments) if mentions: mention_html = u'\n\n'.join([ utopia.citation.render(dict(identifiers=citation), process=True, links=True) for citation in mentions.itervalues() ]) html += u'<div class="box"><p>This article was mentioned by a comment in:</p>{}</div>\n\n'.format( mention_html) html += '</div>' a['property:html'] = css, html document.addAnnotation(a)
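# NOTE: _identifiers_to_key() is used above to de-duplicate mentions; the
# real helper is not shown in this section. A minimal sketch, assuming the
# identifiers form a flat dict, would canonicalise them into a hashable key:
def _identifiers_to_key_sketch(identifiers):
    return tuple(sorted((k, v) for k, v in identifiers.iteritems() if v))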
def on_ready_event(self, document): #print "RUNNING DRYAD PLUGIN" doi = common.utils.metadata(document, 'doi') if doi is not None: # see if kend knows about this DOI as a Dryad record response = urllib2.urlopen('https://utopia.cs.man.ac.uk/kend/0.7/define/lookup?database=dryad&term=%s&limit=1000' % doi, timeout=8) root = etree.fromstring(response.read()) dryadShortHandle = root.findtext('kend:group/kend:annotation/kend:properties/property:databaseTerm', namespaces=ns) if dryadShortHandle is not None: # then we have a dryad short-form doi, so can now safely go to dryad to get the rest response = urllib2.urlopen('http://datadryad.org/solr/search/select/?q=dc.relation.isreferencedby:%s&fl=dc.identifier,dc.title_ac,dc.identifier.uri,dc.contributor.author,dc.date.issued.year,dc.identifier.citation,dc.description' % doi, timeout=8) root = etree.fromstring(response.read()) #print etree.tostring(root, pretty_print=True, encoding='utf8') result = root.find('result') if result.attrib['numFound'] != '0': # then we have found some datasets for this article DOI packageDetails = urllib2.urlopen('http://datadryad.org/metadata/handle/%s/mets.xml' % dryadShortHandle, timeout=8) root = etree.fromstring(packageDetails.read()) #print etree.tostring(root, pretty_print=True, encoding='utf8') identifiers = root.findall('mets:dmdSec/mets:mdWrap/mets:xmlData/dim:dim/dim:field[@element="identifier"]', namespaces=ns) packageDOI = None for identifier in identifiers: if identifier.text.startswith('doi:'): packageDOI = identifier.text[4:] break contributors = root.findall('mets:dmdSec/mets:mdWrap/mets:xmlData/dim:dim/dim:field[@qualifier="author"]', namespaces=ns) dataCitation = { 'year': result.findtext("doc/arr[@name='dc.date.issued.year']/int", namespaces=ns), 'title': root.findtext('mets:dmdSec/mets:mdWrap/mets:xmlData/dim:dim/dim:field[@element="title"]', namespaces=ns), 'authors': [string.capwords(a.text) for a in contributors], 'source': 'Dryad Digital Repository', 'doi': packageDOI, } articleCitation = root.findtext('mets:dmdSec/mets:mdWrap/mets:xmlData/dim:dim/dim:field[@qualifier="citation"][@element="identifier"]', namespaces=ns) xhtml = ''' <p> The data associated with this article are available via Dryad. When using these data, please cite both the article: </p> <div class="box">{0}<br /><a href="http://dx.doi.org/{1}">doi:{1}</a></div> <p> and also the data package: </p> <div class="box">{2}<br /><a href="http://dx.doi.org/{3}">doi:{3}</a></div> '''.format(articleCitation, doi, common.utils.format_citation(dataCitation), dataCitation['doi']) a = spineapi.Annotation() a['concept'] = 'Dryad' a['property:name'] = 'Dryad' a['property:sourceDatabase'] = 'dryad' a['property:sourceDescription'] = '<p><a href="http://datadryad.org/">Dryad</a> is an international repository of data underlying peer-reviewed articles in the basic and applied biosciences.</p>' a['property:description'] = 'Data associated with this article' a['property:xhtml'] = xhtml document.addAnnotation(a)
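# NOTE: common.utils.format_citation() is used above to render the Dryad data
# package citation; its real implementation is elsewhere. A rough sketch of
# the assumed behaviour, given the keys built in dataCitation above:
def format_citation_sketch(citation):
    authors = ', '.join(citation.get('authors', []))
    return u'{0} ({1}) {2}. {3}.'.format(authors,
                                         citation.get('year', ''),
                                         citation.get('title', ''),
                                         citation.get('source', ''))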
def on_filter_event(self, document, data=None): for a in document.annotations(): if a.get( 'author' ) == 'http://utopia.cs.manchester.ac.uk/users/11679' and a.get( 'concept') in ('Definition', 'DatabaseEntry' ) and 'session:legacy' not in a: document.removeAnnotation(a) identifier = a.get('property:identifier', '') if identifier.startswith('http://bio2rdf.org/pdb:'): # PDB entry a2 = spineapi.Annotation() a2['concept'] = 'DatabaseEntry' a2['author'] = a['author'] a2['session:volatile'] = '1' a2['session:legacy'] = '1' a2['property:sourceDatabase'] = 'pdb' a2['property:sourceDescription'] = '<p>The <a href="http://www.rcsb.org/">Protein Data Bank</a> of the Research Collaboratory for Structural Bioinformatics (<a href="http://home.rcsb.org/">RCSB</a>).</p>' a2['property:identifier'] = identifier a2['property:description'] = 'PDB entry {0}'.format( identifier[-4:].upper()) if 'property:name' in a: a2['property:name'] = a['property:name'][:-11] if 'property:imageUrl' in a: a2['property:imageUrl'] = a['property:imageUrl'] if 'property:molecularDescription' in a: a2['property:molecularDescription'] = a[ 'property:molecularDescription'] if 'property:webpageUrl' in a: a2['property:webpageUrl'] = a['property:webpageUrl'] if 'property:embedded' in a: a2['property:embedded'] = a['property:embedded'] for extent in a.extents(): a2.addExtent(extent) for area in a.areas(): a2.addArea(area) document.addAnnotation(a2) if identifier.startswith('http://dbpedia.org/resource/'): # Wikipedia entry a2 = spineapi.Annotation() a2['concept'] = 'Definition' a2['author'] = a['author'] a2['session:volatile'] = '1' a2['session:legacy'] = '1' a2['property:sourceDatabase'] = 'wikipedia' a2['property:sourceDescription'] = '<p>Structured <a href="http://www.wikipedia.org/">Wikipedia</a> information provided by the <a href="http://DBpedia.org/">DBpedia</a> project.</p>' a2['property:description'] = a.get('property:summary', 'Wikipedia entry') if 'property:name' in a: a2['property:name'] = a['property:name'] if 'property:identifier' in a: a2['property:identifier'] = a['property:identifier'] if 'property:imageUrl' in a: a2['property:imageUrl'] = a['property:imageUrl'] if 'property:summary' in a: a2['property:summary'] = a['property:summary'] if 'property:webpageUrl' in a: a2['property:webpageUrl'] = a['property:webpageUrl'] for extent in a.extents(): a2.addExtent(extent) for area in a.areas(): a2.addArea(area) document.addAnnotation(a2) if identifier.startswith( 'http://www.portlandpress.com/utopia/glick/'): # Wikipedia entry a2 = spineapi.Annotation() a2['concept'] = 'Definition' a2['author'] = a['author'] a2['session:volatile'] = '1' a2['session:legacy'] = '1' a2['property:sourceDatabase'] = 'glick' a2['property:sourceDescription'] = '<p>David M. Glick\'s <a href="http://www.portlandpress.com/pp/books/online/glick/search.htm">Glossary of Biochemistry and Molecular Biology</a>.</p><p>Made available by <a href="http://www.portlandpress.com/">Portland Press Limited</a>.</p>' a2['property:description'] = a[ 'property:description'] + '<p><em>(Glick Glossary/Portland Press Ltd.)</em></p>' a2['property:name'] = a['property:name'] for extent in a.extents(): a2.addExtent(extent) for area in a.areas(): a2.addArea(area) document.addAnnotation(a2)
def after_ready_event(self, document): # Get (if present) the RSCMetadataLink annotation for annotation in document.annotations(): if annotation.get('concept') == 'RSCMetadataLink': text = document.text() doi = annotation['property:doi'].upper() rscId = annotation['property:rscId'].upper() xmlquery = '<SearchCriteria><SearchTerm><Category>Journal</Category><ContentType>All</ContentType><Criterias><NameValue><Name>FreeText</Name><Value>"%s"</Value></NameValue></Criterias><Source>Utopia</Source></SearchTerm><PageNo>1</PageNo><PageSize>10</PageSize><SortBy>Relevance</SortBy></SearchCriteria>' % doi baseurl = 'http://pubs.rsc.org/en/federated/search' params = { 'federatedsearchname': 'Utopia', 'inputxml': xmlquery } url = baseurl + '?%s' % urllib.urlencode(params) searchresult = urllib2.urlopen(url, timeout=14).read() root = etree.fromstring(searchresult) #print etree.tostring(root, pretty_print=True, encoding='utf8') articles = root.findall('./{http://www.rsc.org/schema/rscart38}article') #print articles # the search use above can return more than one article, so select out only the one with # the correct doi thearticle = None articleID = None for article in articles: found_doi = article.findtext("./{http://www.rsc.org/schema/rscart38}metainfo/{http://www.rsc.org/schema/rscart38}meta[@field='doi']") if found_doi is None: found_doi = article.findtext("./{http://www.rsc.org/schema/rscart38}art-admin/{http://www.rsc.org/schema/rscart38}doi") if found_doi is not None and found_doi.upper() == doi: thearticle = article articleIDelem = article.find("./{http://www.rsc.org/schema/rscart38}metainfo/{http://www.rsc.org/schema/rscart38}meta[@field='docid']") if articleIDelem is not None: articleID = articleIDelem.text break # if we get back a single valid article... if thearticle != None: #print articleID compoundsInArticle = [] compoundText = {} annotationsInArticle = [] annotationText = {} # create a list of all the compounds that are mentioned in the article body compnames = thearticle.findall('./{http://www.rsc.org/schema/rscart38}art-body/{http://www.rsc.org/schema/rscart38}compname') #print compnames for compname in compnames: # This line removes (erroneous?) elements from inside the XML etree.strip_elements(compname, '{http://www.rsc.org/schema/rscart38}compound', with_tail=False) #print compname.attrib['idrefs'], compname.text compoundsInArticle.append(compname.attrib['idrefs']) compoundText[compname.attrib['idrefs']] = etree.tounicode(compname, method='text') annotationnames = thearticle.findall('./{http://www.rsc.org/schema/rscart38}art-body/{http://www.rsc.org/schema/rscart38}annref') #print annotationnames for annotationname in annotationnames: # This line removes (erroneous?) 
elements from inside the XML etree.strip_elements(annotationname, '{http://www.rsc.org/schema/rscart38}annotation', with_tail=False) #print annotationname.attrib['idrefs'], annotationname.text annotationsInArticle.append(annotationname.attrib['idrefs']) annotationText[annotationname.attrib['idrefs']] = etree.tounicode(annotationname, method='text') #print compoundText, annotationText #sprint annotationsInArticle # then for all the compounds that are defined in the article back compounds = thearticle.findall('./{http://www.rsc.org/schema/rscart38}art-back/{http://www.rsc.org/schema/rscart38}compound') for compound in compounds: id = compound.attrib['id'] if id in compoundsInArticle: url = None id = compound.attrib['id'] # if the compound has a CSID, then the URL links to the chemspider page csid = compound.find("./{http://www.rsc.org/schema/rscart38}link[@type='CSID']" ) # if the compound has a CSID, create a Chemspider URL for it if csid is not None and csid.text is not None: url = 'http://www.chemspider.com/Chemical-Structure.%s.html' % csid.text[5:] else: # otherwise, use the RSC landing page url = 'http://www.rsc.org/publishing/journals/prospect/cheminfo.asp?XMLID=%s&compoundtext=%s&MSID=%s' % (id[4:], compoundText[id], articleID) if url is not None: options = spineapi.WholeWordsOnly + spineapi.IgnoreCase matches = document.search(compoundText[id], options) annotation = spineapi.Annotation() annotation['concept'] = 'Hyperlink' annotation['property:webpageUrl'] = url for match in matches: annotation.addExtent(match) document.addAnnotation(annotation) # similarly, for all the annotations annotations = thearticle.findall('./{http://www.rsc.org/schema/rscart38}art-back/{http://www.rsc.org/schema/rscart38}annotation') for annotation in annotations: id = annotation.attrib['id'] url = None if id in annotationsInArticle: id = annotation.attrib['id'] # get the link id link = annotation.findtext("./{http://www.rsc.org/schema/rscart38}link" ) # if the compound has a link, create an RSC ontology landing page for it if link is not None: if link[:3] == 'AU:': url = 'http://goldbook.iupac.org/%s.html' % link[3:] else: url = 'http://www.rsc.org/publishing/journals/prospect/ontology.asp?id=%s&MSID=%s' % (link, articleID) if url is not None: matches = document.search(annotationText[id], spineapi.IgnoreCase + spineapi.WholeWordsOnly) annotation = spineapi.Annotation() annotation['concept'] = 'Hyperlink' annotation['property:webpageUrl'] = url for match in matches: annotation.addExtent(match) document.addAnnotation(annotation) break
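# --- Illustrative sketch (not part of the original plugin) -----------------
# The RSC handler above builds its federated-search URL with urlencode and
# then walks the rscart38-namespaced result tree using fully-qualified
# '{namespace}tag' names. A stand-alone illustration of both steps; the DOI,
# query and sample XML below are invented, and lxml plus the Python 2 urllib
# used elsewhere in this file are assumed.

import urllib
from lxml import etree

RSC_NS = 'http://www.rsc.org/schema/rscart38'

params = {'federatedsearchname': 'Utopia',
          'inputxml': '<SearchCriteria>...</SearchCriteria>'}  # abbreviated
url = 'http://pubs.rsc.org/en/federated/search?' + urllib.urlencode(params)

sample = '''<searchresult xmlns="http://www.rsc.org/schema/rscart38">
  <article>
    <art-admin><doi>10.1039/XXXXXXX</doi></art-admin>
  </article>
</searchresult>'''

root = etree.fromstring(sample)
for article in root.findall('./{%s}article' % RSC_NS):
    # findtext returns None when an element is missing, which is why the
    # plugin falls back to a second location when looking for the DOI
    print(article.findtext('./{%s}art-admin/{%s}doi' % (RSC_NS, RSC_NS)))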
def on_ready_event(self, document): '''Fetch information from the Lazarus service''' permission = self.get_config('permission', False) if permission: # If an outline already exists, don't make a new one needs_outline = True for annotation in document.annotations(): if annotation.get('concept') == 'OutlineItem': needs_outline = False break # The Lazarus server needs to know what this document is document_id = utopia.tools.utils.metadata(document, 'identifiers[utopia]') this_doi = utopia.tools.utils.metadata(document, 'identifiers[doi]') if this_doi is not None: this_doi = u'doi:' + this_doi # Speak to server params = {'fingerprint': document.fingerprints()} url = '{0}?{1}'.format(laz_docUrl, urllib.urlencode(params, doseq=True)) response = urllib2.urlopen(url, timeout=60) if response.getcode() == 204: request = urllib2.Request( url, data=document.data(), headers={'Content-Type': 'application/pdf'}) response = urllib2.urlopen(request, timeout=60) #response = open('/Users/dave/Desktop/ananiadou_tibtech06.pdf-response.xml', 'r') # Create Metadata link annotation link = document.newAccList('metadata', 50) link['property:sourceDatabase'] = 'lazarus' link['property:sourceTitle'] = 'Lazarus' link['property:sourceDescription'] = self.sourceDescription link['property:sourceIcon'] = utopia.get_plugin_data_as_url( 'images/lazarus-prefs-logo.png', 'image/png') headers = [] pos = [] refs = [] annotations = [] concepts = {} hits = [] expression_annotations = [] for kAnnotation in kend.converter.XML.parse( response, kend.model.Document): #print kend.converter.XML.serialise(kAnnotation)[0] try: annotations.append( utopia.tools.converters.Annotation.kend2spineapi( kAnnotation, document)) except: pass annotations.sort(key=lambda a: int(a.get('structure:order', 0))) for sAnnotation in annotations: if sAnnotation['concept'] == 'structure_element': role, level = self.getHeaderRole(sAnnotation) if role is not None and needs_outline: while len(pos) < level: pos.append(0) while len(pos) > level: pos.pop() pos[-1] += 1 outline = u'.'.join([unicode(i) for i in pos]) anchor_name = '#lazarus.outline.{0}'.format(outline) anchor = spineapi.Annotation() anchor['concept'] = 'Anchor' anchor['property:anchor'] = anchor_name anchor.addExtents(sAnnotation.extents()) anchor.addAreas(sAnnotation.areas()) document.addAnnotation(anchor) header = spineapi.Annotation() header['concept'] = 'OutlineItem' header['property:outlinePosition'] = outline header['property:outlineTitle'] = u' '.join( [e.text() for e in sAnnotation.extents()]) header['property:destinationAnchorName'] = anchor_name document.addAnnotation(header) print((u' ' * level + u'.'.join([unicode(i) for i in pos]) + u' ' + u' '.join([ e.text() for e in sAnnotation.extents() ])).encode('utf8')) elif 'bibitem' in sAnnotation.getAllProperties( 'structure:role'): #refs.append(sAnnotation) pass elif sAnnotation['concept'] == 'Citation': # Hack to fix a mistake in authors property name if 'property:author' in sAnnotation and not 'property:authors' in sAnnotation: sAnnotation[ 'property:authors'] = sAnnotation.getAllProperties( 'property:author') refs.append(sAnnotation) elif sAnnotation['concept'] == 'LazarusConcept': concept_id = sAnnotation.get('property:identifier') if concept_id is not None: sAnnotation['id'] = str(uuid.uuid4()) concepts[concept_id] = sAnnotation document.addAnnotation(sAnnotation, 'Lazarus Concept') elif sAnnotation['concept'] == 'LazarusConceptHit': hits.append(sAnnotation) elif sAnnotation['concept'] == 'LazarusSentenceExpression': 
expression_annotations.append(sAnnotation) else: document.addAnnotation(sAnnotation) for ref in refs: #print(ref.get('structure:order', '0')) pass refs = sorted(refs, key=lambda ref: int(ref.get('property:order', '0'))) for ref in refs: #print(ref.get('structure:order', '0')) pass for ref in refs: # Create Bibliography annotations #citation = {'unstructured': u' '.join([e.text() for e in ref.extents()])} #annotation = utopia.tools.utils.citation_to_annotation(citation) #annotation['property:order'] = ref.get('structure:order') #annotation.addExtents(ref.extents()) #annotation.addAreas(ref.areas()) #document.addAnnotation(annotation, link['scratch']) document.addAnnotation(ref, link['scratch']) # Now link hits to concepts for i, hit in enumerate(hits): concept_id = hit.get('property:identifier') concept = concepts.get(concept_id) if concept is not None: concept_uuid = concept.get('id') hit['property:concept_id'] = concept_uuid identifier = concept.get('property:identifier') name = concept.get('property:name', '???') sources = concept.get('property:externalSources', 'json:[]') if sources.startswith('json:'): sources = json.loads(sources[5:]) if 'property:stdInchiKey' in concept: sources.append({ 'database': ' InchiKey', 'identifier': concept['property:stdInchiKey'] }) if 'property:canonicalSmiles' in concept: sources.append({ 'database': ' SMILES', 'identifier': concept['property:canonicalSmiles'] }) kind = concept.get('property:kind') kind = self.dbs.get(kind, {}).get('title', kind) links = {} for source in sources: uri = source.get('uri') if 'primary' in source.get('relationship', []): links.setdefault('definition', []) links['definition'].append(u''' <a href="{uri}" title="{uri}">{database}</a> '''.format(**source)) elif uri is None: if source.get('database') in (' InchiKey', ' SMILES'): links.setdefault('main', []) links['main'].append(u''' <tr><td>{database}:</td><td>{identifier}</td></tr> '''.format(**source)) else: identifier = source.get('identifier') links_category = 'xref' if 'seeAlso' in source.get('relationship', []) or uri is None: links_category = 'seeAlso' links.setdefault(links_category, []) if identifier is not None: links[links_category].append(u''' <a href="{uri}" title="{uri}">{name}...</a> ({identifier}) '''.format(**source)) else: links[links_category].append(u''' <a href="{uri}" title="{uri}">{name}...</a> '''.format(**source)) style = u''' <style> .lazarus-table tbody { border: none; } .lazarus-table td:first-of-type { text-align: right; font-weight: bold; } .lazarus-table td { vertical-align: top; } .lazarus-table td:first-of-type { white-space: nowrap; } .lazarus-table td:not(:first-of-type) { word-break: break-all; } .lazarus-table tr td { padding-top: 0ex; padding-bottom: 0ex; } .lazarus-table tbody:not(:first-of-type) tr:first-of-type td { padding-top: 1ex; } </style> ''' html = u''' <table class="lazarus-table"> <tr><td>Name:</td><td>{name}</td></tr> '''.format(**{'name': name}) categories = { 'xref': 'Related:', 'seeAlso': 'See also:', 'definition': 'Defined in:' } for links_category in ('main', 'xref', 'seeAlso', 'definition'): links_title = categories.get(links_category) these_links = sorted( list(set(links.get(links_category, [])))) if len(these_links) > 0: html += '<tbody>' if links_category != 'main': html += u'<tr><td>{0}</td><td>'.format( links_title) html += u'<br>'.join(these_links) html += '</td></tr>' else: html += ''.join(these_links) html += '</tbody>' #pprint('------------------------') html += u''' </table> ''' #print(html) hasLinks = len( 
links.get('xref', []) + links.get('seeAlso', [])) > 0 ann = spineapi.Annotation() ann['concept'] = 'Collated' ann['property:name'] = u'{0}'.format(name) ann['property:description'] = 'Lazarus Concept' ann['session:semanticTerm'] = name ann['property:html'] = [style, html] ann['property:sourceDescription'] = self.sourceDescription ann['property:sourceIcon'] = utopia.get_plugin_data_as_url( 'images/lazarus-prefs-logo.png', 'image/png') ann['session:overlay'] = 'hyperlink' ann['session:color'] = '#880000' count = 0 print('====', 7) if 'property:hitFragments' in hit: hitFragments = hit.getAllProperties( 'property:hitFragments') or [] #pprint(hitFragments) for hitFragment in hitFragments: pre, _, rest = hitFragment.partition('{!') match, _, post = rest.partition('!}') #pprint((pre, match, post)) matches = document.findInContext(pre, match, post, fuzzy=True) count += len(matches) ann.addExtents(matches) if hasLinks and count > 0: document.addAnnotation(ann) style = u''' <style> .lazarus-expression .box { background-color: #FFF0E8; border-color: #EEE0D8; } .lazarus-related { padding-left: 42px; background-image: url(%s); background-repeat: no-repeat; background-position: top left; background-size: 37px 48px; min-height: 53px; } .lazarus-related + .lazarus-related { margin-top: 5px; border-top: 1px dotted #aaa; padding-top: 5px; background-position-y: 5px; min-height: 58px; } .lazarus-sentence { padding-left: 0.5em; color: black; } .lazarus-sentence.negative { border-left: solid 5px #bb0000; } .lazarus-sentence.positive { border-left: solid 5px #008800; } .lazarus-sentence.negative a { color: #bb0000; } .lazarus-sentence.positive a { color: #008800; } </style> ''' % utopia.get_plugin_data_as_url('images/pdf-page-icon.png', 'image/png') expressions = [] for sAnnotation in expression_annotations: exp = sAnnotation.get('property:expressions', 'json:{}') if exp.startswith('json:'): exp = json.loads(exp[5:]) context = sAnnotation.get('property:context') if context is not None: if exp.get('negative', False): exp['posneg'] = 'negative' else: exp['posneg'] = 'positive' pprint(context) pprint(exp) matched_context = exp.get('context') matches = [] if matched_context is not None: matches = document.search( re.sub(r'\s+', ' ', matched_context)) if len(matches) > 0: anchor_id = str(uuid.uuid4())[1:-1] anchor = spineapi.Annotation() anchor['concept'] = 'Anchor' anchor['property:anchor'] = anchor_id anchor.addExtents(matches) document.addAnnotation(anchor) exp.update({ 'anchor_id': anchor_id, 'sentence': context }) expressions.append(exp) js = u''' <script> $(document).on('DOMNodeInserted', function(e) { var element = e.target; $(element).filter('a[target="tab"]').add('a[target="tab"]', element).each(function () { var fragment = $(this).closest('.-papyro-internal-citation').data('citation')['userdef']['first_fragment']; $(this).attr('target', 'pdf; show=highlight; text=[' + encodeURIComponent(fragment) + ']'); }); }); $(function () { var lazarus = { expressions: %s, fingerprints: %s, relUrl: %s }; var more_expressions_link = $('#lazarus-expression > p.more').hide(); var more_expressions_spinner = $('#lazarus-expression > div.spinner'); Spinners.create(more_expressions_spinner); Spinners.play(more_expressions_spinner); var exp_divs = []; var identifiers = []; for (var e = 0; e < lazarus.expressions.length; e++) { var expression = lazarus.expressions[e]; var exp_div = $('<div class="box"></div>'); exp_div.data('expression', expression); exp_div.hide(); exp_divs.push(exp_div); 
identifiers.push(expression.identifiers); } var params = { fingerprint: lazarus.fingerprints }; var url = lazarus.relUrl + '?' + $.param(params, traditional=true); $.ajax({ url: url, type: 'POST', dataType: 'json', data: JSON.stringify(identifiers), contentType: "application/json", error: function (xhr, ajaxOptions, thrownError) { console.log(xhr.statusText); console.log(xhr.responseText); console.log(xhr.status); console.log(thrownError); // FIXME do something here Spinners.remove(more_expressions_spinner); }, success: function (related) { // Sort related according to the number of articles found related.results.sort(function (l, r) { var lv = Object.keys(l.related).length; var rv = Object.keys(r.related).length; return (lv > rv) ? -1 : (lv < rv) ? 1 : 0; }); $.each(related.results, function (idx, result) { var exp_div = exp_divs[idx]; var expression = exp_div.data('expression'); expression.related = result.related; delete expression.related[%s]; split = expression.sentence.split(expression.context); pre = split[0]; pre = pre.replace(/(\w)$/, '$1 '); pre = pre.replace(/^\s*/, ''); match = expression.context; post = split[1]; post = post.replace(/^(\w)/, ' $1'); post = post.replace(/\s*$/, ''); expression.pre = pre; expression.match = match; expression.post = post; // Create expression element exp_div.append('<p class="lazarus-sentence ' + expression.posneg + '">“' + expression.pre + '<a target="pdf; show=select; anchor=' + expression.anchor_id + '"><strong>' + expression.match + '</strong></a>' + expression.post + '”</p>'); exp_div.data('expression', expression); $('#lazarus-expression > .content').append(exp_div); if (Object.keys(expression.related).length > 0) { var related_div = $('<div class="expandable" title="Related expressions elsewhere"></div>'); var related_div_content = $('<div></div>').appendTo(related_div); function on_expand() { related_div.off('papyro:expandable:expand', on_expand); $.each(expression.related, function (idx, obj) { fragments = []; $.each(obj, function (id, obj) { fragments.push(obj.context); }); fragments.join('\\n'); related_div_content.append($('<div class="lazarus-related unprocessed"></div>').append('<p><strong>“…'+fragments+'…”</strong></p>').hide().data('citation', {identifiers:{doi:idx},userdef:{first_fragment:fragments[0]}})); // .append(utopia.citation.render({identifiers:{doi:idx},first_fragment:fragments[0]}, true, true)) }); expression.related.length = 0; // empty for future if ($('.lazarus-related.unprocessed', exp_div).length > 0) { var more = $('<p class="more right"><a class="more">More related articles...</a></p>'); related_div_content.append(more); function show_five_related(e) { e.preventDefault(); $('.lazarus-related.unprocessed', exp_div).slice(0, 5).each(function (idx, obj) { var citation = $(obj).data('citation'); $(obj).append(utopia.citation.render(citation, true, true)); $(obj).show().removeClass('unprocessed'); }); if ($('.lazarus-related.unprocessed', exp_div).length == 0) { more.remove(); } } more.on('click', show_five_related).click(); } } related_div.on('papyro:expandable:expand', on_expand); exp_div.append(related_div); utopia.processNewContent(related_div); } }); Spinners.remove(more_expressions_spinner); more_expressions_link.show(); $('a.more', more_expressions_link).click(); } }); function append_five(e) { e.preventDefault(); // Show the next five $('#lazarus-expression > .content').children().filter(':hidden').slice(0,5).show(); // Hide the 'more' link if everything is now visible if ($('#lazarus-expression > 
.content').children().filter(':hidden').length == 0) { more_expressions_link.hide(); } } // Hook up 'more' link $('#lazarus-expression > p.more > a.more').on('click', append_five).click(); }); </script> ''' % (json.dumps(expressions), json.dumps( document.fingerprints()), json.dumps(laz_docRelUrl), json.dumps(this_doi)) #print(js.encode('utf8')) html = u''' <div id="lazarus-expression"><div class="content"></div><div class="spinner"></div><p class="more"><a class="more">More expressions...</a></p></div> ''' if len(expressions) > 0: ann = spineapi.Annotation() ann['concept'] = 'Collated' ann['property:name'] = 'Lazarus Expressions' ann['property:description'] = u'Summarizing expression(s)' ann['property:html'] = [js, style, html] ann['property:sourceDescription'] = self.sourceDescription ann['property:sourceIcon'] = utopia.get_plugin_data_as_url( 'images/lazarus-prefs-logo.png', 'image/png') document.addAnnotation(ann) else: # no permission noprompt = self.get_config('noprompt', False) if not noprompt: annotation = spineapi.Annotation() annotation['concept'] = 'Collated' params = { 'uuid': self.uuid(), } annotation['property:html'] = utopia.get_plugin_data( 'tpl/denied.html').format(**params) annotation['property:name'] = 'Lazarus' annotation[ 'property:description'] = 'Lazarus functionality is turned off' annotation[ 'property:sourceDescription'] = self.sourceDescription annotation[ 'property:sourceIcon'] = utopia.get_plugin_data_as_url( 'images/lazarus-prefs-logo.png', 'image/png') annotation['session:default'] = '1' document.addAnnotation(annotation)
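# --- Illustrative sketch (not part of the original plugin) -----------------
# The Lazarus handler above receives each concept hit as a 'hitFragments'
# string in which the matched phrase is wrapped in '{! ... !}', and splits it
# into (pre, match, post) before calling document.findInContext(). The split
# itself is plain string handling and can be exercised stand-alone; the sample
# sentence below is invented.

def split_hit_fragment(fragment):
    '''Split 'before {!phrase!} after' into its three parts.'''
    pre, _, rest = fragment.partition('{!')
    match, _, post = rest.partition('!}')
    return pre, match, post

pre, match, post = split_hit_fragment(
    'binds to the {!beta-adrenergic receptor!} in vitro')
# In the plugin these three strings are handed to
# document.findInContext(pre, match, post, fuzzy=True)
print((pre, match, post))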
def on_ready_event(self, document): doi = common.utils.metadata(document, 'doi') if doi is not None: info = {} # Resolve the DOI to find the publisher's website response = urllib2.urlopen('http://dx.doi.org/{0}'.format(doi), timeout=8) # Parse page to find (if there) the full text URL parser = etree.HTMLParser() html = etree.parse(response, parser) # Only continue if this is a highwire HTML page if len(html.xpath("/html/head/meta[@name='HW.identifier']")) == 0: return # Now make sure we have the full text XHTML citation_fulltext_html_url = html.xpath("/html/head/meta[@name='citation_fulltext_html_url']/@content") if len(citation_fulltext_html_url) > 0: citation_fulltext_html_url = citation_fulltext_html_url[0] # Fetch that full text page (if different to the current one) if citation_fulltext_html_url != response.geturl(): response = urllib2.urlopen(citation_fulltext_html_url, timeout=8) html = etree.parse(response, parser) #print etree.tostring(html, pretty_print=True, encoding='utf8') # Now parse out the bibliography info['citations'] = [] info['citations_by_id'] = {} for bibitem in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' ref-list ')]//ol[contains(concat(' ', normalize-space(@class), ' '), ' cit-list ')]/li"): citation = query(bibitem, { 'id': 'a/@id', 'label': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' ref-label ')]/text()", 'title': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-article-title ')]/text()", 'year': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-date ')]/text()", 'publication-title': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-jnl-abbrev ')]/text()", 'volume': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-vol ')]/text()", 'issue': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-issue ')]/text()", 'pagefrom': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-fpage ')]/text()", 'pageto': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-lpage ')]/text()", 'pmid': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-pmid ')]/text()", 'doi': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-doi ')]/text()", 'etree': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-metadata ')]", }) authors = [] for a in bibitem.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-auth ')]"): surname = a.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-surname ')]/text()") given_names = a.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-given-names ')]/text()") if len(surname) > 0 and len(given_names) > 0: authors.append(u'{0}, {1}'.format(surname[0], given_names[0]).strip(', ')) if len(authors) > 0: citation['authors'] = authors citation['contexts'] = [] citation['displayText'] = common.utils.format_citation(citation) info['citations'].append(citation) info['citations_by_id'][citation['id']] = citation #print citation ####################################################################################### # Parse in-text citations if present min_length = 10 max_length = 20 for paragraph in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' article ')]//p"): text_stack = [paragraph.text or ''] xref_stack = [None] for elem in paragraph: if len(elem.xpath('self::a[@class="xref-bibr"]')) > 0: text_stack.append(etree.tostring(elem, method='text', encoding=unicode, with_tail=False)) 
text_stack.append(elem.tail or '') xref = info['citations_by_id'].get(elem.get('href', '')[1:]) if xref is not None: xref_stack += [[xref], None] else: xref_stack += [[], None] elif isinstance(elem, etree._Entity): points = entities.get(elem.text[1:-1]) if points is not None: text_stack[-1] += ''.join((unichr(p) for p in points)) else: text_stack[-1] += etree.tostring(elem, encoding=unicode) else: if elem.get('position') == 'float': text_stack[-1] += elem.tail or '' else: text_stack[-1] += etree.tostring(elem, method='text', encoding=unicode) # Find and collapse ranges in the text for i in xrange(len(xref_stack) - 3, 1, -2): text = text_stack[i].strip() #print len(text), (text in u'-\u2010\u2011\u2012\u2013\u2014\u2015'), u''.join(text_stack[i-1:i+2]).encode('utf8') # if this text is a dash, we need to coalesce the text fragments if len(text) == 1 and text in u'-\u2010\u2011\u2012\u2013\u2014\u2015': text_stack[i-1:i+2] = [u''.join(text_stack[i-1:i+2])] xref_stack[i-1:i+2] = [xref_stack[i-1] + xref_stack[i+1]] #for text in text_stack: # print text.encode('utf8') # Then make sure we resolve the implied citations for i in xrange(1, len(xref_stack), 2): # Get actual cross references xrefs = xref_stack[i] # Expand cross references try: if len(xrefs) == 2: labelfrom = int(xrefs[0].get('label')) labelto = int(xrefs[1].get('label')) candidates = {} midlabels = [unicode(midlabel) for midlabel in xrange(labelfrom+1, labelto)] for candidate in info['citations']: if candidate.get('label') in midlabels: candidates[int(candidate.get('label'))] = candidate xrefs[1:-1] = candidates.values() except: raise # Find and collapse lists in the text for i in xrange(len(xref_stack) - 3, 1, -2): text = text_stack[i].strip() # if this text is a comma, we need to coalesce the text fragments if len(text) == 1 and text == ',': text_stack[i-1:i+2] = [u''.join(text_stack[i-1:i+2])] xref_stack[i-1:i+2] = [xref_stack[i-1] + xref_stack[i+1]] # Expand citations to include brackets (on both sides) for i in xrange(len(xref_stack) - 2, 0, -2): before = text_stack[i-1].strip()[-1:] text = text_stack[i].strip() after = text_stack[i+1].strip()[:1] # if this text is a comma, we need to coalesce the text fragments #print before.encode('utf'), after.encode('utf') if len(before) > 0 and before in '({[' and len(after) > 0 and after in ')}]': text_stack[i-1] = re.sub(r'[({[](\s*)$', r'\1', text_stack[i-1]) text_stack[i+1] = re.sub(r'^(\s*)[)}\]]', r'\1', text_stack[i+1]) text_stack[i] = before + text_stack[i] + after #print repr(text_stack) for i in xrange(1, len(xref_stack), 2): # Get context before = u' '.join(text_stack[:i]).strip() label = text_stack[i].strip() after = u' '.join(text_stack[i+1:]).strip() # Strip out extraneous brackets if len(xref_stack[i]) > 1: # Hack to differentiate single / multiple citations # as multiple numbers tend not to have spaces between them label = re.sub(ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,])\s?[({[]?', r'\1', label) else: label = re.sub(ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,]\s?)[({[]?', r'\1', label) # Normalise context before = re.sub(r'\s+', ' ', before)[-max_length:].strip() label = re.sub(r'\s+', ' ', label) after = re.sub(r'\s+', ' ', after)[:max_length].strip() #print (before.encode('utf8'), label.encode('utf8'), after.encode('utf8')) if len(before + after) > min_length: for xref in xref_stack[i]: xref['contexts'].append((before, label, after)) #print xref_stack[i] ####################################################################################### # Parse tables 
if present info['tables'] = {} for table_url in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' table-inline ')]/div[contains(concat(' ', normalize-space(@class), ' '), ' callout ')]//li[1]/a/@href"): table_url = urlparse.urljoin(citation_fulltext_html_url, table_url) #print table_url response = urllib2.urlopen(table_url, timeout=8) table_html = etree.parse(response, parser) for table_expansion in table_html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' table-expansion ')]"): id = table_expansion.get('id') table = {} table['xml'] = table_expansion.xpath('.//table[1]')[0] table['caption_raw'] = table_expansion.xpath(".//span[contains(concat(' ', normalize-space(@class), ' '), ' caption-title ')][1]")[0] if 'caption' not in table and 'caption_raw' in table: table['caption'] = table['caption_raw'] if 'caption' in table: table['caption'] = re.sub(r'\s+', ' ', etree.tostring(table['caption'], method='text', encoding=unicode).strip()) if 'xml' in table: table['xml'] = etree.tostring(table['xml'], encoding='utf8') info['tables'][id] = table #print table #print info if info is not None and len(info) > 0: # Enrich citation information with identifiers from PMC parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8') pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation)) if len(pmids) > 0: pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser) for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'): #print etree.tostring(idList) pmid = idList.findtext('ArticleId[@IdType="pubmed"]') if pmid in pmids: citation = info['citations_by_id'][pmids[pmid]] for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')): id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name)) if key_name not in citation and id is not None: citation[key_name] = id #print 'KEY', key_name, id # Generate sensible titles / descriptions / icons? 
journalTitle = info.get('publication-title', '') journalTitleSuffix = '' publisher = info.get('publisher', 'the publisher') if len(journalTitle) > 0: journalTitleSuffix = ' ({0})'.format(journalTitle) # Create Metadata link annotation link = document.newAccList('metadata', 90) link['property:sourceIcon'] = '' link['property:sourceTitle'] = publisher link['property:sourceDescription'] = ''' <p>This information was provided by {0}{1}.</p> '''.format(publisher, journalTitleSuffix) # Create Metadata annotation annotation = spineapi.Annotation() annotation['concept'] = 'DocumentMetadata' for k in self.keys: v = info.get(k) if v is not None: annotation['property:{0}'.format(k)] = v document.addAnnotation(annotation, link['scratch']) # Create Bibliography annotations for citation in info.get('citations', []): annotation = spineapi.Annotation() annotation['concept'] = 'DocumentReference' for k in self.keys: v = citation.get(k) if v is not None: annotation['property:{0}'.format(k)] = v document.addAnnotation(annotation, link['scratch']) ####################################################################################### # Apply parsed data to document # Citations for citation in info['citations']: # Find cross refs for pre, label, post in citation.get('contexts', []): #print (pre, label, post) matches = document.findInContext(pre, label, post) #print matches if len(matches) > 0: try: annotation = spineapi.Annotation() annotation['concept'] = 'ForwardCitation' annotation['property:state'] = 'found' if 'title' in citation: annotation['property:title'] = citation['title'] if 'id' in citation: annotation['property:bibid'] = citation['id'] if 'doi' in citation and citation['doi'].startswith('10.1371/'): citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi'])) if 'pmcid' in citation: citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid']) for k in ('displayText', 'label', 'pdf', 'pmid', 'pmc', 'pii', 'doi', 'first_author_surname', 'year', 'journal', 'volume', 'page_from'): if k in citation: annotation['property:{0}'.format(k)] = citation[k] #print annotation.get('property:label'), annotation.get('property:pdf') for match in matches: annotation.addExtent(match) document.addAnnotation(annotation, link['scratch']) #print citation except: raise pass # FIXME for id, table in info.get('tables', {}).iteritems(): if 'caption' in table and 'xml' in table: regex = fuzz(table['caption'], strict = True) #print regex # convert oasis tables ns = {'oasis': 'http://docs.oasis-open.org/ns/oasis-exchange/table'} xml = etree.fromstring(table['xml']) if xml.tag == '{{{0}}}table'.format(ns['oasis']): for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns): columns = {} for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns): columns[colspec.get('colname')] = int(colspec.get('colnum')) for section in tgroup.xpath('./oasis:thead|./oasis:tbody', namespaces=ns): isHead = (section.tag == '{{{0}}}thead'.format(ns['oasis'])) for row in section.xpath('./oasis:row', namespaces=ns): for entry in row.xpath('./oasis:entry', namespaces=ns): colname = entry.get('colname') colst = entry.get('namest') colend = entry.get('nameend') if colst is not None and colend is not None: colspan = columns[colend] - columns[colst] + 1 else: colspan = 1 if colspan > 1: entry.set('colspan', unicode(colspan)) morerows = entry.get('morerows') if morerows is not None: rowspan = int(morerows) + 1 else: rowspan 
= 1 if rowspan > 1: entry.set('rowspan', unicode(rowspan)) entry.tag = 'td' row.tag = 'tr' if isHead: section.tag = 'thead' else: section.tag = 'tbody' xml.append(section) xml.tag = 'table' #print etree.tostring(xml, pretty_print=True, encoding='utf8') table['xml'] = etree.tostring(xml, encoding='utf8') matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase) if len(matches) == 1: annotation = spineapi.Annotation() annotation['concept'] = 'Table' annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml']) annotation['session:volatile'] = '1' annotation.addExtent(matches[0]) document.addAnnotation(annotation, link['scratch']) else: print '*********** failed to match table:', id
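# --- Illustrative sketch (not part of the original plugin) -----------------
# The Highwire scraper above repeats the XPath idiom
#   contains(concat(' ', normalize-space(@class), ' '), ' some-class ')
# to match an HTML class attribute exactly (a plain contains(@class, 'x')
# would also match 'xy'). A stand-alone helper, assuming lxml as used above;
# the sample markup is invented.

from lxml import etree

def with_class(name):
    '''Return an XPath predicate that matches one CSS class exactly.'''
    return ("contains(concat(' ', normalize-space(@class), ' '), ' %s ')"
            % name)

html = etree.fromstring('<div><span class="cit-vol">12</span>'
                        '<span class="cit-volume-extra">ignore me</span></div>')
# Only the element whose class list contains exactly 'cit-vol' matches
print(html.xpath('.//*[%s]/text()' % with_class('cit-vol')))  # ['12']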
def on_ready_event(self, document): # Only send if the DOI has a Springer prefix doi = utopialib.utils.metadata(document, 'identifiers[doi]') if doi is not None and doi[:7] in registrants: annotation = spineapi.Annotation() annotation['concept'] = 'PublisherIdentity' if False and doi.startswith( '10.1186/'): # This turns out not to be reliable annotation['property:logo'] = utopia.get_plugin_data_as_url( 'images/gigascience_logo.png', 'image/png') annotation['property:title'] = 'Giga Science' annotation[ 'property:webpageUrl'] = 'http://www.gigasciencejournal.com/' else: annotation['property:logo'] = utopia.get_plugin_data_as_url( 'images/logo.png', 'image/png') annotation['property:title'] = 'Springer' annotation['property:webpageUrl'] = 'http://www.springer.com/' document.addAnnotation(annotation, 'PublisherMetadata') # Make a request to the utopia ext web service url = 'https://utopia.cs.manchester.ac.uk/ext/springer/nlm?{0}' url = url.format(urllib.urlencode({'doi': doi})) try: nlm = urllib2.urlopen(url, timeout=8).read() except (urllib2.URLError, socket.timeout): return info = utopialib.nlm.parse(nlm) if info is not None and len(info) > 0: # Enrich citation information with identifiers from PMC parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8') pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation)) if len(pmids) > 0: pubmed_abstracts = etree.fromstring( utopialib.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser) for idList in pubmed_abstracts.xpath( 'PubmedArticle/PubmedData/ArticleIdList'): #print etree.tostring(idList) pmid = idList.findtext('ArticleId[@IdType="pubmed"]') if pmid in pmids: citation = info['citations_by_id'][pmids[pmid]] for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')): id = idList.findtext( 'ArticleId[@IdType="{0}"]'.format(id_name)) if key_name not in citation and id is not None: citation[key_name] = id #print 'KEY', key_name, id # Create Metadata link annotation link = document.newAccList('metadata', 100) link['property:sourceIcon'] = utopia.get_plugin_data_as_url( 'images/annotation_icon.png', 'image/png') link['property:sourceTitle'] = 'Springer' link['property:sourceDescription'] = ''' <p><a href="http://www.springer.com/">Springer</a> publishing company.</p> ''' # Create Metadata annotation annotation = utopialib.utils.citation_to_annotation( info.get('self', {}), 'DocumentMetadata') document.addAnnotation(annotation, link['scratch']) # Create Bibliography annotations for citation in info.get('citations', []): annotation = utopialib.utils.citation_to_annotation( citation) document.addAnnotation(annotation, link['scratch']) ####################################################################################### # Apply parsed data to document # Citations for citation in info['citations']: # Find cross refs for pre, label, post in citation.get('contexts', []): matches = document.findInContext(pre, label, post) #print matches if len(matches) > 0: try: annotation = utopialib.utils.citation_to_annotation( citation, concept='ForwardCitation') if 'doi' in citation and citation[ 'doi'].startswith('10.1371/'): citation[ 'pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format( 'info:doi/{0}'.format( citation['doi'])) if 'pmcid' in citation: citation[ 'pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format( citation['pmcid']) for 
match in matches: annotation.addExtent(match) document.addAnnotation(annotation, link['scratch']) except: raise for id, table in info.get('tables', {}).iteritems(): if 'caption' in table and 'xml' in table: regex = fuzz(table['caption'], strict=True) print regex matches = document.search( regex, spineapi.RegExp + spineapi.IgnoreCase) if len(matches) == 1: annotation = spineapi.Annotation() annotation['concept'] = 'Table' annotation[ 'session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode( table['xml']) annotation['session:volatile'] = '1' annotation.addExtent(matches[0]) document.addAnnotation(annotation, link['scratch']) else: print '*********** failed to match table:', id
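# --- Illustrative sketch (not part of the original plugin) -----------------
# Both the Highwire and Springer handlers above enrich scraped citations by
# looking their PubMed IDs up with EFetch and copying doi/pmc/pii identifiers
# out of each ArticleIdList. The XML walk, shown here on an invented snippet,
# is independent of the web call (lxml assumed, as above).

from lxml import etree

sample = '''<PubmedArticleSet>
  <PubmedArticle>
    <PubmedData>
      <ArticleIdList>
        <ArticleId IdType="pubmed">12345678</ArticleId>
        <ArticleId IdType="doi">10.1000/example-doi</ArticleId>
        <ArticleId IdType="pmc">PMC0000000</ArticleId>
      </ArticleIdList>
    </PubmedData>
  </PubmedArticle>
</PubmedArticleSet>'''

citations_by_pmid = {'12345678': {'id': 'ref-1', 'pmid': '12345678'}}

root = etree.fromstring(sample)
for id_list in root.xpath('PubmedArticle/PubmedData/ArticleIdList'):
    pmid = id_list.findtext('ArticleId[@IdType="pubmed"]')
    citation = citations_by_pmid.get(pmid)
    if citation is None:
        continue
    for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
        value = id_list.findtext('ArticleId[@IdType="%s"]' % id_name)
        if value is not None and key_name not in citation:
            citation[key_name] = value

print(citations_by_pmid['12345678'])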