def after_load_event(self, document):
        '''
        Provide a formatted citation for the current document. Runs late in
        the load sequence so that metadata enriched by earlier handlers is
        picked up.
        '''
        # Gather the best-trusted metadata value for each known key.
        gathered = {}
        for field in self.properties + self.identifiers:
            found = common.utils.metadata(document, field)
            if found is not None:
                # Strip the trailing '[]' list marker, if present.
                if field.endswith('[]'):
                    field = field[:-2]
                gathered[field] = found

        # Only if there's some metadata to display FIXME
        if gathered:
            # Create a citation formatter annotation for the sidebar.
            annotation = spineapi.Annotation()
            annotation['concept'] = 'CitationFormatter'
            annotation['property:json'] = json.dumps(gathered)
            annotation['property:name'] = 'Formatted Citation'
            annotation['property:description'] = 'How to cite this document'
            annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/csl.png', 'image/png')
            annotation['property:sourceDescription'] = '''
                Uses <a href="https://bitbucket.org/fbennett/citeproc-js/wiki/Home">citeproc-js</a>
                to format the citation.
            '''
            annotation['session:weight'] = '10'
            annotation['session:default'] = '1'
            annotation['session:volatile'] = '1'
            document.addAnnotation(annotation)
    def on_load_event(self, document):
        '''Collate OutlineItem annotations into a nested HTML outline.'''
        # Map each dotted outline position (as an int tuple) to its annotation.
        positions = {}
        for ann in document.annotations():
            if ann.get('concept') == 'OutlineItem':
                pos = tuple(int(part) for part in ann.get('property:outlinePosition').split('.'))
                positions[pos] = ann

        if not positions:
            return

        # Walk the positions in order, opening/closing list levels as the
        # outline depth changes.
        pieces = ['<div><ul>']
        depth = 1
        for pos in sorted(positions):
            if len(pos) > depth:
                pieces.append('<ul><li>')
            elif len(pos) < depth:
                pieces.append('</li></ul></li><li>')
            else:
                pieces.append('</li><li>')
            depth = len(pos)
            entry = positions[pos]
            pieces.append('<a href="#" title="{0}" target="pdf; anchor={0}">{1}</a>'.format(
                entry.get('property:destinationAnchorName'),
                cgi.escape(entry.get('property:outlineTitle'), quote=True).encode('ascii', 'xmlcharrefreplace')))
        pieces.append('</ul></div>')

        collated = spineapi.Annotation()
        collated['concept'] = 'Collated'
        collated['property:name'] = 'Outline'
        collated['property:description'] = 'Document Structure'
        collated['session:weight'] = '10000'
        collated['property:html'] = ''.join(pieces)
        document.addAnnotation(collated)
Example #3
0
    def on_ready_event(self, document):
        '''Attach Altmetric attention data for this document, if any exists.'''
        doi = utopia.tools.utils.metadata(document, 'identifiers[doi]')
        if doi is None:
            return
        try:
            # Check to see if the DOI is known to Altmetric.
            url = 'http://api.altmetric.com/{0}/doi/{2}?key={1}'.format(
                self.api_version, self.key, doi)
            data = urllib2.urlopen(url, timeout=8).read()
            # Parsing is only a sanity check - failures propagate as before.
            json.loads(data)
        except (urllib2.URLError, socket.timeout):
            # DOI unknown or service unreachable - silently do nothing.
            return

        altmetric = spineapi.Annotation()
        altmetric['concept'] = 'Altmetric'
        altmetric['property:doi'] = doi
        altmetric['property:json'] = data
        altmetric['property:name'] = 'Altmetric'
        altmetric['property:description'] = 'Who is talking about this article?'
        altmetric['property:sourceDatabase'] = 'altmetric'
        altmetric['property:sourceDescription'] = '<p>Discover, track and analyse online activity related to this article with <a href="http://www.altmetric.com/">Altmetric</a>.</p>'
        altmetric['session:weight'] = '1'
        altmetric['session:default'] = '1'
        document.addAnnotation(altmetric)
    def on_activate_event(self, document):
        if len(document.annotations('GPCRDB cache')) == 0:
            print 'annotating stuff . . .'

            pubmedId = utopialib.utils.metadata(document,
                                                'identifiers[pubmed]')
            if pubmedId is not None:
                print 'found pubmed id: ' + pubmedId
            else:
                print 'did not find pubmed id'

            ns = {'r': 'GPCR'}

            textMentions = self.getMentions(document.text(), pubmedId)

            objectlist = []
            mention_cache = {}
            for mention in textMentions:
                if mention.mentionType != 'SPECIES':
                    mention_cache.setdefault(mention.html, [])
                    mention_cache[mention.html].append(mention)

            for html, mentions in mention_cache.iteritems():
                annotation = self.createAnnotation(document, html, mentions)
                annotation['displayRelevance'] = '2000'
                annotation['displayRank'] = '2000'
                document.addAnnotation(annotation)

            document.addAnnotation(spineapi.Annotation(), 'GPCRDB cache')
 def on_load_event(self, document):
     # Email links
     for match in document.search(self.email, spineapi.IgnoreCase + spineapi.WholeWordsOnly + spineapi.RegExp):
         if not areas_intersect(match.areas(), self.existing_areas):
             annotation = spineapi.Annotation()
             annotation['concept'] = 'Hyperlink'
             annotation['property:webpageUrl'] = 'mailto:%s' % match.text()
             annotation['session:volatile'] = '1'
             annotation.addExtent(match)
             document.addAnnotation(annotation)
         else:
             print 'ignoring clashing email link text:', match.text().encode('utf8')
     # HTTP(S) links
     for match in document.search(self.http, spineapi.IgnoreCase + spineapi.WholeWordsOnly + spineapi.RegExp):
         if not areas_intersect(match.areas(), self.existing_areas):
             if match.begin().lineArea()[1] == 0: # Only while vertical links are rendered wrongly FIXME
                 url = match.text()
                 if not url.startswith('http'):
                     url = 'http://' + url
                 annotation = spineapi.Annotation()
                 annotation['concept'] = 'Hyperlink'
                 annotation['property:webpageUrl'] = '%s' % url
                 annotation['session:volatile'] = '1'
                 annotation.addExtent(match)
                 document.addAnnotation(annotation)
         else:
             print 'ignoring clashing http link text:', match.text().encode('utf8')
    def after_ready_event(self, document):
        '''Build a nested HTML outline from the document's OutlineItem annotations.'''
        # Index outline items by their dotted position, e.g. '1.2.3' -> (1, 2, 3).
        toc = {}
        for ann in document.annotations():
            if ann.get('concept') == 'OutlineItem':
                position = tuple(int(n) for n in ann.get('property:outlinePosition').split('.'))
                toc[position] = ann

        if len(toc) == 0:
            return

        html = '<div><ul>'
        prev_depth = 1
        for position in sorted(toc.keys()):
            depth = len(position)
            # Open or close list levels as the outline depth changes.
            if depth > prev_depth:
                html += '<ul><li>'
            elif depth < prev_depth:
                html += '</li></ul></li><li>'
            else:
                html += '</li><li>'
            prev_depth = depth
            entry = toc[position]
            html += '<a href="#" title="{0}" target="pdf; anchor={0}">{1}</a>'.format(
                entry.get('property:destinationAnchorName'),
                cgi.escape(entry.get('property:outlineTitle'), quote=True).encode('ascii', 'xmlcharrefreplace'))
        html += '</ul></div>'

        collated = spineapi.Annotation()
        collated['concept'] = 'Collated'
        collated['property:name'] = 'Outline'
        collated['property:description'] = 'Document Structure'
        collated['session:weight'] = '999'
        collated['property:html'] = html
        document.addAnnotation(collated)
 def on_load_event(self, document):
     '''Turn email addresses and URLs found in the text into hyperlink annotations.'''
     flags = spineapi.IgnoreCase + spineapi.WholeWordsOnly + spineapi.RegExp
     # Email links
     for hit in document.search(self.email, flags):
         if areas_intersect(hit.areas(), self.existing_areas):
             print('ignoring clashing email link text:', hit.text().encode('utf8'))
             continue
         hyperlink = spineapi.Annotation()
         hyperlink['concept'] = 'Hyperlink'
         hyperlink['property:webpageUrl'] = 'mailto:%s' % hit.text()
         hyperlink['session:volatile'] = '1'
         hyperlink.addExtent(hit)
         document.addAnnotation(hyperlink)
     # HTTP(S) links
     for hit in document.search(self.http, flags):
         if areas_intersect(hit.areas(), self.existing_areas):
             print('ignoring clashing http link text:', hit.text().encode('utf8'))
             continue
         if hit.begin().lineArea()[1] == 0: # Only while vertical links are rendered wrongly FIXME
             url = hit.text()
             if not url.startswith('http'):
                 url = 'http://' + url
             hyperlink = spineapi.Annotation()
             hyperlink['concept'] = 'Hyperlink'
             hyperlink['property:webpageUrl'] = '%s' % url
             hyperlink['session:volatile'] = '1'
             hyperlink.addExtent(hit)
             document.addAnnotation(hyperlink)
 def add_success(component, method):
     '''Record a success-marker annotation for the given component/method.'''
     # NOTE(review): 'document' is a free variable here - presumably this is a
     # nested function with 'document' in the enclosing scope; confirm.
     marker = spineapi.Annotation()
     marker["concept"] = "Success"
     marker["property:component"] = component
     marker["property:method"] = method
     marker["property:category"] = "success"
     document.addAnnotation(marker, "errors.metadata")
    def on_activate_event(self, document):
        if len(document.annotations('NucleaRDB cache')) == 0:
            print 'annotating stuff . . .'

            pubmedId = common.utils.metadata(document, 'pmid')
            if pubmedId is not None:
                print 'found pubmed id: ' + pubmedId
            else:
                print 'did not find pubmed id'

            ns = {'r': 'GPCR'}

            textMentions = self.getMentions(document.text(), pubmedId)

            objectlist = []
            mention_cache = {}
            for mention in textMentions:
                if mention.mentionType != 'SPECIES':
                    mention_cache.setdefault(mention.html, [])
                    mention_cache[mention.html].append(mention)

            for html, mentions in mention_cache.iteritems():
                annotation = self.createAnnotation(document, html, mentions)
                annotation['displayRelevance']='2000'
                annotation['displayRank']= '2000'
                document.addAnnotation(annotation)

            document.addAnnotation(spineapi.Annotation(), 'NucleaRDB cache')
    def on_ready_event(self, document):
        '''Look up CrossMark update/correction/retraction notices for this document.'''
        # CrossMark lookups are keyed on the document's DOI.
        doi = common.utils.metadata(document, 'doi')
        if doi is None:
            return

        # Then attempt to access CrossMark API
        try:
            url = 'http://crossmark.crossref.org/crossmark/?doi={0}'.format(doi)
            headers = {'Accept': 'application/json'}
            request = urllib2.Request(url, None, headers)
            cm = json.loads(urllib2.urlopen(request, timeout=8).read())
        except urllib2.HTTPError as e:
            # A 404 simply means there is no CrossMark record for this DOI.
            if e.code == 404:
                return
            raise

        # If successful, create an annotation to be visualised.
        annotation = spineapi.Annotation()
        annotation['concept'] = 'CrossMarkNotice'
        annotation['property:doi'] = doi
        annotation['property:name'] = 'CrossMark'
        annotation['property:description'] = 'Information on updates, corrections and retractions'
        annotation['property:sourceDatabase'] = 'crossmark'
        annotation['property:sourceDescription'] = '<div><a href="http://www.crossref.org/crossmark/">CrossMark</a> gives scholars the information they need to verify that they are using the most recent and reliable versions of a document.</div>'
        document.addAnnotation(annotation)
Example #11
0
    def on_ready_event(self, document):
        '''Check CrossRef's CrossMark service for notices about this document.'''
        # CrossMark records are keyed by DOI.
        doi = utopia.tools.utils.metadata(document, 'identifiers[doi]')
        if doi is not None:
            try:
                url = 'http://crossmark.crossref.org/crossmark/?doi={0}'.format(
                    doi)
                req = urllib2.Request(url, None, {'Accept': 'application/json'})
                cm = json.loads(urllib2.urlopen(req, timeout=8).read())
            except urllib2.HTTPError as e:
                # No CrossMark record exists for this DOI - nothing to show.
                if e.code == 404:
                    return
                raise

            # Surface the record in the sidebar.
            notice = spineapi.Annotation()
            notice['concept'] = 'CrossMarkNotice'
            notice['property:doi'] = doi
            notice['property:name'] = 'CrossMark'
            notice['property:description'] = 'Information on updates, corrections and retractions'
            notice['property:sourceDatabase'] = 'crossmark'
            notice['property:sourceDescription'] = '<div><a href="http://www.crossref.org/crossmark/">CrossMark</a> gives scholars the information they need to verify that they are using the most recent and reliable versions of a document.</div>'
            document.addAnnotation(notice)
    def after_load_event(self, document):
        '''
        Attach a formatted citation to the document. Runs late in the
        pipeline so that metadata enriched by earlier handlers is included.
        '''
        # Collect the best-trusted value for each known metadata property.
        collected = {}
        for field in self.properties:
            found = utopia.tools.utils.metadata(document, field)
            if found is None:
                continue
            # Drop a trailing '[]' list marker from the key.
            if field.endswith('[]'):
                field = field[:-2]
            collected[field] = found

        # Nothing to cite, nothing to show FIXME
        if not collected:
            return

        # Create the citation formatter annotation for the sidebar.
        annotation = spineapi.Annotation()
        annotation['concept'] = 'CitationFormatter'
        annotation['property:json'] = json.dumps(collected)
        annotation['property:name'] = 'Formatted Citation'
        annotation['property:description'] = 'How to cite this document'
        annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url(
            'images/csl.png', 'image/png')
        annotation['property:sourceDescription'] = '''
                Uses <a href="https://bitbucket.org/fbennett/citeproc-js/wiki/Home">citeproc-js</a>
                to format the citation.
            '''
        annotation['session:weight'] = '10'
        annotation['session:default'] = '1'
        annotation['session:volatile'] = '1'
        document.addAnnotation(annotation)
    def on_ready_event(self, document):
        '''Create a CiteProc citation-formatting annotation for this article.'''
        logger.debug('calling citeproc populate')
        doi = common.utils.metadata(document, 'doi')
        crossref_unixref = common.utils.metadata(document, 'raw_crossref_unixref')
        # Only bother for those documents that returned a crossref document
        if doi is None or crossref_unixref is None:
            return

        # Fetch the CSL style and locale definitions the formatter needs.
        stylesJson = urllib2.urlopen(self.stylesUrl, timeout=8).read()
        logger.debug(stylesJson)
        localesJson = urllib2.urlopen(self.localesUrl, timeout=8).read()
        logger.debug(localesJson)

        citation = spineapi.Annotation()
        citation['concept'] = 'CiteProc'
        citation['property:doi'] = doi
        citation['property:text'] = self.loadingMsg
        citation['property:styles'] = stylesJson
        citation['property:locales'] = localesJson
        citation['property:name'] = 'CrossRef'
        citation['property:description'] = 'Formatted citation for this article'
        citation['property:sourceDatabase'] = 'crossref'
        citation['property:sourceDescription'] = '<p><a href="http://www.crossref.org/">CrossRef</a> is the official DOI link registration agency for scholarly and professional publications.</p>'
        citation['session:weight'] = '10'
        citation['session:default'] = '1'
        document.addAnnotation(citation)
    def on_ready_event(self, document):
        # Place a link on the document to test the Javascript messaging functionality

        # self.postToBus('bioprodict', 'prepare')
        username = self.get_config("username")
        password = self.get_config("password")

        if self.validUsernameAndPassword(username, password):
            try:
                databases = self.getAvailableDatabases(username, password)
                databaseIds = []
                databaseDescriptions = []
                for database in databases:
                    databaseIds.append(database["databaseId"])
                    databaseDescriptions.append(database["databaseDescription"])

                annotation = Annotation()
                annotation["concept"] = "Bio3DMInformation"
                annotation["property:name"] = "Bio-Prodict 3DM"
                annotation["property:html"] = "html"
                annotation["property:description"] = """Annotate this document with one of your 3DM systems"""
                annotation["property:databaseIds"] = "|".join(databaseIds)
                annotation["property:databaseDescriptions"] = "|".join(databaseDescriptions)
                annotation["property:sourceDatabase"] = "bioprodict"
                annotation[
                    "property:sourceDescription"
                ] = '<p><a href="http://www.bio-prodict.nl">Bio-Prodict\'s</a> 3DM information systems provide protein family-specific annotations for this article</p>'

                # a.addExtent(document.substr(100, 300))
                document.addAnnotation(annotation)
            except WebFault as detail:
                print "Exception:", detail
Example #15
0
    def on_ready_event(self, document):
        '''Authenticate against Bio-Prodict 3DM and advertise the user's databases.'''
        username = self.get_config('username')
        password = self.get_config('password')

        if self.validUsernameAndPassword(username, password):
            # Exchange the credentials for an OAuth bearer token.
            basic = 'Basic dXRvcGlhLXBsdWdpbjo='  # base64.encodestring('utopia-plugin:').replace('\n', '')
            data = dict(username=username, password=password, grant_type='password')
            content = post_for_json(self.tokenurl, basic, data)
            self.bearer = 'Bearer ' + content['access_token']
            # Bake the token into the javascript templates.
            self.proteinJs = self.proteinJs.replace('#TOKEN#', self.bearer)
            self.commonJs = self.commonJs.replace('#TOKEN#', self.bearer)

            # Get available databases for user, ordered by their description.
            databases = post_for_json(self.databasesurl, self.bearer)
            databaseIds = []
            databaseDescriptions = []
            for db_id, db_desc in sorted(databases.items(), key=lambda entry: entry[1]):
                databaseIds.append(db_id)
                databaseDescriptions.append(db_desc)

            annotation = Annotation()
            annotation['concept'] = 'Bio3DMInformation'
            annotation['property:name'] = 'Bio-Prodict 3DM'
            annotation['property:html'] = 'html'
            annotation['session:overlay'] = 'hyperlink'
            annotation['session:color'] = '#336611'
            annotation['property:description'] = '''Annotate using one of your 3DM systems'''
            annotation['property:databaseIds'] = '|'.join(databaseIds)
            annotation['property:databaseDescriptions'] = '|'.join(databaseDescriptions)
            annotation['property:sourceDatabase'] = 'bioprodict'
            annotation['property:sourceDescription'] = '<p><a href="http://www.bio-prodict.nl">Bio-Prodict\'s</a> 3DM information systems provide protein family-specific annotations for this article</p>'

            document.addAnnotation(annotation)
Example #16
0
    def on_activate_event(self, document, data={}):
        action = data.get('action')
        domain = data.get('domain')

        if self.annotatedDomains is None:
            self.annotatedDomains = []

        if action == 'annotate':
            print 'starting 3DM annotation . . .'
            pubmedId = utopia.tools.utils.metadata(document, 'identifiers[pubmed]')
            if pubmedId is None:
                pubmedId = '0'
            print 'sending text to remote server (' + pubmedId + '). . .'
            textMentions = self.getMentions(domain, document.text())
            print 'received response, adding annotations for domain ' + domain + ' . . .'
            mention_cache = {}
            for mention in textMentions:
                if mention['mentionType'] != 'SPECIES' and mention['mentionType'] != 'PDB':
                    html, css, js = self.buildHtml(domain, mention)
                    mention['html'] = html.encode('utf-8')
                    mention['css'] = css.encode('utf-8')
                    mention['js'] = js.encode('utf-8')
                    mention_cache.setdefault(mention['html'], [])
                    mention_cache[mention['html']].append(mention)

            for html, mentions in mention_cache.iteritems():
                annotation = self.createAnnotation(domain, document, html, mentions)
                annotation['displayRelevance'] = '2000'
                annotation['displayRank'] = '2000'
                document.addAnnotation(annotation)

            document.addAnnotation(Annotation(), domain)
            print 'done adding annotations.'
Example #17
0
    def on_ready_event(self, document):
        '''Create a CiteProc citation-formatting annotation for this article.'''
        logger.debug('calling citeproc populate')
        doi = utopialib.utils.metadata(document, 'identifiers[doi]')
        crossref_unixref = utopialib.utils.metadata(document,
                                                    'raw_crossref_unixref')
        # Only bother for those documents that returned a crossref document
        if doi is not None and crossref_unixref is not None:
            # Fetch the CSL style and locale definitions the formatter needs.
            styles_data = urllib2.urlopen(self.stylesUrl, timeout=8).read()
            logger.debug(styles_data)
            locales_data = urllib2.urlopen(self.localesUrl, timeout=8).read()
            logger.debug(locales_data)

            ann = spineapi.Annotation()
            ann['concept'] = 'CiteProc'
            ann['property:doi'] = doi
            ann['property:text'] = self.loadingMsg
            ann['property:styles'] = styles_data
            ann['property:locales'] = locales_data
            ann['property:name'] = 'CrossRef'
            ann['property:description'] = 'Formatted citation for this article'
            ann['property:sourceDatabase'] = 'crossref'
            ann['property:sourceDescription'] = '<p><a href="http://www.crossref.org/">CrossRef</a> is the official DOI link registration agency for scholarly and professional publications.</p>'
            ann['session:weight'] = '10'
            ann['session:default'] = '1'
            document.addAnnotation(ann)
 def on_ready_event(self, document):
     # Find distinguishing ID
     pmid = common.utils.metadata(document, 'pmid')
     if pmid:
         print "Found pmid:", pmid
         for annotation in self.on_explore_event(phrase=pmid, document=document):
             annotation['property:description'] = 'Human genomic information related to this article'
             document.addAnnotation(annotation)
 def on_ready_event(self, document):
     # Find distinguishing ID
     pmid = utopia.tools.utils.metadata(document, 'identifiers[pubmed]')
     if pmid:
         print "Found pmid:", pmid
         for annotation in self.on_explore_event(phrase=pmid,
                                                 document=document):
             annotation[
                 'property:description'] = 'Human genomic information related to this article'
             document.addAnnotation(annotation)
    def on_filter_event(self, document, data = None):
        '''Give each demo-logo flagged annotation a DemoLogoOverlay twin.'''
        for ann in document.annotations():
            # Skip overlays themselves; only source annotations are converted.
            if ann.get('concept') == 'DemoLogoOverlay':
                continue
            if ann.get('property:demo_logo') != '1':
                continue
            ann.removePropertyAll('property:demo_logo')

            overlay = spineapi.Annotation()
            overlay['concept'] = 'DemoLogoOverlay'
            overlay['property:demo_logo'] = '1'
            overlay.addExtents(ann.extents())
            overlay.addAreas(ann.areas())
            document.addAnnotation(overlay)
    def on_persist_event(self, document):
        '''
        Push locally queued annotations to the kend service and process
        pending deletions.

        Annotations in the 'PersistQueue' scratch list are converted to kend
        form, persisted remotely, updated with the server-assigned fields and
        moved into the document proper. Annotations in the deleted-items
        scratch list are deleted remotely and removed locally. Anything
        marked 'session:volatile' is skipped in both passes.
        '''
        client = kend.client.Client()

        # Persisting requires the document's own utopia identifier.
        document_id = utopia.tools.utils.metadata(document,
                                                  'identifiers[utopia]')
        if document_id is not None:
            for annotation in document.annotations('PersistQueue'):
                if 'session:volatile' not in annotation:
                    try:
                        ka = kend.converter.Annotation.spineapi2kend(
                            annotation, document_id)
                        ka.context = self._context_

                        updated = client.persistAnnotation(
                            ka, context=self._context_)

                        # On success, copy the server-assigned bookkeeping
                        # fields back onto the local annotation.
                        if isinstance(updated, kend.model.Annotation):
                            for key in ('id', 'created', 'author', 'revision',
                                        'edit', 'media_edit'):
                                annotation[key] = getattr(updated, key)
                            # Replace the cached media descriptors with the
                            # server's current set.
                            annotation.removePropertyAll('session:media')
                            for media in updated.media:
                                mediaDict = {}
                                for k in [
                                        'name', 'src', 'type', 'revision',
                                        'size', 'edit'
                                ]:
                                    if hasattr(media, k):
                                        mediaDict[k] = getattr(media, k)
                                annotation.insertProperty(
                                    'session:media',
                                    urllib.urlencode(mediaDict))
                            # Promote the annotation out of the queue and into
                            # the document proper.
                            document.removeAnnotation(annotation,
                                                      'PersistQueue')
                            document.addAnnotation(annotation)
                    except:
                        # NOTE(review): the bare 'raise' makes this handler a
                        # no-op and the 'pass' below unreachable - looks like
                        # leftover debugging; confirm whether errors should
                        # really propagate or be swallowed.
                        raise
                        pass

            for annotation in document.annotations(
                    document.deletedItemsScratchId()):
                try:
                    if 'session:volatile' not in annotation:
                        ka = kend.converter.Annotation.spineapi2kend(
                            annotation, document_id)
                        client.deleteAnnotation(ka)
                    document.removeAnnotation(annotation,
                                              document.deletedItemsScratchId())
                    document.removeAnnotation(annotation)
                except:
                    # NOTE(review): same unreachable 'pass' after 'raise' as
                    # above.
                    raise
                    pass
    def on_ready_event(self, document):
        '''Record publisher identity and article-level metrics for PLOS papers.'''
        # Get resolved DOI
        doi = common.utils.metadata(document, 'doi', '')

        # Only for PLOS DOIs should this plugin do anything
        if not doi.startswith('10.1371/'):
            return

        # Record the publisher identity information
        identity = spineapi.Annotation()
        identity['concept'] = 'PublisherIdentity'
        identity['property:logo'] = utopia.get_plugin_data_as_url('images/large_logo.jpg', 'image/jpg')
        identity['property:title'] = 'PLOS'
        identity['property:webpageUrl'] = 'http://www.plos.org/'
        document.addAnnotation(identity, 'PublisherMetadata')

        # Attempt to get ALMs from PLOS API
        url = 'http://alm.plos.org/articles/{0}.json?{{0}}'.format(doi)
        query = { 'api_key': self.api_key, 'events': '1', 'source': 'counter,pmc' }
        url = url.format(urllib.urlencode(query))
        try:
            alm_events = json.loads(urllib2.urlopen(url, timeout=8).read())
        except urllib2.HTTPError as e:
            # A 404 just means no metrics exist for this article.
            if e.code == 404:
                return
            raise

        plos_pdf_views = 0
        plos_html_views = 0
        pmc_pdf_views = 0
        pmc_html_views = 0

        # Tally the download events reported by each usage source. Each
        # matching source resets its totals, so a repeated source wins with
        # its own sums (matching the original accumulate-from-zero logic).
        for source in alm_events.get('article', {}).get('source', []):
            if source.get('source') == 'Counter':
                plos_pdf_views = 0
                plos_html_views = 0
                for event in source.get('events', []):
                    plos_pdf_views += int(event.get('pdf_views', 0))
                    plos_html_views += int(event.get('html_views', 0))
            elif source.get('source') == 'PubMed Central Usage Stats':
                pmc_pdf_views = 0
                pmc_html_views = 0
                for event in source.get('events', []):
                    pmc_pdf_views += int(event.get('pdf', 0))
                    pmc_html_views += int(event.get('full-text', 0))

        metrics = spineapi.Annotation()
        metrics['concept'] = 'PLOSALMRecord'
        metrics['property:doi'] = doi
        metrics['property:name'] = 'PLOS'
        metrics['property:description'] = 'Download statistics'
        metrics['property:plos_pdf_views'] = plos_pdf_views
        metrics['property:plos_html_views'] = plos_html_views
        metrics['property:pmc_pdf_views'] = pmc_pdf_views
        metrics['property:pmc_html_views'] = pmc_html_views
        metrics['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/small_logo.png', 'image/png')
        metrics['property:sourceDescription'] = '<p><a href="http://www.plos.org/">PLOS</a> article level metrics for downloads.</p>'
        document.addAnnotation(metrics)
    def on_filter_event(self, document, data=None):
        '''Replace demo-logo markers with DemoLogoOverlay annotations.'''
        for annotation in document.annotations():
            is_overlay = annotation.get('concept') == 'DemoLogoOverlay'
            flagged = annotation.get('property:demo_logo') == '1'
            if not is_overlay and flagged:
                annotation.removePropertyAll('property:demo_logo')

                overlay = spineapi.Annotation()
                overlay['concept'] = 'DemoLogoOverlay'
                overlay['property:demo_logo'] = '1'
                overlay.addExtents(annotation.extents())
                overlay.addAreas(annotation.areas())
                document.addAnnotation(overlay)
    def on_ready_event(self, document):
        '''Fetch SciBite entities and news related to this document's PubMed id.'''
        pmid = common.utils.metadata(document, 'pmid')
        if pmid is None:
            return

        xhtml = ''

        # Related-entities service.
        params = {
            'app_id': self.app_id,
            'app_key': self.app_key,
            'i': pmid,
        }
        url = '{0}/DocumentEntitiesService?{1}'.format(
            self.app_uri, urllib.urlencode(params))
        results = json.loads(urllib2.urlopen(url, timeout=15).read().decode('latin1'))
        if results['RESP_SYS_STATUS'] == 'STAT_OK' and results.get('RESP_PAYLOAD'):
            xhtml += '<h2>Related entities</h2>'
            for entity in results['RESP_PAYLOAD']:
                xhtml += '<p><strong><a href="{0}">{1}</a></strong> ({2})</p>'.format(
                    entity['bestLink'], entity['entityName'],
                    entity['entityTypeDisplay'])

        # Related-news service (at most ten items).
        params = {
            'app_id': self.app_id,
            'app_key': self.app_key,
            'i': pmid,
            'n': '10',
        }
        url = '{0}/DocumentToNewsService?{1}'.format(
            self.app_uri, urllib.urlencode(params))
        results = json.loads(urllib2.urlopen(url, timeout=15).read().decode('latin1'))
        if results['RESP_SYS_STATUS'] == 'STAT_OK' and results.get('RESP_PAYLOAD'):
            xhtml += '<h2>Related news</h2>'
            for bite in results['RESP_PAYLOAD']:
                xhtml += self.renderBite(bite)

        # Only annotate if at least one service returned something.
        if xhtml:
            scibite = spineapi.Annotation()
            scibite['concept'] = 'SciBite'
            scibite['property:pmid'] = pmid
            scibite['property:name'] = 'SciBite'
            scibite['property:sourceDatabase'] = 'scibite'
            scibite['property:xhtml'] = xhtml
            scibite['property:description'] = 'Biomedical News & Intelligence'
            scibite['property:sourceDescription'] = '<p><a href="http://scibite.com/">SciBite</a> scans 1000s of papers, patents, blogs, newsfeeds and more to bring you daily alerts on critical topics in biomedicine.</p>'
            document.addAnnotation(scibite)
Example #25
0
    def on_ready_event(self, document):
        '''
        Find AGI (gene identifier) matches in the document text and annotate
        each distinct identifier with links to plant gene databases,
        rebranding the annotation when the DOI identifies a known ASPB
        journal.  (Fix: removed an unused local HTML template that was built
        on every iteration and never referenced.)
        '''
        doi = utopialib.utils.metadata(document, 'identifiers[doi]')

        # Find and aggregate AGI instances in the document
        matches_by_agi = {}
        for match in document.search(self.agiRegex, spineapi.RegExp +
                                     spineapi.WholeWordsOnly):
            agi = match.text()
            matches_by_agi.setdefault(agi, [])
            matches_by_agi[agi].append(match)

        # For each AGI add an annotation spanning all of its matches
        if len(matches_by_agi) > 0:
            for agi, matches in matches_by_agi.iteritems():
                annotation = spineapi.Annotation()
                annotation['concept'] = 'AGI'
                annotation['property:agi'] = agi
                annotation['property:name'] = 'Plant gene databases'
                annotation[
                    'property:description'] = 'American Society of Plant Biologists'
                annotation[
                    'property:sourceIcon'] = utopia.get_plugin_data_as_url(
                        'images/aspb_logo.png', 'image/png')
                # Journal-specific branding, chosen by DOI prefix
                if doi is not None:
                    if doi.startswith('10.1104/'):  # Plant Physiology
                        annotation[
                            'property:sourceIcon'] = utopia.get_plugin_data_as_url(
                                'images/pp_logo.png', 'image/png')
                        annotation[
                            'property:description'] = 'From Plant Physiology'
                    elif doi.startswith('10.1105/'):  # The Plant Cell
                        annotation[
                            'property:sourceIcon'] = utopia.get_plugin_data_as_url(
                                'images/tpc_logo.png', 'image/png')
                        annotation['property:description'] = 'From Plant Cell'
                annotation['property:sourceDescription'] = '''
                  <p>
                    The <a href="http://www.aspb.org/">American Society of Plant Biologists</a>
                    have deemed these linked databases important sources of information.
                  </p>
                '''
                annotation.addExtents(matches)
                document.addAnnotation(annotation)
    def on_ready_event(self, document):
        '''
        Look up this journal's publisher copyright policy in SHERPA/RoMEO by
        ISSN and attach an annotation summarising the RoMEO archiving colour.
        '''
        issn = utopia.tools.utils.metadata(document, 'publication-issn')
        doi = utopia.tools.utils.metadata(document, 'identifiers[doi]')
        if issn is not None:
            params = {'versions': 'all', 'issn': issn, 'ak': self.apiKey}

            url = 'http://www.sherpa.ac.uk/romeo/api29.php?' + urllib.urlencode(
                params)
            srResponse = urllib2.urlopen(url, timeout=8)
            srData = srResponse.read()

            root = etree.fromstring(srData)
            #print etree.tostring(root, pretty_print=True, encoding='utf8')

            colour = root.find('publishers/publisher/romeocolour')

            if colour is not None:
                a = spineapi.Annotation()
                a['concept'] = 'SherpaRomeo'
                a['property:doi'] = doi
                a['property:name'] = 'Sherpa/RoMEO'
                a['property:sourceDatabase'] = 'sherparomeo'
                a['property:sourceDescription'] = '<p><a href="http://www.sherpa.ac.uk/romeo/">SHERPA/RoMEO</a> provides information about publisher copyright policies for this article.</p>'
                a['property:description'] = "Archiving status is '" + colour.text + "'."

                # Human-readable meaning of each RoMEO archiving colour.
                # NOTE(review): a colour outside these four (or a missing
                # journal/publisher element below) would raise here and abort
                # the event handler - confirm the API only returns these values
                explanation = {}
                explanation[
                    'green'] = "the author can archive pre-print <em>and</em> post-print or publisher's version/PDF"
                explanation[
                    'blue'] = "the author can archive post-print (i.e. final draft post-refereeing) or publisher's version/PDF"
                explanation[
                    'yellow'] = "the author can archive pre-print (i.e. pre-refereeing)"
                explanation[
                    'white'] = "archiving of this article not formally supported"

                journalTitle = root.find('journals/journal/jtitle')
                publisherName = root.find('publishers/publisher/name')
                publisherURL = root.find('publishers/publisher/homeurl')

                # Build the sidebar summary text
                xhtml = "<p>"
                xhtml = xhtml + 'This ' + journalTitle.text + ' article, published by <a href="' + publisherURL.text + '">' + publisherName.text + '</a>, is classified as being <a href="http://www.sherpa.ac.uk/romeo/definitions.php">RoMEO ' + colour.text + '</a>. '
                xhtml = xhtml + 'This means that ' + explanation[
                    colour.text] + '.</p>'
                xhtml = xhtml + '<p>Other <a href="http://www.sherpa.ac.uk/romeo/issn/%s/">details and conditions</a> apply.</p>' % issn
                a['property:xhtml'] = xhtml

                document.addAnnotation(a)
    def on_ready_event(self, document):
        '''
        Look up this journal's publisher copyright policy in SHERPA/RoMEO by
        ISSN and, when the response is complete, attach an annotation
        summarising the RoMEO archiving colour.

        Fix: guard against responses whose colour is not one of the four
        mapped values (the API also uses e.g. 'gray'), and against missing
        journal/publisher elements - either previously raised
        KeyError/AttributeError and aborted the event handler.
        '''
        issn = common.utils.metadata(document, 'issn')
        doi = common.utils.metadata(document, 'doi')
        if issn is not None:
            params = {
                'versions': 'all',
                'issn': issn,
                'ak': self.apiKey
            }

            url = 'http://www.sherpa.ac.uk/romeo/api29.php?' + urllib.urlencode(params)
            srResponse = urllib2.urlopen(url, timeout=8)
            srData = srResponse.read()

            root = etree.fromstring(srData)

            colour = root.find('publishers/publisher/romeocolour')

            # Human-readable meaning of each RoMEO archiving colour
            explanation = {}
            explanation['green'] = "the author can archive pre-print <em>and</em> post-print or publisher's version/PDF"
            explanation['blue'] = "the author can archive post-print (i.e. final draft post-refereeing) or publisher's version/PDF"
            explanation['yellow'] = "the author can archive pre-print (i.e. pre-refereeing)"
            explanation['white'] = "archiving of this article not formally supported"

            journalTitle = root.find('journals/journal/jtitle')
            publisherName = root.find('publishers/publisher/name')
            publisherURL = root.find('publishers/publisher/homeurl')

            # Only annotate when the response contains every element the
            # summary text needs
            if (colour is not None and colour.text in explanation
                    and journalTitle is not None
                    and publisherName is not None
                    and publisherURL is not None):
                a = spineapi.Annotation()
                a['concept'] = 'SherpaRomeo'
                a['property:doi'] = doi
                a['property:name'] = 'Sherpa/RoMEO'
                a['property:sourceDatabase'] = 'sherparomeo'
                a['property:sourceDescription'] = '<p><a href="http://www.sherpa.ac.uk/romeo/">SHERPA/RoMEO</a> provides information about publisher copyright policies for this article.</p>'
                a['property:description'] = "Archiving status is '" +colour.text+ "'."

                xhtml = "<p>"
                xhtml = xhtml + 'This '+ journalTitle.text + ' article, published by <a href="' + publisherURL.text +'">' + publisherName.text + '</a>, is classified as being <a href="http://www.sherpa.ac.uk/romeo/definitions.php">RoMEO ' + colour.text + '</a>. '
                xhtml = xhtml + 'This means that ' + explanation[colour.text] + '.</p>'
                xhtml = xhtml + '<p>Other <a href="http://www.sherpa.ac.uk/romeo/issn/%s/">details and conditions</a> apply.</p>' % issn
                a['property:xhtml'] = xhtml

                document.addAnnotation(a)
    def on_ready_event(self, document):
        '''
        Query the SciBite service for entities and news items related to this
        document's PubMed ID, and attach any results to the document as a
        single 'SciBite' sidebar annotation.
        '''
        pmid = common.utils.metadata(document, 'pmid')
        if pmid is None:
            return

        fragments = []

        # Entities related to the document
        query = {
            'app_id': self.app_id,
            'app_key': self.app_key,
            'i': pmid,
        }
        url = '{0}/DocumentEntitiesService?{1}'.format(self.app_uri, urllib.urlencode(query))
        payload = json.loads(urllib2.urlopen(url, timeout=15).read().decode('latin1'))
        if payload['RESP_SYS_STATUS'] == 'STAT_OK' and len(payload.get('RESP_PAYLOAD', [])) > 0:
            fragments.append('<h2>Related entities</h2>')
            for entity in payload['RESP_PAYLOAD']:
                fragments.append('<p><strong><a href="{0}">{1}</a></strong> ({2})</p>'.format(entity['bestLink'], entity['entityName'], entity['entityTypeDisplay']))

        # Up to ten related news items
        query = {
            'app_id': self.app_id,
            'app_key': self.app_key,
            'i': pmid,
            'n': '10',
        }
        url = '{0}/DocumentToNewsService?{1}'.format(self.app_uri, urllib.urlencode(query))
        payload = json.loads(urllib2.urlopen(url, timeout=15).read().decode('latin1'))
        if payload['RESP_SYS_STATUS'] == 'STAT_OK' and len(payload.get('RESP_PAYLOAD', [])) > 0:
            fragments.append('<h2>Related news</h2>')
            for bite in payload['RESP_PAYLOAD']:
                fragments.append(self.renderBite(bite))

        # Only annotate if at least one of the calls produced content
        if fragments:
            a = spineapi.Annotation()
            a['concept'] = 'SciBite'
            a['property:pmid'] = pmid
            a['property:name'] = 'SciBite'
            a['property:sourceDatabase'] = 'scibite'
            a['property:xhtml'] = ''.join(fragments)
            a['property:description'] = 'Biomedical News & Intelligence'
            a['property:sourceDescription'] = '<p><a href="http://scibite.com/">SciBite</a> scans 1000s of papers, patents, blogs, newsfeeds and more to bring you daily alerts on critical topics in biomedicine.</p>'
            document.addAnnotation(a)
    def on_ready_event(self, document):
        '''
        Find AGI (gene identifier) matches in the document text and annotate
        each distinct identifier with links to plant gene databases,
        rebranding the annotation when the DOI identifies a known ASPB
        journal.  (Fix: removed an unused local HTML template that was built
        on every iteration and never referenced.)
        '''
        doi = common.utils.metadata(document, 'doi', '')

        # Find and aggregate AGI instances in the document
        matches_by_agi = {}
        for match in document.search(self.agiRegex, spineapi.RegExp + spineapi.WholeWordsOnly):
            agi = match.text()
            matches_by_agi.setdefault(agi, [])
            matches_by_agi[agi].append(match)

        # For each AGI add an annotation spanning all of its matches
        if len(matches_by_agi) > 0:
            for agi, matches in matches_by_agi.iteritems():
                annotation = spineapi.Annotation()
                annotation['concept'] = 'AGI'
                annotation['property:agi'] = agi
                annotation['property:name'] = 'Plant gene databases'
                annotation['property:description'] = 'American Society of Plant Biologists'
                annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/aspb_logo.png', 'image/png')
                # Journal-specific branding, chosen by DOI prefix
                if doi.startswith('10.1104/'):  # Plant Physiology
                    annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/pp_logo.png', 'image/png')
                    annotation['property:description'] = 'From Plant Physiology'
                elif doi.startswith('10.1105/'):  # The Plant Cell
                    annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/tpc_logo.png', 'image/png')
                    annotation['property:description'] = 'From Plant Cell'
                annotation['property:sourceDescription'] = '''
                  <p>
                    The <a href="http://www.aspb.org/">American Society of Plant Biologists</a>
                    have deemed these linked databases important sources of information.
                  </p>
                '''
                annotation.addExtents(matches)
                document.addAnnotation(annotation)
    def on_ready_event(self, document):
        '''
        Check whether Altmetric knows about this document's DOI and, if so,
        attach the raw Altmetric JSON to the document for the sidebar.
        '''
        doi = common.utils.metadata(document, 'doi')
        if doi is not None:
            try:
                # Check to see if the DOI is known
                url = 'http://api.altmetric.com/{0}/doi/{2}?key={1}'.format(self.api_version, self.key, doi)
                data = urllib2.urlopen(url, timeout=8).read()
                json.loads(data) # Just check this is possible - throws exception otherwise

                a = spineapi.Annotation()
                a['concept'] = 'Altmetric'
                a['property:doi'] = doi
                a['property:json'] = data
                a['property:name'] = 'Altmetric'
                a['property:description'] = 'Who is talking about this article?'
                a['property:sourceDatabase'] = 'altmetric'
                a['property:sourceDescription'] = '<p>Discover, track and analyse online activity related to this article with <a href="http://www.altmetric.com/">Altmetric</a>.</p>'
                a['session:weight'] = '1'
                a['session:default'] = '1'
                document.addAnnotation(a)
            except (urllib2.URLError, socket.timeout):
                # Unknown DOI or unreachable service: silently skip
                pass
        # NOTE(review): helper defined inside this event handler but never
        # called within it; presumably used (or intended) by related code -
        # confirm before removing.  Note 'category' may still be None when
        # stored below.
        def add_error(component, method, category=None, message=None, exception=None):
            # Unwrap a URLError that merely wraps a socket timeout
            if exception is not None:
                if isinstance(exception, urllib2.URLError) and isinstance(exception.reason, socket.timeout):
                    exception = exception.reason

                # Map the exception type onto a category/message; HTTPError
                # must be tested before URLError (it is a subclass)
                if isinstance(exception, socket.timeout):
                    category = "timeout"
                    message = "The server did not respond"
                elif isinstance(exception, urllib2.HTTPError):
                    category = "server"
                    message = unicode(getattr(exception, "reason", "The server did not respond as expected"))
                elif isinstance(exception, urllib2.URLError):
                    category = "connection"
                    message = unicode(getattr(exception, "reason", "The server could not be found"))
            error = spineapi.Annotation()
            error["concept"] = "Error"
            error["property:component"] = component
            error["property:method"] = method
            error["property:category"] = category
            if message is not None:
                error["property:message"] = message
            document.addAnnotation(error, "errors.metadata")
    def on_ready_event(self, document):
        '''
        Look up this document's DOI in NeuroVault and, when a collection of
        statistical maps is linked to it, attach an annotation listing the
        collection and each of its images.
        '''
        doi = common.utils.metadata(document, 'doi')
        if doi is None:
            return None
        collection = self._get_collection_by_doi(doi)
        if not collection:
            return None
        image_ids = self._get_collection_images(collection)
        # Lead with a link to the collection itself
        htmls = [
            """<p><a href="{}" title="View collection on NeuroVault">View collection on NeuroVault</a></p>"""
            .format(self.COLLECTIONS_URL % collection['id'])
        ]
        for image_id in image_ids:
            info = self._get_image_metainfo(image_id)
            if not info:
                # Skip images whose metadata could not be fetched
                continue
            info['url'] = self.IMAGES_URL % image_id
            info['image_id'] = image_id
            html = u'''
             <div id="{image_id}" class="box">
              <p>
               <span class="name">{name}</span> /
               <span class="map_type">{map_type}</span> /
               <span class="title"><a href="{url}" title="View in NeuroVault">{description}</a></span>
              </p>
             </div>'''.format(**info)
            htmls.append(html)

        # Always true here: htmls always contains the collection link
        if len(htmls) > 0:
            annotation = spineapi.Annotation()
            annotation['concept'] = 'NeuroVaultReference'
            annotation['property:html'] = ''.join(htmls)
            annotation['property:name'] = 'NeuroVault'
            annotation[
                'property:description'] = 'Publicly available supplementary data'
            annotation['property:sourceDatabase'] = 'neurovault'
            annotation[
                'property:sourceDescription'] = '<p><a href="http://neurovault.org/">Neuro<strong>Vault</strong></a> allows neuroimaging researchers to publish their full resultant statistical maps to  supplement their publications.</p>'
            document.addAnnotation(annotation)
    def on_persist_event(self, document):
        '''
        Push locally queued, non-volatile annotations to the kend service and
        propagate local deletions.  Successfully persisted annotations are
        moved from the 'PersistQueue' scratch list into the document proper.

        Fix: removed the unreachable 'pass' statements that followed the
        bare 'raise' in both exception handlers (dead code).
        '''
        client = kend.client.Client()

        document_id, doi = self._resolve(document)
        if document_id is not None:
            # Persist any non-volatile annotations queued for upload
            for annotation in document.annotations('PersistQueue'):
                if 'session:volatile' not in annotation:
                    try:
                        ka = kend.converter.Annotation.spineapi2kend(annotation, document_id)
                        ka.context = self._context_

                        updated = client.persistAnnotation(ka, context = self._context_)

                        if isinstance(updated, kend.model.Annotation):
                            # Copy server-assigned bookkeeping back onto the
                            # local annotation
                            for key in ('id', 'created', 'author', 'revision', 'edit', 'media_edit'):
                                annotation[key] = getattr(updated, key)
                            annotation.removePropertyAll('session:media')
                            for media in updated.media:
                                mediaDict = {}
                                for k in ['name', 'src', 'type', 'revision', 'size', 'edit']:
                                    if hasattr(media, k):
                                        mediaDict[k] = getattr(media, k)
                                annotation.insertProperty('session:media', urllib.urlencode(mediaDict))
                            document.removeAnnotation(annotation, 'PersistQueue')
                            document.addAnnotation(annotation)
                    except:
                        # NOTE(review): re-raising makes this handler a no-op;
                        # decide whether a single failure should really abort
                        # the whole persist pass
                        raise

            # Propagate deletions, then clear the local deletion record
            for annotation in document.annotations(document.deletedItemsScratchId()):
                try:
                    if 'session:volatile' not in annotation:
                        ka = kend.converter.Annotation.spineapi2kend(annotation, document_id)
                        client.deleteAnnotation(ka)
                    document.removeAnnotation(annotation, document.deletedItemsScratchId())
                except:
                    raise
    def on_activate_event(self, document, data={}):
        '''
        Handle an activation request: when asked to 'annotate', send the
        document text to the remote 3DM service for the given domain and add
        the returned entity mentions as annotations.
        '''
        # NOTE(review): mutable default argument 'data={}' is shared between
        # calls; safe only while this method never mutates it
        action = data.get("action")
        domain = data.get("domain")

        if self.annotatedDomains == None:
            self.annotatedDomains = []

        if action == "annotate":
            print "starting 3DM anntotation . . ."
            ns = {"r": "GPCR"}
            pubmedId = common.utils.metadata(document, "pmid")
            if pubmedId == None:
                pubmedId = "0"
            print "sending text to remote server (" + pubmedId + "). . ."
            # Remote call: fetch entity mentions for the full document text
            textMentions = self.getMentions(domain, document.text(), pubmedId)
            print "recieved response, adding annotations for domain " + domain + " . . ."
            objectlist = []
            mention_cache = {}
            # Group mentions by their rendered HTML so each distinct popup is
            # added once, covering all of its mentions; SPECIES and PDB
            # mentions are excluded
            for mention in textMentions:
                if mention.mentionType != "SPECIES" and mention.mentionType != "PDB":
                    newData = self.rewriteData(mention)
                    mention.data = newData
                    html, css, js = self.buildHtml(domain, mention)
                    mention.html = html.encode("utf-8")
                    mention.css = css.encode("utf-8")
                    mention.js = js.encode("utf-8")
                    mention_cache.setdefault(mention.html, [])
                    mention_cache[mention.html].append(mention)

            for html, mentions in mention_cache.iteritems():
                annotation = self.createAnnotation(domain, document, html, mentions)
                annotation["displayRelevance"] = "2000"
                annotation["displayRank"] = "2000"
                document.addAnnotation(annotation)

            # Mark this domain as processed on the document
            document.addAnnotation(Annotation(), domain)
            print "done adding annotations."
Example #35
0
    def on_ready_event(self, document):
        '''
        Record the title and DOI scraped from the document content itself as
        a 'Content' metadata source, so later consumers can rank it against
        other metadata sources.
        '''
        # Scrape title and DOI from document
        title = utopia.tools.utils.metadata(document, 'title')
        doi = utopia.tools.utils.metadata(document, 'identifiers[doi]')
        if title is None and doi is None:
            return

        # Register the 'Content' metadata source itself
        link = spineapi.Annotation()
        link['session:volatile'] = '1'
        link['concept'] = 'MetadataSource'
        link['rank'] = '1000'
        link['source'] = 'Content'
        link['listName'] = 'ContentMetadata'
        document.addAnnotation(link)

        # Store the scraped values on that source's list
        annotation = spineapi.Annotation()
        annotation['session:volatile'] = '1'
        annotation['concept'] = 'DocumentMetadata'
        annotation['property:source'] = 'Content'
        if title is not None:
            annotation['property:title'] = title
        if doi is not None:
            annotation['property:doi'] = doi
        document.addAnnotation(annotation, link['listName'])
    def on_ready_event(self, document):
        '''
        Look up this document's DOI in NeuroVault and, when a collection of
        statistical maps is linked to it, attach an annotation listing the
        collection and each of its images.
        '''
        doi = common.utils.metadata(document, 'doi')
        if doi is None:
            return None
        collection = self._get_collection_by_doi(doi)
        if not collection:
            return None

        # Lead with a link to the collection itself
        boxes = [
            """<p><a href="{}" title="View collection on NeuroVault">View collection on NeuroVault</a></p>""".format(self.COLLECTIONS_URL % collection['id'])
            ]
        # One box per image whose metadata we can fetch
        template = u'''
             <div id="{image_id}" class="box">
              <p>
               <span class="name">{name}</span> /
               <span class="map_type">{map_type}</span> /
               <span class="title"><a href="{url}" title="View in NeuroVault">{description}</a></span>
              </p>
             </div>'''
        for image_id in self._get_collection_images(collection):
            info = self._get_image_metainfo(image_id)
            if info:
                info['url'] = self.IMAGES_URL % image_id
                info['image_id'] = image_id
                boxes.append(template.format(**info))

        if boxes:
            annotation = spineapi.Annotation()
            annotation['concept'] = 'NeuroVaultReference'
            annotation['property:html'] = ''.join(boxes)
            annotation['property:name'] = 'NeuroVault'
            annotation['property:description'] = 'Publicly available supplementary data'
            annotation['property:sourceDatabase'] = 'neurovault'
            annotation['property:sourceDescription'] = '<p><a href="http://neurovault.org/">Neuro<strong>Vault</strong></a> allows neuroimaging researchers to publish their full resultant statistical maps to  supplement their publications.</p>'
            document.addAnnotation(annotation)
    def on_ready_event(self, document):
        '''
        Fetch annotations for this document from the kend service and add
        them to the document, routing Semantic Biochemical Journal metadata
        onto a dedicated accredited metadata list.
        '''
        document_id = utopia.tools.utils.metadata(document,
                                                  'identifiers[utopia]')
        if document_id is not None:

            kwargs = {'document': document_id, 'context': self._context_}
            doi = utopia.tools.utils.metadata(document, 'identifiers[doi]')
            if doi is not None:
                kwargs['doi'] = doi
            annotations = kend.client.Client().annotations(**kwargs)

            # Accredited list, created lazily on first matching annotation
            link = None
            if annotations is not None:
                for group in annotations:
                    for ann in group.annotations:
                        a = kend.converter.Annotation.kend2spineapi(
                            ann, document)
                        # Only annotations authored by this specific account
                        # get routed onto the accredited metadata list
                        if a.get(
                                'author'
                        ) == 'http://utopia.cs.manchester.ac.uk/users/11679':
                            if a.get('concept') in ("DocumentMetadata",
                                                    "AuthorAffiliation",
                                                    "DocumentReference"):
                                if link is None:
                                    link = document.newAccList('metadata', 100)
                                    link[
                                        'property:sourceDatabase'] = 'biochemj'
                                    link[
                                        'property:sourceTitle'] = 'The Semantic Biochemical Journal'
                                    link[
                                        'property:sourceDescription'] = '<p>Made available by <a href="http://www.portlandpress.com/">Portland Press Limited</a> as part of the <a href="http://www.biochemj.org/bj/semantic_faq.htm">Semantic Biochemical Journal</a>.'

                                # Modify Bibliography Entries
                                # NOTE(review): this branch is unreachable -
                                # the enclosing check restricts the concept to
                                # the three values above, which do not include
                                # 'Citation'.  A sibling implementation tests
                                # 'DocumentReference' here instead; confirm
                                # which is intended.
                                if a.get('concept') == 'Citation':
                                    for keyTo, keyFrom in {
                                            'property:title':
                                            'property:articleTitle',
                                            'property:authors':
                                            'property:articleAuthors',
                                            'property:year':
                                            'property:articleYear',
                                            'property:volume':
                                            'property:articleVolume',
                                            'property:source':
                                            'property:journalTitle',
                                    }.iteritems():
                                        if keyFrom in a:
                                            a[keyTo] = a[keyFrom]
                                        # NOTE(review): these two assignments
                                        # run once per mapped key; they look
                                        # as though they belong outside the
                                        # loop
                                        a['property:sourceDatabase'] = 'biochemj'
                                        a['property:sourceDescription'] = 'Thingy'

                                document.addAnnotation(a, link['scratch'])
                            else:
                                document.addAnnotation(a)
                        else:
                            document.addAnnotation(a)
    def on_ready_event(self, document):
        '''
        Fetch annotations for this document from the kend service and add
        them to the document, routing Semantic Biochemical Journal metadata
        onto a dedicated accredited metadata list and renaming bibliography
        keys on DocumentReference annotations.
        '''
        document_id, doi = self._resolve(document)
        if document_id is not None:
            kwargs = { 'document': document_id, 'context': self._context_ }
            if doi is not None:
                kwargs['doi'] = doi
            annotations = kend.client.Client().annotations(**kwargs)

            # Accredited list, created lazily on first matching annotation
            link = None
            if annotations is not None:
                for group in annotations:
                    for ann in group.annotations:
                        a = kend.converter.Annotation.kend2spineapi(ann, document)
                        # Only annotations authored by this specific account
                        # get routed onto the accredited metadata list
                        if a.get('author') == 'http://utopia.cs.manchester.ac.uk/users/11679':
                            if a.get('concept') in ("DocumentMetadata", "AuthorAffiliation", "DocumentReference"):
                                if link is None:
                                    link = document.newAccList('metadata', 100)
                                    link['property:sourceDatabase'] = 'biochemj'
                                    link['property:sourceTitle'] = 'The Semantic Biochemical Journal'
                                    link['property:sourceDescription'] = '<p>Made available by <a href="http://www.portlandpress.com/">Portland Press Limited</a> as part of the <a href="http://www.biochemj.org/bj/semantic_faq.htm">Semantic Biochemical Journal</a>.'

                                # Modify Bibliography Entries
                                if a.get('concept') == 'DocumentReference':
                                    for keyTo, keyFrom in {
                                                'property:title': 'property:articleTitle',
                                                'property:authors': 'property:articleAuthors',
                                                'property:year': 'property:articleYear',
                                                'property:volume': 'property:articleVolume',
                                                'property:source': 'property:journalTitle',
                                            }.iteritems():
                                        if keyFrom in a:
                                            a[keyTo] = a[keyFrom]
                                        # NOTE(review): these two assignments
                                        # run once per mapped key; they look
                                        # as though they belong outside the
                                        # loop
                                        a['property:sourceDatabase'] = 'biochemj'
                                        a['property:sourceDescription'] = 'Thingy'

                                document.addAnnotation(a, link['scratch'])
                            else:
                                document.addAnnotation(a)
                        else:
                            document.addAnnotation(a)
    def on_ready_event(self, document):
        '''
        Search FigShare for items linked to this document's DOI and, if any
        are found, attach a 'FigShareReference' annotation listing them
        (typically publicly available supplementary material).
        '''
        doi = common.utils.metadata(document, 'doi')
        if doi is not None:
            page = 1
            items_retrieved = 0
            api_search_url = 'http://api.figshare.com/v1/articles/search?'
            query = {'search_for': doi, 'has_link': doi}

            htmls = []

            # Page through the search results, accumulating one HTML box
            # per matching FigShare item
            while True:
                query['page'] = page

                response = urllib2.urlopen(api_search_url + urllib.urlencode(query), timeout=8).read()
                data = json.loads(response)

                items = data.get('items', [])
                items_found = int(data.get('items_found', 0))

                # Bail if no items found
                if len(items) == 0 or items_found <= 0:
                    break

                # Bail after ten pages of stuff
                if page > 10:
                    break

                items_retrieved += len(items)

                for item in items:
                    title = flatten(item.get('title'))
                    description = flatten(item.get('description'))
                    url = item.get('url')
                    article_id = item.get('article_id')
                    authors = item.get('authors', [])

                    html = u'''
                      <div id="{article_id}" class="box">
                        <p>
                          <span class="title">{title}</span>
                          <span class="authors">{authors}</span>
                          <a href="{url}" title="Explore FigShare">[Link]</a>
                        </p>
                        <p class="readmore">
                          {description}
                        </p>
                      </div>
                    '''.format(**{
                        'article_id': article_id,
                        'title': title,
                        # Link to the item's page on FigShare; previously this
                        # was filled with the item's bare DOI, which is not a
                        # resolvable URL on its own
                        'url': url,
                        'description': description,
                        'authors': u', '.join((author['author_name'] for author in authors))
                    })

                    htmls.append(html)

                # Stop if we've retrieved the number of items expected
                if items_retrieved >= items_found:
                    break

                page += 1

            # Only annotate the document if at least one item was found
            if len(htmls) > 0:
                annotation = spineapi.Annotation()
                annotation['concept'] = 'FigShareReference'
                annotation['property:html'] = ''.join(htmls)
                annotation['property:name'] = 'FigShare'
                annotation['property:description'] = 'Publicly available supplementary material'
                annotation['property:sourceDatabase'] = 'figshare'
                annotation['property:sourceDescription'] = '<p><a href="http://figshare.com/">fig<strong>share</strong></a> allows researchers to publish all of their research outputs in seconds in an easily citable, sharable and discoverable manner.</p>'
                document.addAnnotation(annotation)
    def on_ready_event(self, document):
        '''
        Scrape a HighWire-hosted full-text HTML page for this document's DOI
        and mine it for metadata: the bibliography, the in-text citation
        contexts that reference it, and any tables. The results are attached
        to the document as annotations.
        '''

        doi = utopialib.utils.metadata(document, 'identifiers[doi]')
        if doi is not None:
            info = {}

            # Resolve the DOI to find the publisher's website
            response = urllib2.urlopen('http://dx.doi.org/{0}'.format(doi),
                                       timeout=8)

            # Parse page to find (if there) the full text URL
            parser = etree.HTMLParser()
            html = etree.parse(response, parser)

            # Only continue if this is a highwire HTML page
            if len(html.xpath("/html/head/meta[@name='HW.identifier']")) == 0:
                return

            # Now make sure we have the full text XHTML
            citation_fulltext_html_url = html.xpath(
                "/html/head/meta[@name='citation_fulltext_html_url']/@content")
            if len(citation_fulltext_html_url) > 0:
                citation_fulltext_html_url = citation_fulltext_html_url[0]

                # Fetch that full text page (if different to the current one)
                if citation_fulltext_html_url != response.geturl():
                    response = urllib2.urlopen(citation_fulltext_html_url,
                                               timeout=8)
                    html = etree.parse(response, parser)

                #print etree.tostring(html, pretty_print=True, encoding='utf8')

                # Now parse out the bibliography
                info['citations'] = []
                info['citations_by_id'] = {}

                # Each <li> in the HighWire 'cit-list' is one bibliography
                # entry; `query` (module-level helper) evaluates each XPath
                # against the item and returns a dict of the matched fields
                for bibitem in html.xpath(
                        "//div[contains(concat(' ', normalize-space(@class), ' '), ' ref-list ')]//ol[contains(concat(' ', normalize-space(@class), ' '), ' cit-list ')]/li"
                ):
                    citation = query(
                        bibitem, {
                            'id':
                            'a/@id',
                            'label':
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' ref-label ')]/text()",
                            'title':
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-article-title ')]/text()",
                            'year':
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-date ')]/text()",
                            'publication-title':
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-jnl-abbrev ')]/text()",
                            'volume':
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-vol ')]/text()",
                            'issue':
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-issue ')]/text()",
                            'pagefrom':
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-fpage ')]/text()",
                            'pageto':
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-lpage ')]/text()",
                            'pmid':
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-pmid ')]/text()",
                            'doi':
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-doi ')]/text()",
                            'etree':
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-metadata ')]",
                        })
                    # Authors are collected separately as 'Surname, Given' pairs
                    authors = []
                    for a in bibitem.xpath(
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-auth ')]"
                    ):
                        surname = a.xpath(
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-surname ')]/text()"
                        )
                        given_names = a.xpath(
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-given-names ')]/text()"
                        )
                        if len(surname) > 0 and len(given_names) > 0:
                            authors.append(u'{0}, {1}'.format(
                                surname[0], given_names[0]).strip(', '))
                    if len(authors) > 0:
                        citation['authors'] = authors
                    citation['contexts'] = []
                    citation['displayText'] = utopia.citation.format(citation)

                    info['citations'].append(citation)
                    info['citations_by_id'][citation['id']] = citation
                    #print citation

                #######################################################################################
                # Parse in-text citations if present

                # Context window sizes (in characters) used when recording
                # the text surrounding each in-text citation
                min_length = 10
                max_length = 20
                for paragraph in html.xpath(
                        "//div[contains(concat(' ', normalize-space(@class), ' '), ' article ')]//p"
                ):
                    # text_stack and xref_stack are kept in lockstep: odd
                    # indices hold the text of citation links (with the
                    # citations they resolve to in xref_stack), even indices
                    # hold the plain text in between (xref_stack entry None)
                    text_stack = [paragraph.text or '']
                    xref_stack = [None]
                    for elem in paragraph:
                        if len(elem.xpath('self::a[@class="xref-bibr"]')) > 0:
                            text_stack.append(
                                etree.tostring(elem,
                                               method='text',
                                               encoding=unicode,
                                               with_tail=False))
                            text_stack.append(elem.tail or '')
                            # Link fragment (minus leading '#') identifies the
                            # bibliography entry it points at
                            xref = info['citations_by_id'].get(
                                elem.get('href', '')[1:])
                            if xref is not None:
                                xref_stack += [[xref], None]
                            else:
                                xref_stack += [[], None]
                        elif isinstance(elem, etree._Entity):
                            # `entities` (module-level) presumably maps entity
                            # names to Unicode code points — TODO confirm
                            points = entities.get(elem.text[1:-1])
                            if points is not None:
                                text_stack[-1] += ''.join(
                                    (unichr(p) for p in points))
                            else:
                                text_stack[-1] += etree.tostring(
                                    elem, encoding=unicode)
                        else:
                            # Skip the textual content of floating elements
                            # (figures etc.), keeping only their tail text
                            if elem.get('position') == 'float':
                                text_stack[-1] += elem.tail or ''
                            else:
                                text_stack[-1] += etree.tostring(
                                    elem, method='text', encoding=unicode)
                    # Find and collapse ranges in the text
                    for i in xrange(len(xref_stack) - 3, 1, -2):
                        text = text_stack[i].strip()
                        #print len(text), (text in u'-\u2010\u2011\u2012\u2013\u2014\u2015'), u''.join(text_stack[i-1:i+2]).encode('utf8')
                        # if this text is a dash, we need to coalesce the text fragments
                        if len(
                                text
                        ) == 1 and text in u'-\u2010\u2011\u2012\u2013\u2014\u2015':
                            text_stack[i - 1:i + 2] = [
                                u''.join(text_stack[i - 1:i + 2])
                            ]
                            xref_stack[i - 1:i + 2] = [
                                xref_stack[i - 1] + xref_stack[i + 1]
                            ]
                    #for text in text_stack:
                    #    print text.encode('utf8')
                    # Then make sure we resolve the implied citations
                    for i in xrange(1, len(xref_stack), 2):
                        # Get actual cross references
                        xrefs = xref_stack[i]

                        # Expand cross references
                        try:
                            # A coalesced pair represents a range like '3-7':
                            # splice in all citations whose labels fall
                            # strictly between the two endpoints
                            if len(xrefs) == 2:
                                labelfrom = int(xrefs[0].get('label'))
                                labelto = int(xrefs[1].get('label'))
                                candidates = {}
                                midlabels = [
                                    unicode(midlabel) for midlabel in xrange(
                                        labelfrom + 1, labelto)
                                ]
                                for candidate in info['citations']:
                                    if candidate.get('label') in midlabels:
                                        candidates[int(candidate.get(
                                            'label'))] = candidate
                                xrefs[1:-1] = candidates.values()
                        except:
                            raise
                    # Find and collapse lists in the text
                    for i in xrange(len(xref_stack) - 3, 1, -2):
                        text = text_stack[i].strip()
                        # if this text is a comma, we need to coalesce the text fragments
                        if len(text) == 1 and text == ',':
                            text_stack[i - 1:i + 2] = [
                                u''.join(text_stack[i - 1:i + 2])
                            ]
                            xref_stack[i - 1:i + 2] = [
                                xref_stack[i - 1] + xref_stack[i + 1]
                            ]
                    # Expand citations to include brackets (on both sides)
                    for i in xrange(len(xref_stack) - 2, 0, -2):
                        before = text_stack[i - 1].strip()[-1:]
                        text = text_stack[i].strip()
                        after = text_stack[i + 1].strip()[:1]
                        # if this text is a comma, we need to coalesce the text fragments
                        #print before.encode('utf'), after.encode('utf')
                        if len(before) > 0 and before in '({[' and len(
                                after) > 0 and after in ')}]':
                            text_stack[i - 1] = re.sub(r'[({[](\s*)$', r'\1',
                                                       text_stack[i - 1])
                            text_stack[i + 1] = re.sub(r'^(\s*)[)}\]]', r'\1',
                                                       text_stack[i + 1])
                            text_stack[i] = before + text_stack[i] + after
                    #print repr(text_stack)
                    # Record the (before, label, after) context of each
                    # citation so it can later be located in the PDF text
                    for i in xrange(1, len(xref_stack), 2):
                        # Get context
                        before = u' '.join(text_stack[:i]).strip()
                        label = text_stack[i].strip()
                        after = u' '.join(text_stack[i + 1:]).strip()
                        # Strip out extraneous brackets
                        if len(
                                xref_stack[i]
                        ) > 1:  # Hack to differentiate single / multiple citations
                            # as multiple numbers tend not to have spaces between them
                            label = re.sub(
                                ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,])\s?[({[]?',
                                r'\1', label)
                        else:
                            label = re.sub(
                                ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,]\s?)[({[]?',
                                r'\1', label)
                        # Normalise context
                        before = re.sub(r'\s+', ' ',
                                        before)[-max_length:].strip()
                        label = re.sub(r'\s+', ' ', label)
                        after = re.sub(r'\s+', ' ', after)[:max_length].strip()
                        #print (before.encode('utf8'), label.encode('utf8'), after.encode('utf8'))
                        if len(before + after) > min_length:
                            for xref in xref_stack[i]:
                                xref['contexts'].append((before, label, after))
                        #print xref_stack[i]

                #######################################################################################
                # Parse tables if present

                info['tables'] = {}
                for table_url in html.xpath(
                        "//div[contains(concat(' ', normalize-space(@class), ' '), ' table-inline ')]/div[contains(concat(' ', normalize-space(@class), ' '), ' callout ')]//li[1]/a/@href"
                ):
                    table_url = urlparse.urljoin(citation_fulltext_html_url,
                                                 table_url)
                    #print table_url
                    response = urllib2.urlopen(table_url, timeout=8)
                    table_html = etree.parse(response, parser)
                    for table_expansion in table_html.xpath(
                            "//div[contains(concat(' ', normalize-space(@class), ' '), ' table-expansion ')]"
                    ):
                        id = table_expansion.get('id')
                        table = {}
                        table['xml'] = table_expansion.xpath('.//table[1]')[0]
                        table['caption_raw'] = table_expansion.xpath(
                            ".//span[contains(concat(' ', normalize-space(@class), ' '), ' caption-title ')][1]"
                        )[0]
                        if 'caption' not in table and 'caption_raw' in table:
                            table['caption'] = table['caption_raw']
                        if 'caption' in table:
                            # Flatten the caption element to whitespace-normalised text
                            table['caption'] = re.sub(
                                r'\s+', ' ',
                                etree.tostring(table['caption'],
                                               method='text',
                                               encoding=unicode).strip())
                        if 'xml' in table:
                            table['xml'] = etree.tostring(table['xml'],
                                                          encoding='utf8')
                        info['tables'][id] = table

                        #print table

            #print info
            if info is not None and len(info) > 0:
                # Enrich citation information with identifiers from PMC
                parser = etree.XMLParser(ns_clean=True,
                                         recover=True,
                                         remove_blank_text=True,
                                         encoding='utf8')
                # Map each cited PMID back to its citation id so PubMed
                # results can be matched up again below
                pmids = dict(((citation['pmid'], citation['id'])
                              for citation in info['citations']
                              if 'pmid' in citation and 'id' in citation))
                if len(pmids) > 0:
                    pubmed_abstracts = etree.fromstring(
                        utopialib.eutils.efetch(id=','.join(pmids.keys()),
                                                retmode='xml',
                                                rettype='abstract'), parser)
                    for idList in pubmed_abstracts.xpath(
                            'PubmedArticle/PubmedData/ArticleIdList'):
                        #print etree.tostring(idList)
                        pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                        if pmid in pmids:
                            citation = info['citations_by_id'][pmids[pmid]]
                            # Copy across DOI / PMCID / PII identifiers that
                            # the citation does not already carry
                            for key_name, id_name in (('doi', 'doi'), ('pmcid',
                                                                       'pmc'),
                                                      ('pii', 'pii')):
                                id = idList.findtext(
                                    'ArticleId[@IdType="{0}"]'.format(id_name))
                                if key_name not in citation and id is not None:
                                    citation[key_name] = id
                                    #print 'KEY', key_name, id

                # Generate sensible titles / descriptions / icons?
                journalTitle = info.get('publication-title', '')
                journalTitleSuffix = ''
                publisher = info.get('publisher', 'the publisher')
                if len(journalTitle) > 0:
                    journalTitleSuffix = ' ({0})'.format(journalTitle)

                # Create Metadata link annotation
                link = document.newAccList('metadata', 90)
                link['property:sourceIcon'] = ''
                link['property:sourceTitle'] = publisher
                link['property:sourceDescription'] = '''
                    <p>This information was provided by {0}{1}.</p>
                    '''.format(publisher, journalTitleSuffix)

                # Create Metadata annotation
                annotation = utopialib.utils.citation_to_annotation(
                    info.get('self', {}), 'DocumentMetadata')
                document.addAnnotation(annotation, link['scratch'])

                # Create Bibliography annotations
                for citation in info.get('citations', []):
                    annotation = utopialib.utils.citation_to_annotation(
                        citation)
                    document.addAnnotation(annotation, link['scratch'])

                #######################################################################################
                # Apply parsed data to document

                # Citations
                for citation in info['citations']:
                    # Find cross refs
                    for pre, label, post in citation.get('contexts', []):
                        #print (pre, label, post)
                        matches = document.findInContext(pre, label, post)
                        #print matches
                        if len(matches) > 0:
                            try:
                                annotation = spineapi.Annotation()
                                annotation = utopialib.utils.citation_to_annotation(
                                    citation, concept='ForwardCitation')
                                # Known direct-PDF URL patterns for PLOS and
                                # PubMed Central hosted articles
                                if 'doi' in citation and citation[
                                        'doi'].startswith('10.1371/'):
                                    citation[
                                        'pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format(
                                            'info:doi/{0}'.format(
                                                citation['doi']))
                                if 'pmcid' in citation:
                                    citation[
                                        'pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(
                                            citation['pmcid'])
                                for match in matches:
                                    annotation.addExtent(match)
                                document.addAnnotation(annotation,
                                                       link['scratch'])
                            except:
                                raise

                # Tables: locate each table's caption in the document and
                # attach the table data to the matched region
                for id, table in info.get('tables', {}).iteritems():
                    if 'caption' in table and 'xml' in table:
                        # `fuzz` (module-level helper) builds a regex from the
                        # caption text — presumably tolerant of whitespace and
                        # hyphenation differences; TODO confirm
                        regex = fuzz(table['caption'], strict=True)
                        #print regex

                        # convert oasis tables
                        ns = {
                            'oasis':
                            'http://docs.oasis-open.org/ns/oasis-exchange/table'
                        }
                        xml = etree.fromstring(table['xml'])
                        if xml.tag == '{{{0}}}table'.format(ns['oasis']):
                            # Rewrite OASIS exchange-table markup into plain
                            # HTML table elements (td/tr/thead/tbody), turning
                            # namest/nameend column spans into colspan and
                            # morerows into rowspan
                            for tgroup in xml.xpath('//oasis:tgroup',
                                                    namespaces=ns):
                                columns = {}
                                for colspec in tgroup.xpath('./oasis:colspec',
                                                            namespaces=ns):
                                    columns[colspec.get('colname')] = int(
                                        colspec.get('colnum'))
                                for section in tgroup.xpath(
                                        './oasis:thead|./oasis:tbody',
                                        namespaces=ns):
                                    isHead = (
                                        section.tag == '{{{0}}}thead'.format(
                                            ns['oasis']))
                                    for row in section.xpath('./oasis:row',
                                                             namespaces=ns):
                                        for entry in row.xpath('./oasis:entry',
                                                               namespaces=ns):
                                            colname = entry.get('colname')
                                            colst = entry.get('namest')
                                            colend = entry.get('nameend')
                                            if colst is not None and colend is not None:
                                                colspan = columns[
                                                    colend] - columns[colst] + 1
                                            else:
                                                colspan = 1
                                            if colspan > 1:
                                                entry.set(
                                                    'colspan',
                                                    unicode(colspan))
                                            morerows = entry.get('morerows')
                                            if morerows is not None:
                                                rowspan = int(morerows) + 1
                                            else:
                                                rowspan = 1
                                            if rowspan > 1:
                                                entry.set(
                                                    'rowspan',
                                                    unicode(rowspan))
                                            entry.tag = 'td'
                                        row.tag = 'tr'
                                    if isHead:
                                        section.tag = 'thead'
                                    else:
                                        section.tag = 'tbody'
                                    xml.append(section)
                                xml.tag = 'table'
                                #print etree.tostring(xml, pretty_print=True, encoding='utf8')
                                table['xml'] = etree.tostring(xml,
                                                              encoding='utf8')

                        matches = document.search(
                            regex, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) == 1:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Table'
                            annotation[
                                'session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(
                                    table['xml'])
                            annotation['session:volatile'] = '1'
                            annotation.addExtent(matches[0])
                            document.addAnnotation(annotation, link['scratch'])
                        else:
                            print '*********** failed to match table:', id
# Example #41
    def after_load_event(self, document):
        '''
        Collate the success/error annotations left in the 'errors.metadata'
        list by earlier metadata resolution steps. If any failures occurred,
        attach a warning annotation summarising them for the user.
        '''
        # Group error details by component, then by failure category
        errors = {}
        failures = 0
        successes = 0
        for error in document.annotations('errors.metadata'):
            if error.get('concept') == 'Success':
                successes += 1
            elif error.get('concept') == 'Error':
                failures += 1

            component = error.get('property:component')
            errors.setdefault(component, {})

            category = error.get('property:category')
            errors[component].setdefault(category, [])

            method = error.get('property:method')
            message = error.get('property:message', '')
            errors[component][category].append((method, message))

        # Count how many components reported each category of failure
        categories = {}
        for component, details in errors.items():
            for category in details:
                categories.setdefault(category, 0)
                categories[category] += 1

        # If there are errors, provide feedback to the user
        if failures > 0:
            # Check for likely client problems (every failure was a
            # connection error and nothing at all succeeded)
            if categories.get('connection', 0) == failures and successes == 0:
                summary = '''
                    Utopia could not reach any of the online services it would
                    normally use to identify this document, meaning you are
                    likely to see limited or no information below. You might
                    wish to check your Internet connection and reload the
                    document.
                    '''
            elif categories.get('timeout', 0) > 1:
                if categories.get('timeout', 0) == failures and successes == 0:
                    many = ''
                else:
                    many = 'some of'
                summary = '''
                    Utopia gave up contacting {0} the online services it would
                    normally use to identify this document because they were
                    taking too long to respond. You are likely to see limited
                    or no information below. You might wish to check your
                    Internet connection and reload the document.
                    '''.format(many)
            else:
                if failures == 1:
                    noun = 'An error'
                else:
                    noun = 'Errors'
                summary = '''
                    {0} occurred when trying to discover the identity
                    of this document. You are likely to see limited or no
                    information below.
                    '''.format(noun)
            html = '''
                <div class="box error">
                    <strong>Warning</strong>
                    <p>
                        {0}
                    </p>
                    <div class="expandable" title="Details...">
                    <ul>
            '''.format(summary)
            # One list item per failing component/category pair
            for component, details in errors.items():
                for category, methods in details.items():
                    if category != 'success':
                        summary = {
                            'timeout': '{0} did not respond',
                            'connection': 'Could not connect to {0}',
                            'server': '{0} behaved unexpectedly',
                        }.get(category, 'An error occurred accessing {0}')
                        methods_html = ', '.join(
                            ('<span title="{1}">{0}</span>'.format(
                                method, message)
                             for method, message in methods))
                        html += '<li>{0} (when accessing: {1}).</li>'.format(
                            summary.format('<strong>' + component +
                                           '</strong>'), methods_html)
            # FIX: close the outer "box error" div; this previously emitted
            # an opening <div>, leaving the box unclosed (malformed HTML)
            html += '''
                    </ul>
                    </div>
                </div>
            '''
            annotation = spineapi.Annotation()
            annotation['concept'] = 'Collated'
            annotation['property:html'] = html
            annotation['property:name'] = 'Error'
            annotation['session:weight'] = '1000'
            annotation['session:default'] = '1'
            annotation['session:headless'] = '1'
            document.addAnnotation(annotation)

        print(errors)
    def _populate(self, document):
        # Start by seeing what is already known about this document
        nlm = common.utils.metadata(document, 'raw_pmc_nlm')
        doi = common.utils.metadata(document, 'doi')

        if nlm is not None:
            info = self.JournalXMLParser(nlm)

            try:
                nlmdoi = info.articleDOI().lower()
            except: # FIXME which exception(s)?
                nlmdoi = None
                print "PMC returned nothing"

            if doi != nlmdoi:
                print "PMC returned wrong article:", info.articleDOI()
            else:
                print "PMC returned information about article:", info.articleTitle()

                link = document.newAccList('metadata')
                link['property:sourceDatabase'] = 'pubmed'
                link['property:sourceTitle'] = 'PubMed'
                link['property:sourceDescription'] = '<p><a href="http://www.ncbi.nlm.nih.gov/pubmed/">PubMed</a> comprises more than 21 million citations for biomedical literature from MEDLINE, life science journals, and online books.</p>'

                annotation = spineapi.Annotation()
                annotation['concept'] = 'DocumentMetadata'

                # print nlm.articlePublicationDate('epub')
                # print nlm.articlePublicationDate('epreprint')
                # print nlm.journalISSN('epub')

                annotation["property:identifier"] = 'info:doi%s' % info.articleDOI()
                annotation["property:source"] = 'Publisher/NLM'
                annotation["property:curatedBy"] = "PMC"

                annotation["property:journalTitle"] = info.journalTitle()
                annotation["property:journalPublisher"] = info.journalPublisher()
                annotation["property:journalISSN"] = info.journalISSN()
                annotation["property:articleAuthors"] = info.articleAuthors()
                annotation["property:articleTitle"] = info.articleTitle()
                annotation["property:articleDOI"] = info.articleDOI()
                annotation["property:articlePMID"] = info.articlePMID()
                annotation["property:articlePublisherID"] = info.articlePublisherID()
                annotation["property:articlePublicationDate"] = info.articlePublicationDate()
                annotation["property:articleVolume"] = info.articleVolume()
                annotation["property:articleIssue"] = info.articleIssue()
                if info.articlePages() is not None:
                    annotation["property:articlePages"] = "%s-%s" % info.articlePages()
                annotation["property:articleAbstract"] = info.articleAbstract()
                annotation["property:articleKeywords"] = info.articleKeywords()
                annotation["property:articleAbbreviations"] = info.articleAbbreviations()

                document.addAnnotation(annotation, link['scratch'])

                # FIXME: Annotation properties need to be lists
                for surname, forename, aff in info.articleAuthorAffiliationList():
                    annotation = spineapi.Annotation()
                    annotation['concept'] = "AuthorAffiliation"
                    annotation["property:curatedBy"] = "PMC"
                    annotation["property:authorSurname"] = surname
                    annotation["property:authorForename"] = forename
                    annotation["property:articleAuthor"] = "%s, %s" %(surname, forename)
                    annotation["property:affiliation"] = aff
                    document.addAnnotation(annotation, link['scratch'])

                for ref in info.articleReferenceList():
                    annotation = spineapi.Annotation()
                    annotation['concept'] = "DocumentReference"
                    if 'doi' in ref:
                        annotation["property:doi"] = ref['doi']
                    if 'pmid' in ref:
                        annotation["property:pmid"] = ref['pmid']
                    if 'title' in ref:
                        annotation["property:title"] = ref['title']
                    if 'label' in ref:
                        annotation["property:label"] = ref['label']
                    if 'authors' in ref:
                        annotation["property:authors"] = ref['authors']
                    if 'editors' in ref:
                        annotation["property:articleEditors"] = ref['editors']
                    if 'publication-title' in ref:
                        annotation["property:publication-title"] = ref['publication-title']
                    if 'type' in ref:
                        annotation["property:publicationType"] = ref['type']
                    if 'volume' in ref:
                        annotation["property:volume"] = ref['volume']
                    if 'issue' in ref:
                        annotation["property:issue"] = ref['issue']
                    if 'publisher' in ref:
                        annotation["property:publisher"] = ref['publisher']
                    if 'fpage' in ref and 'lpage' in ref:
                        annotation["property:pages"] = "%s-%s" % (ref['fpage'],ref['lpage'])
                    if 'year' in ref:
                        annotation["property:year"] = ref['year']

                    document.addAnnotation(annotation, link['scratch'])
    def on_ready_event(self, document):
        '''
        Find GEO accession numbers (GSE...) in the document text and annotate
        each occurrence with a formatted description of the GEO entry.  When
        the entry's PubMed ID matches this document's, additionally add a
        global annotation marking the data as associated with the article.
        '''
        # Find distinguishing ID
        pmid = common.utils.metadata(document, 'pmid')

        # Compile distinct GEO IDs in the text, keeping every match extent
        matches = {}
        for match in document.search(r'GSE\d+', spineapi.IgnoreCase + spineapi.WholeWordsOnly + spineapi.RegExp):
            matches.setdefault(match.text(), [])
            matches[match.text()].append(match)

        # Create annotations for each GEO ID
        for gse, extents in matches.iteritems():
            entry = self._fetchGEO(gse)

            dates = u'Submitted {0}'.format(entry['submission_date'])
            if 'last_update_date' in entry:
                dates += u'; last updated {0}'.format(entry['last_update_date'])
            dates += '.'

            # NOTE(review): entry values are assumed to be UTF-8 byte strings.
            # 'overall_design' is not present in every GEO entry (the later
            # version of this handler uses .get()), so fall back to '' rather
            # than raising KeyError.
            dataCitation=u'''<p>{0}. <strong>{1}</strong>.</p><p>{2}<br>({3})</p><p>{4}</p>'''.format(
                entry['contributors'].decode('utf8'),  entry['title'].decode('utf8'), entry.get('overall_design', '').decode('utf8'), entry['type'].decode('utf8'), dates)

            # {{0}} leaves a placeholder that is later filled with a summary
            # (or left blank for the global annotation)
            xhtml = u'<div class="box">{0}{{0}}<p>GEO Accession: <a href="{1}">{2}</a></p></div>'.format(
                dataCitation, entry['GEO_url'].decode('utf8'), gse)

            xhtml += u'<p><a href="{0}">Explore in InSilico DB...</a></p>'.format(entry['InSilicoDB_url'])

            srcdesc='''<p>The <a href="http://www.ncbi.nlm.nih.gov/geo">Gene
                       Expression Omnibus (GEO)</a> is a public repository
                       that archives and freely distributes microarray,
                       next-generation sequencing, and other forms of
                       high-throughput functional genomic data submitted
                       by the scientific community.</p>'''

            if entry.get('pubmed_id') == pmid:
                # add a global annotation
                annotation = spineapi.Annotation()
                annotation['concept'] = 'GEO'
                annotation['property:name'] = 'Gene Expression Omnibus'
                annotation['property:sourceDatabase'] = 'geo'
                annotation['property:description'] = '{0} (Data associated with this article)'.format(gse)
                annotation['property:sourceDescription'] = srcdesc
                annotation['property:xhtml'] = xhtml.format('') # Keep summary blank
                document.addAnnotation(annotation)

            # Generate summary: first 32 words shown, the rest behind a
            # "read more" span
            summary = entry.get('summary', '')
            if len(summary) > 0:
                summary_words = summary.split(' ')
                summary = u'<p><em>Summary:</em> '
                summary += u'{0}'.format(' '.join(summary_words[:32]))
                if len(summary_words) > 32:
                    summary += u' <span class="readmore">{0}</span>'.format(' '.join(summary_words[32:]))
                summary += u'</p>'

            # local annotation attached to every textual occurrence of the ID
            annotation = spineapi.Annotation()
            annotation['concept'] = 'GEO'
            annotation['property:name'] = 'Gene Expression Omnibus'
            annotation['property:sourceDatabase'] = 'geo'
            annotation['property:description'] = gse
            annotation['property:sourceDescription'] = srcdesc
            annotation['property:xhtml'] = xhtml.format(summary)
            for extent in extents:
                annotation.addExtent(extent)
            document.addAnnotation(annotation)
# Example #44
# 0
    def after_ready_event(self, document):
        '''
        Collate accumulated metadata into displayable annotations: pick the
        best-ranked accumulation list that contains citations and render a
        bibliography from it, then promote in-text citation and table
        annotations, and finally surface the document metadata annotation.
        '''
        print 'Formatting metadata'

        # Find highest matching metadata accumulation list for references
        source = None
        for accListLink in document.getAccLists('metadata'):
            matches = document.annotationsIf({'concept': 'Citation'},
                                             accListLink['scratch'])
            if len(matches) > 0:
                print 'Selected for [Citation] list %s with rank %s' % (
                    accListLink['scratch'], repr(accListLink.get('rank', 0)))
                source = accListLink
                bibliography = list(matches)
                bibliography.sort(key=sortfn)
                # Render each citation into one HTML bibliography string
                rt = ''
                for annotation in bibliography:
                    citation = utopia.tools.utils.citation_from_annotation(
                        annotation)
                    rt += utopia.citation.render(citation, links=True)

                if len(bibliography) > 0:
                    # Create Metadata link annotation
                    link = document.newAccList('citation_list')
                    link['property:list_name'] = 'Bibliography'
                    document.addAnnotations(bibliography, link['scratch'])

                if len(rt) > 0:
                    references = spineapi.Annotation()
                    references['displayBibliography'] = rt
                    references['concept'] = 'BibliographyMetadata'
                    references['property:identifier'] = '#bibliography'
                    references['property:name'] = 'Bibliography'
                    references['displayName'] = 'Bibliography'
                    references['displayRelevance'] = '800'
                    # Copy source presentation properties from the chosen list
                    if accListLink is not None:
                        for i in ('sourceIcon', 'sourceTitle',
                                  'sourceDescription', 'sourceDatabase'):
                            k = 'property:{0}'.format(i)
                            if k in accListLink:
                                references[k] = accListLink[k]
                        references[
                            'property:description'] = 'From ' + accListLink[
                                'property:sourceTitle']
                    document.addAnnotation(references)
                # Only the first (highest-ranked) matching list is used
                break
        if source is None:
            print 'No metadata found'

        # Find highest matching metadata accumulation list for in-text citations
        for accListLink in document.getAccLists('metadata'):
            matches = document.annotationsIf({'concept': 'ForwardCitation'},
                                             accListLink['scratch'])
            if len(matches) > 0:
                print 'Selected for [ForwardCitation] list %s with rank %s' % (
                    accListLink['scratch'], repr(accListLink.get('rank', 0)))
                document.addAnnotations(matches)
                break

        # Find highest matching metadata accumulation list for tables
        for accListLink in document.getAccLists('metadata'):
            matches = document.annotationsIf({'concept': 'Table'},
                                             accListLink['scratch'])
            if len(matches) > 0:
                print 'Selected for [Table] list %s with rank %s' % (
                    accListLink['scratch'], repr(accListLink.get('rank', 0)))
                document.addAnnotations(matches)
                break

        # Promote the (last found) DocumentMetadata annotation from the
        # selected source list so it shows up as 'Document Information'
        metadata = None
        if source is not None:
            for annotation in document.annotations(source['scratch']):
                if annotation.get('concept') == 'DocumentMetadata':
                    metadata = annotation
            if metadata:
                metadata['displayName'] = 'Document Information'
                metadata['displayRelevance'] = '1000'
                document.addAnnotation(metadata, 'Document Metadata')
# Example #45
# 0
    def on_ready_event(self, document):
        '''
        Find GEO accession numbers (GSE...) in the document text and annotate
        each occurrence with a formatted description of the GEO entry.  When
        the entry's PubMed ID matches this document's, additionally add a
        global annotation marking the data as associated with the article.
        '''
        # Find distinguishing ID
        pmid = utopia.tools.utils.metadata(document, 'identifiers[pubmed]')

        # Compile distinct GEO IDs in the text, keeping every match extent
        matches = {}
        for match in document.search(
                r'GSE\d+', spineapi.IgnoreCase + spineapi.WholeWordsOnly +
                spineapi.RegExp):
            matches.setdefault(match.text(), [])
            matches[match.text()].append(match)

        # Create annotations for each GEO ID
        for gse, extents in matches.iteritems():
            entry = self._fetchGEO(gse)

            # (Removed a leftover debug print of the whole GEO entry here)

            dates = u'Submitted {0}'.format(entry['submission_date'])
            if 'last_update_date' in entry:
                dates += u'; last updated {0}'.format(
                    entry['last_update_date'])
            dates += '.'

            dataCitation = u'''<p>{0}. <strong>{1}</strong>.</p><p>{2}<br>({3})</p><p>{4}</p>'''.format(
                entry['contributors'], entry['title'],
                entry.get('overall_design', ''), entry['type'], dates)

            # {{0}} leaves a placeholder that is later filled with a summary
            # (or left blank for the global annotation)
            xhtml = u'<div class="box">{0}{{0}}<p>GEO Accession: <a href="{1}">{2}</a></p></div>'.format(
                dataCitation, entry['GEO_url'], gse)

            # Removed broken InSilicoDB link
            #xhtml += u'<p><a href="{0}">Explore in InSilico DB...</a></p>'.format(entry['InSilicoDB_url'])

            srcdesc = '''<p>The <a href="http://www.ncbi.nlm.nih.gov/geo">Gene
                       Expression Omnibus (GEO)</a> is a public repository
                       that archives and freely distributes microarray,
                       next-generation sequencing, and other forms of
                       high-throughput functional genomic data submitted
                       by the scientific community.</p>'''

            if entry.get('pubmed_id') == pmid:
                # add a global annotation
                annotation = spineapi.Annotation()
                annotation['concept'] = 'GEO'
                annotation['property:name'] = 'Gene Expression Omnibus'
                annotation['property:sourceDatabase'] = 'geo'
                annotation[
                    'property:description'] = '{0} (Data associated with this article)'.format(
                        gse)
                annotation['property:sourceDescription'] = srcdesc
                annotation['property:xhtml'] = xhtml.format(
                    '')  # Keep summary blank
                document.addAnnotation(annotation)

            # Generate summary: first 32 words shown, the rest behind a
            # "read more" span
            summary = entry.get('summary', '')
            if len(summary) > 0:
                summary_words = summary.split(' ')
                summary = u'<p><em>Summary:</em> '
                summary += u'{0}'.format(' '.join(summary_words[:32]))
                if len(summary_words) > 32:
                    summary += u' <span class="readmore">{0}</span>'.format(
                        ' '.join(summary_words[32:]))
                summary += u'</p>'

            # local annotation attached to every textual occurrence of the ID
            annotation = spineapi.Annotation()
            annotation['concept'] = 'GEO'
            annotation['property:name'] = 'Gene Expression Omnibus'
            annotation['property:sourceDatabase'] = 'geo'
            annotation['property:description'] = gse
            annotation['property:sourceDescription'] = srcdesc
            annotation['property:xhtml'] = xhtml.format(summary)
            for extent in extents:
                annotation.addExtent(extent)
            document.addAnnotation(annotation)
    def on_ready_event(self, document):
        volume, page = None, None

        # Only send if the DOI has a Portland prefix
        doi = common.utils.metadata(document, 'doi')
        if doi is not None and doi[:7] in registrants:
            crossref_unixref = common.utils.metadata(document, 'raw_crossref_unixref')
            if crossref_unixref is not None:
                # Parse CrossRef redirect URL
                dom = etree.fromstring(crossref_unixref.encode('utf8'))
                resource = dom.findtext('doi_record/crossref/journal/journal_article/doi_data/resource')
                if resource is not None:
                    match = self.resourceRegExp.match(resource)
                    if match is not None:
                        volume, page = match.groups()

                ### FIXME What information should be shown? Portland? BJ?
                #annotation = spineapi.Annotation()
                #annotation['concept'] = 'PublisherIdentity'
                #annotation['property:logo'] = utopia.get_plugin_data_as_url('images/logo.png', 'image/png')
                #annotation['property:title'] = 'Portland Press Limited'
                #annotation['property:webpageUrl'] = 'http://www.portlandpress.com/'
                #document.addAnnotation(annotation, 'PublisherMetadata')

        # If this document was resolved, off we go to fetch the NLM
        if None not in (volume, page):
            # Make a request to the utopia ext web service
            url = 'https://utopia.cs.manchester.ac.uk/ext/portland/nlm?{0}'
            url = url.format(urllib.urlencode({'volume': volume, 'page': page}))
            try:
                nlm = urllib2.urlopen(url, timeout=8).read()
            except:
                raise
                return

            info = common.nlm.parse(nlm)
            if info is not None and len(info) > 0:

                # Enrich citation information with identifiers from PMC
                parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8')
                pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation))
                if len(pmids) > 0:
                    pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                    for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                        #print etree.tostring(idList)
                        pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                        if pmid in pmids:
                            citation = info['citations_by_id'][pmids[pmid]]
                            for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                                id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                                if key_name not in citation and id is not None:
                                    citation[key_name] = id
                                    #print 'KEY', key_name, id

                # Create Metadata link annotation
                link = document.newAccList('metadata', 100)
                link['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/biochemj.png', 'image/png')
                link['property:sourceTitle'] = 'Portland'
                link['property:sourceDescription'] = '''
                    <p><a href="http://www.portlandpress.com/">Portland Press Limited</a>.</p>
                    '''

                # Create Metadata annotation
                annotation = spineapi.Annotation()
                annotation['concept'] = 'DocumentMetadata'
                for k in self.keys:
                    v = info.get(k)
                    if v is not None:
                        annotation['property:{0}'.format(k)] = v
                document.addAnnotation(annotation, link['scratch'])

                # Create Bibliography annotations
                for citation in info.get('citations', []):
                    annotation = spineapi.Annotation()
                    annotation['concept'] = 'DocumentReference'
                    for k in self.keys:
                        v = citation.get(k)
                        if v is not None:
                            annotation['property:{0}'.format(k)] = v
                    document.addAnnotation(annotation, link['scratch'])

                #######################################################################################
                # Apply parsed data to document

                # Citations
                for citation in info['citations']:
                    # Find cross refs
                    for pre, label, post in citation.get('contexts', []):
                        matches = document.findInContext(pre, label, post)
                        #print matches
                        if len(matches) > 0:
                            try:
                                annotation = spineapi.Annotation()
                                annotation['concept'] = 'ForwardCitation'
                                annotation['property:state'] = 'found'
                                if 'title' in citation:
                                    annotation['property:title'] = citation['title']
                                if 'id' in citation:
                                    annotation['property:bibid'] = citation['id']
                                if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                    citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                                if 'pmcid' in citation:
                                    citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                                for k in ('displayText', 'label', 'pdf', 'pmid', 'pmc', 'pii', 'doi', 'first_author_surname', 'year', 'journal', 'volume', 'page_from'):
                                    if k in citation:
                                        annotation['property:{0}'.format(k)] = citation[k]
                                #print annotation.get('property:label'), annotation.get('property:pdf')
                                for match in matches:
                                    annotation.addExtent(match)
                                document.addAnnotation(annotation, link['scratch'])
                                #print citation
                            except:
                                raise
                                pass # FIXME

                for id, table in info.get('tables', {}).iteritems():
                    if 'caption' in table and 'xml' in table:
                        regex = fuzz(table['caption'], strict = True)
                        #print regex
                        matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) == 1:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Table'
                            annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                            annotation['session:volatile'] = '1'
                            annotation.addExtent(matches[0])
                            document.addAnnotation(annotation, link['scratch'])
                        else:
                            print '*********** failed to match table:', id
# Example #47
# 0
    def on_ready_event(self, document):
        '''
        Parse the document's raw PMC NLM metadata and attach document
        metadata, bibliography, in-text citation and table annotations
        to a new PubMed Central accumulation list.
        '''
        info = utopialib.nlm.parse(
            utopialib.utils.metadata(document, 'raw_pmc_nlm'))
        if info is not None and len(info) > 0:

            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True,
                                     recover=True,
                                     remove_blank_text=True,
                                     encoding='utf8')
            # Map PubMed IDs back to citation IDs so efetch results can be
            # matched up with the parsed citations below
            pmids = dict(((citation['pmid'], citation['id'])
                          for citation in info['citations']
                          if 'pmid' in citation and 'id' in citation))
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(
                    utopialib.eutils.efetch(id=','.join(pmids.keys()),
                                            retmode='xml',
                                            rettype='abstract'), parser)
                for idList in pubmed_abstracts.xpath(
                        'PubmedArticle/PubmedData/ArticleIdList'):
                    #print etree.tostring(idList)
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info['citations_by_id'][pmids[pmid]]
                        # Copy any DOI / PMC ID / PII onto the citation,
                        # without overwriting identifiers already present
                        for key_name, id_name in (('doi', 'doi'),
                                                  ('pmcid', 'pmc'), ('pii',
                                                                     'pii')):
                            id = idList.findtext(
                                'ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id

            # Create Metadata link annotation
            link = document.newAccList('metadata', 50)
            link['property:sourceDatabase'] = 'pmc'
            link['property:sourceTitle'] = 'PubMed Central'
            link[
                'property:sourceDescription'] = '<p><a href="http://www.ncbi.nlm.nih.gov/pmc/">PubMed Central</a> is the U.S. National Institutes of Health (NIH) digital archive of biomedical and life sciences journal literature.</p>'

            # Create Metadata annotation
            annotation = utopialib.utils.citation_to_annotation(
                info.get('self', {}), 'DocumentMetadata')
            document.addAnnotation(annotation, link['scratch'])

            # Create Bibliography annotations
            for citation in info.get('citations', []):
                annotation = utopialib.utils.citation_to_annotation(citation)
                document.addAnnotation(annotation, link['scratch'])

            # Citations: link each parsed citation context back to its
            # occurrence(s) in the document text
            for citation in info['citations']:
                # Find cross refs
                for pre, label, post in citation.get('contexts', []):
                    matches = document.findInContext(pre, label, post)
                    #print matches
                    if len(matches) > 0:
                        # NOTE(review): this try/except only re-raises, so it
                        # has no effect at runtime — consider removing it or
                        # handling a specific exception here.
                        try:
                            annotation = utopialib.utils.citation_to_annotation(
                                citation, concept='ForwardCitation')
                            # Known full-text PDF locations for PLoS and PMC
                            if 'doi' in citation and citation[
                                    'doi'].startswith('10.1371/'):
                                citation[
                                    'pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format(
                                        'info:doi/{0}'.format(citation['doi']))
                            if 'pmcid' in citation:
                                citation[
                                    'pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(
                                        citation['pmcid'])
                            for match in matches:
                                annotation.addExtent(match)
                            document.addAnnotation(annotation, link['scratch'])
                        except:
                            raise

            # Tables: match each parsed table caption against the text and
            # attach the table XML where exactly one match is found
            for id, table in info.get('tables', {}).iteritems():
                if 'caption' in table and 'xml' in table:
                    regex = fuzz(table['caption'], strict=True)
                    matches = document.search(
                        regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation['concept'] = 'Table'
                        annotation[
                            'session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(
                                table['xml'])
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link['scratch'])
                    else:
                        print '*********** failed to match table:', id
    def on_ready_event(self, document):
        volume, page = None, None

        # Only send if the DOI has a Portland prefix
        doi = utopialib.utils.metadata(document, 'identifiers[doi]')
        if doi is not None and doi[:7] in registrants:
            crossref_unixref = utopialib.utils.metadata(
                document, 'raw_crossref_unixref')
            if crossref_unixref is not None:
                # Parse CrossRef redirect URL
                dom = etree.fromstring(crossref_unixref.encode('utf8'))
                resource = dom.findtext(
                    'doi_record/crossref/journal/journal_article/doi_data/resource'
                )
                if resource is not None:
                    match = self.resourceRegExp.match(resource)
                    if match is not None:
                        volume, page = match.groups()

                ### FIXME What information should be shown? Portland? BJ?
                #annotation = spineapi.Annotation()
                #annotation['concept'] = 'PublisherIdentity'
                #annotation['property:logo'] = utopia.get_plugin_data_as_url('images/logo.png', 'image/png')
                #annotation['property:title'] = 'Portland Press Limited'
                #annotation['property:webpageUrl'] = 'http://www.portlandpress.com/'
                #document.addAnnotation(annotation, 'PublisherMetadata')

        # If this document was resolved, off we go to fetch the NLM
        if None not in (volume, page):
            # Make a request to the utopia ext web service
            url = 'https://utopia.cs.manchester.ac.uk/ext/portland/nlm?{0}'
            url = url.format(urllib.urlencode({
                'volume': volume,
                'page': page
            }))
            try:
                nlm = urllib2.urlopen(url, timeout=8).read()
            except:
                raise
                return

            info = utopialib.nlm.parse(nlm)
            if info is not None and len(info) > 0:

                # Enrich citation information with identifiers from PMC
                parser = etree.XMLParser(ns_clean=True,
                                         recover=True,
                                         remove_blank_text=True,
                                         encoding='utf8')
                pmids = dict(((citation['pmid'], citation['id'])
                              for citation in info['citations']
                              if 'pmid' in citation and 'id' in citation))
                if len(pmids) > 0:
                    pubmed_abstracts = etree.fromstring(
                        utopialib.eutils.efetch(id=','.join(pmids.keys()),
                                                retmode='xml',
                                                rettype='abstract'), parser)
                    for idList in pubmed_abstracts.xpath(
                            'PubmedArticle/PubmedData/ArticleIdList'):
                        #print etree.tostring(idList)
                        pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                        if pmid in pmids:
                            citation = info['citations_by_id'][pmids[pmid]]
                            for key_name, id_name in (('doi', 'doi'), ('pmcid',
                                                                       'pmc'),
                                                      ('pii', 'pii')):
                                id = idList.findtext(
                                    'ArticleId[@IdType="{0}"]'.format(id_name))
                                if key_name not in citation and id is not None:
                                    citation[key_name] = id
                                    #print 'KEY', key_name, id

                # Create Metadata link annotation
                link = document.newAccList('metadata', 100)
                link['property:sourceIcon'] = utopia.get_plugin_data_as_url(
                    'images/biochemj.png', 'image/png')
                link['property:sourceTitle'] = 'Portland'
                link['property:sourceDescription'] = '''
                    <p><a href="http://www.portlandpress.com/">Portland Press Limited</a>.</p>
                    '''

                # Create Metadata annotation
                annotation = utopialib.utils.citation_to_annotation(
                    info.get('self', {}), 'DocumentMetadata')
                document.addAnnotation(annotation, link['scratch'])

                # Create Bibliography annotations
                for citation in info.get('citations', []):
                    annotation = utopialib.utils.citation_to_annotation(
                        citation)
                    document.addAnnotation(annotation, link['scratch'])

                #######################################################################################
                # Apply parsed data to document

                # Citations
                for citation in info['citations']:
                    # Find cross refs
                    for pre, label, post in citation.get('contexts', []):
                        matches = document.findInContext(pre, label, post)
                        #print matches
                        if len(matches) > 0:
                            try:
                                annotation = utopialib.utils.citation_to_annotation(
                                    citation, concept='ForwardCitation')
                                if 'doi' in citation and citation[
                                        'doi'].startswith('10.1371/'):
                                    citation[
                                        'pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format(
                                            'info:doi/{0}'.format(
                                                citation['doi']))
                                if 'pmcid' in citation:
                                    citation[
                                        'pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(
                                            citation['pmcid'])
                                for match in matches:
                                    annotation.addExtent(match)
                                document.addAnnotation(annotation,
                                                       link['scratch'])
                            except:
                                raise

                for id, table in info.get('tables', {}).iteritems():
                    if 'caption' in table and 'xml' in table:
                        regex = fuzz(table['caption'], strict=True)
                        #print regex
                        matches = document.search(
                            regex, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) == 1:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Table'
                            annotation[
                                'session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(
                                    table['xml'])
                            annotation['session:volatile'] = '1'
                            annotation.addExtent(matches[0])
                            document.addAnnotation(annotation, link['scratch'])
                        else:
                            print '*********** failed to match table:', id
    def on_ready_event(self, document):
        info = common.nlm.parse(common.utils.metadata(document, "raw_pmc_nlm"))
        if info is not None and len(info) > 0:

            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding="utf8")
            pmids = dict(
                (
                    (citation["pmid"], citation["id"])
                    for citation in info["citations"]
                    if "pmid" in citation and "id" in citation
                )
            )
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(
                    common.eutils.efetch(id=",".join(pmids.keys()), retmode="xml", rettype="abstract"), parser
                )
                for idList in pubmed_abstracts.xpath("PubmedArticle/PubmedData/ArticleIdList"):
                    # print etree.tostring(idList)
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info["citations_by_id"][pmids[pmid]]
                        for key_name, id_name in (("doi", "doi"), ("pmcid", "pmc"), ("pii", "pii")):
                            id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id

            # Create Metadata link annotation
            link = document.newAccList("metadata", 50)
            link["property:sourceDatabase"] = "pmc"
            link["property:sourceTitle"] = "PubMed Central"
            link[
                "property:sourceDescription"
            ] = '<p><a href="http://www.ncbi.nlm.nih.gov/pmc/">PubMed Central</a> is the U.S. National Institutes of Health (NIH) digital archive of biomedical and life sciences journal literature.</p>'

            # Create Metadata annotation
            annotation = spineapi.Annotation()
            annotation["concept"] = "DocumentMetadata"
            for k in self.keys:
                v = info.get(k)
                if v is not None:
                    annotation["property:{0}".format(k)] = v
            document.addAnnotation(annotation, link["scratch"])

            # Create Bibliography annotations
            for citation in info.get("citations", []):
                annotation = spineapi.Annotation()
                annotation["concept"] = "DocumentReference"
                for k in self.keys:
                    v = citation.get(k)
                    if v is not None:
                        annotation["property:{0}".format(k)] = v
                document.addAnnotation(annotation, link["scratch"])

            # Citations
            for citation in info["citations"]:
                # Find cross refs
                for pre, label, post in citation.get("contexts", []):
                    matches = document.findInContext(pre, label, post)
                    # print matches
                    if len(matches) > 0:
                        try:
                            annotation = spineapi.Annotation()
                            annotation["concept"] = "ForwardCitation"
                            annotation["property:state"] = "found"
                            if "title" in citation:
                                annotation["property:title"] = citation["title"]
                            if "id" in citation:
                                annotation["property:bibid"] = citation["id"]
                            if "doi" in citation and citation["doi"].startswith("10.1371/"):
                                citation[
                                    "pdf"
                                ] = "http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF".format(
                                    "info:doi/{0}".format(citation["doi"])
                                )
                            if "pmcid" in citation:
                                citation["pdf"] = "http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/".format(
                                    citation["pmcid"]
                                )
                            # print citation
                            for k in self.keys + ("authors", "pdf", "first_author_surname"):
                                if k in citation:
                                    annotation["property:{0}".format(k)] = citation[k]
                            # print annotation.get('property:label'), annotation.get('property:pdf')
                            for match in matches:
                                annotation.addExtent(match)
                            document.addAnnotation(annotation, link["scratch"])
                            # print citation
                        except:
                            raise
                            pass  # FIXME

            # Tables
            for id, table in info.get("tables", {}).iteritems():
                if "caption" in table and "xml" in table:
                    regex = fuzz(table["caption"], strict=True)
                    matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation["concept"] = "Table"
                        annotation[
                            "session:upload_files"
                        ] = "data:application/xml;name=data.xml;base64,%s" % base64.standard_b64encode(table["xml"])
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link["scratch"])
                    else:
                        print "*********** failed to match table:", id
Example #50
0
    def after_ready_event(self, document):
        '''
        Collate the document's identifiers, title and publisher identity into a
        single 'About this article' sidebar annotation.

        BUGFIX: the publisher-icon fragment previously ended with a stray
        `</td>`; once wrapped in its own table cell below, that produced a
        doubled `</td></td>` and invalid HTML.
        '''
        # Known identifier types: display name and an HTML template for the value
        ids = {
            'doi': ('DOI', u'<a href="http://dx.doi.org/{0}">{0}</a>'),
            'issn': ('ISSN', u'<strong>{0}</strong>'),
            'pii': ('PII', u'<strong>{0}</strong>'),
            'pubmed':
            ('Pubmed',
             u'<a href="http://www.ncbi.nlm.nih.gov/pubmed/{0}">{0}</a>'),
            'pmc':
            ('PMC',
             u'<a href="http://www.ncbi.nlm.nih.gov/pmc/articles/{0}">{0}</a>'
             ),
            'arxiv': ('arXiv', u'<a href="http://arxiv.org/abs/{0}">{0}</a>'),
        }
        # Build list of table-row fragments, one per resolved identifier
        fragments = []
        pub_icon = ''
        html = '''
            <style>
              .fancy_quotes {
                position: relative;
              }
              .fancy_quotes:before {
                content: "\\201C";
              }
              .fancy_quotes:after {
                content: "\\201D";
              }
            </style>
        '''

        # `fmt` / `ident` avoid shadowing the builtins `format` and `id`
        for key, (name, fmt) in ids.iteritems():
            ident = utopialib.utils.metadata(document,
                                             'identifiers[{0}]'.format(key))
            if ident is not None:
                fragments.append(
                    u'<td style="text-align: right; opacity: 0.7">{0}:</td><td>{1}</td>'
                    .format(name, fmt.format(ident)))
        issn = utopialib.utils.metadata(document, 'publication-issn')
        if issn is not None:
            fragments.append(
                u'<td style="text-align: right; opacity: 0.7">{0}:</td><td><strong>{1}</strong></td>'
                .format('ISSN', issn))
        # Resolve publisher info: first complete PublisherIdentity wins
        for annotation in document.annotations('PublisherMetadata'):
            if annotation.get('concept') == 'PublisherIdentity':
                logo = annotation.get('property:logo')
                title = annotation.get('property:title')
                webpageUrl = annotation.get('property:webpageUrl')
                if None not in (logo, title, webpageUrl):
                    pub_icon = u'<a href="{0}" title="{2}"><img src="{1}" alt="{2}" /></a>'.format(
                        webpageUrl, logo, title)
                    break
        # Compile fragments: title row (with optional publisher icon) ...
        title = utopialib.utils.metadata(document, 'title')
        if title is not None or len(pub_icon) > 0:
            html += u'<table style="border: none; margin: 0 0 1em 0;">'
            html += u'<tr>'
            if title is not None:
                html += u'<td style="text-align:left; vertical-align: middle;"><strong class="nohyphenate fancy_quotes">{0}</strong></td>'.format(
                    title.strip())
            if len(pub_icon) > 0:
                html += u'<td style="text-align:right; vertical-align: middle; width: 80px;">{0}</td>'.format(
                    pub_icon)
            html += u'</tr>'
            html += u'</table>'
        # ... then the identifier table, only if anything resolved
        if len(fragments) > 0:
            html += u'<div class="box">'
            html += u'<table style="border: none">'
            html += u'<tr>'
            html += u'</tr><tr>'.join(fragments)
            html += u'</tr>'
            html += u'</table>'
            html += u'</div>'

            annotation = spineapi.Annotation()
            annotation['concept'] = 'Collated'
            annotation['property:html'] = html
            annotation['property:name'] = 'About this article'
            annotation['session:weight'] = '1000'
            annotation['session:default'] = '1'
            annotation['session:headless'] = '1'
            document.addAnnotation(annotation)
Example #51
0
    def on_ready_event(self, document):
        '''
        For PLOS articles only: record the PLOS publisher identity and attach
        article level metrics (download counts) fetched from the PLOS ALM API.
        '''
        # Resolve the document's DOI; anything outside the PLOS prefix is ignored
        doi = utopia.tools.utils.metadata(document, 'identifiers[doi]')
        if doi is None or not doi.startswith('10.1371/'):
            return

        # Record the publisher identity information
        identity = spineapi.Annotation()
        identity['concept'] = 'PublisherIdentity'
        identity['property:logo'] = utopia.get_plugin_data_as_url(
            'images/large_logo.jpg', 'image/jpg')
        identity['property:title'] = 'PLOS'
        identity['property:webpageUrl'] = 'http://www.plos.org/'
        document.addAnnotation(identity, 'PublisherMetadata')

        # Query the PLOS ALM API for detailed metrics on this DOI
        params = urllib.urlencode({
            'api_key': self.api_key,
            'info': 'detail',
            'ids': doi,
            'type': 'doi'
        })
        request = urllib2.Request(
            'http://alm.plos.org/api/v5/articles?{0}'.format(params),
            headers={'Accepts': 'application/json'})
        try:
            alm = json.loads(urllib2.urlopen(request, timeout=8).read())
        except urllib2.HTTPError as e:
            # An unknown DOI comes back as 404; treat that as "no metrics"
            if e.code == 404:
                return
            raise

        records = alm.get('data', [])
        if len(records) == 0:
            return

        # Collapse the per-source metrics into a name -> metrics lookup
        metrics = {}
        for source in records[0].get('sources', []):
            metrics[source.get('name')] = source.get('metrics')
        counter = metrics.get('counter', {})
        pmc = metrics.get('pmc', {})

        stats = spineapi.Annotation()
        stats['concept'] = 'PLOSALMRecord'
        stats['property:doi'] = doi
        stats['property:name'] = 'PLOS'
        stats['property:description'] = 'Download statistics'
        stats['property:plos_pdf_views'] = counter.get('pdf') or 0
        stats['property:plos_html_views'] = counter.get('html') or 0
        stats['property:pmc_pdf_views'] = pmc.get('pdf') or 0
        stats['property:pmc_html_views'] = pmc.get('html') or 0
        stats['property:sourceIcon'] = utopia.get_plugin_data_as_url(
            'images/small_logo.png', 'image/png')
        stats['property:sourceDescription'] = '<p><a href="http://www.plos.org/">PLOS</a> article level metrics for downloads.</p>'
        document.addAnnotation(stats)
    def on_ready_event(self, document):
        '''
        Look up publisher-hosted NLM XML for this document's DOI and, if found,
        annotate the document with metadata, a bibliography, in-text citation
        links and extracted tables (converting OASIS exchange tables to HTML).
        '''
        # See if we have any publishers' NLM hosted for this DOI
        doi = common.utils.metadata(document, 'doi')
        #print '----- DOI', doi
        if doi is not None:
            info = None
            try:
                url = 'https://utopia.cs.manchester.ac.uk/ext/hosted/nlm?'
                url += urllib.urlencode({'doi': doi.encode('utf8')})
                nlm = urllib2.urlopen(url, timeout=8).read()
                info = common.nlm.parse(nlm)
            except (urllib2.URLError, socket.timeout):
                # info will remain None; a missing/slow service is best-effort
                pass

            #print info
            if info is not None and len(info) > 0:

                # Enrich citation information with identifiers from PMC.
                # Lenient parser: recover from malformed XML in the response.
                parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8')
                # Map PubMed ID -> internal citation id, for citations carrying both
                pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation))
                if len(pmids) > 0:
                    pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                    for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                        #print etree.tostring(idList)
                        pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                        if pmid in pmids:
                            citation = info['citations_by_id'][pmids[pmid]]
                            # Copy DOI / PMC id / PII from PubMed into the citation,
                            # never overwriting values already present
                            for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                                id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                                if key_name not in citation and id is not None:
                                    citation[key_name] = id
                                    #print 'KEY', key_name, id

                # Generate sensible titles / descriptions / icons?
                journalTitle = info.get('publication-title', '')
                journalTitleSuffix = ''
                publisher = info.get('publisher', 'the publisher')
                if len(journalTitle) > 0:
                    journalTitleSuffix = ' ({0})'.format(journalTitle)

                # Create Metadata link annotation (groups the annotations below)
                link = document.newAccList('metadata', 100)
                link['property:sourceIcon'] = ''
                link['property:sourceTitle'] = publisher
                link['property:sourceDescription'] = '''
                    <p>This information was provided by {0}{1}.</p>
                    '''.format(publisher, journalTitleSuffix)

                # Publisher identity, hard-coded for ASPB journals by DOI prefix
                if doi[:8] in ('10.1104/', '10.1105/'):
                    annotation = spineapi.Annotation()
                    annotation['concept'] = 'PublisherIdentity'
                    logo = utopia.get_plugin_data_as_url('images/aspb_logo.png', 'image/png')
                    webpageUrl = 'http://www.aspb.org/'
                    title = publisher
                    #print '====', publisher, '---', journalTitle, '---', webpageUrl
                    if doi.startswith('10.1104/'):
                        # Plant Physiology
                        logo = utopia.get_plugin_data_as_url('images/pp_logo.png', 'image/png')
                        title = journalTitle
                        webpageUrl = 'http://www.plantphysiol.org/'
                    elif doi.startswith('10.1105/'):
                        # The Plant Cell
                        logo = utopia.get_plugin_data_as_url('images/tpc_logo.png', 'image/png')
                        title = journalTitle
                        webpageUrl = 'http://www.plantcell.org/'

                    annotation['property:logo'] = logo
                    annotation['property:title'] = title
                    annotation['property:webpageUrl'] = webpageUrl
                    document.addAnnotation(annotation, 'PublisherMetadata')

                    link['property:sourceIcon'] = logo
                    link['property:sourceTitle'] = title

                # Create Metadata annotation from the whitelisted keys
                annotation = spineapi.Annotation()
                annotation['concept'] = 'DocumentMetadata'
                for k in self.keys:
                    v = info.get(k)
                    if v is not None:
                        annotation['property:{0}'.format(k)] = v
                document.addAnnotation(annotation, link['scratch'])

                # Create Bibliography annotations, one per parsed citation
                for citation in info.get('citations', []):
                    annotation = spineapi.Annotation()
                    annotation['concept'] = 'DocumentReference'
                    for k in self.keys:
                        v = citation.get(k)
                        if v is not None:
                            annotation['property:{0}'.format(k)] = v
                    document.addAnnotation(annotation, link['scratch'])

                #######################################################################################
                # Apply parsed data to document

                # Citations: link each in-text citation context to its reference
                for citation in info['citations']:
                    # Find cross refs
                    for pre, label, post in citation.get('contexts', []):
                        matches = document.findInContext(pre, label, post)
                        #print matches
                        if len(matches) > 0:
                            try:
                                annotation = spineapi.Annotation()
                                annotation['concept'] = 'ForwardCitation'
                                annotation['property:state'] = 'found'
                                if 'title' in citation:
                                    annotation['property:title'] = citation['title']
                                if 'id' in citation:
                                    annotation['property:bibid'] = citation['id']
                                # Known full-text PDF locations for PLOS and PMC articles
                                if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                    citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                                if 'pmcid' in citation:
                                    citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                                for k in ('displayText', 'label', 'pdf', 'pmid', 'pmc', 'pii', 'doi', 'first_author_surname', 'year', 'journal', 'volume', 'page_from'):
                                    if k in citation:
                                        annotation['property:{0}'.format(k)] = citation[k]
                                #print annotation.get('property:label'), annotation.get('property:pdf')
                                for match in matches:
                                    annotation.addExtent(match)
                                document.addAnnotation(annotation, link['scratch'])
                                #print citation
                            except:
                                raise
                                pass # FIXME

                # Tables: attach table XML wherever its caption matches uniquely
                for id, table in info.get('tables', {}).iteritems():
                    if 'caption' in table and 'xml' in table:
                        regex = fuzz(table['caption'], strict = True)
                        #print regex

                        # convert oasis tables (OASIS exchange model) to plain HTML tags
                        ns = {'oasis': 'http://docs.oasis-open.org/ns/oasis-exchange/table'}
                        xml = etree.fromstring(table['xml'])
                        if xml.tag == '{{{0}}}table'.format(ns['oasis']):
                            for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns):
                                # Map column name -> 1-based column number, for span maths
                                columns = {}
                                for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns):
                                    columns[colspec.get('colname')] = int(colspec.get('colnum'))
                                for section in tgroup.xpath('./oasis:thead|./oasis:tbody', namespaces=ns):
                                    isHead = (section.tag == '{{{0}}}thead'.format(ns['oasis']))
                                    for row in section.xpath('./oasis:row', namespaces=ns):
                                        for entry in row.xpath('./oasis:entry', namespaces=ns):
                                            colname = entry.get('colname')
                                            colst = entry.get('namest')
                                            colend = entry.get('nameend')
                                            # Horizontal span from namest..nameend column names
                                            if colst is not None and colend is not None:
                                                colspan = columns[colend] - columns[colst] + 1
                                            else:
                                                colspan = 1
                                            if colspan > 1:
                                                entry.set('colspan', unicode(colspan))
                                            # Vertical span: morerows is extra rows beyond the first
                                            morerows = entry.get('morerows')
                                            if morerows is not None:
                                                rowspan = int(morerows) + 1
                                            else:
                                                rowspan = 1
                                            if rowspan > 1:
                                                entry.set('rowspan', unicode(rowspan))
                                            entry.tag = 'td'
                                        row.tag = 'tr'
                                    if isHead:
                                        section.tag = 'thead'
                                    else:
                                        section.tag = 'tbody'
                                    # Re-parent the converted section directly under the root,
                                    # flattening away the tgroup wrapper
                                    xml.append(section)
                                xml.tag = 'table'
                                #print etree.tostring(xml, pretty_print=True, encoding='utf8')
                                table['xml'] = etree.tostring(xml, encoding='utf8')

                        matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) == 1:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Table'
                            annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                            annotation['session:volatile'] = '1'
                            annotation.addExtent(matches[0])
                            document.addAnnotation(annotation, link['scratch'])
                        else:
                            print '*********** failed to match table:', id
Example #53
0
    def on_ready_event(self, document):
        '''
        Collate discussions (comments and mentions) about this document into
        sidebar annotations, one per discussion source.

        BUGFIX: the html property was previously assigned `css, html` — a
        (css, html) tuple — instead of the concatenated markup.
        '''
        # Styling for the rendered discussion threads
        css = '''
        <style>
            .discussion .author {
                font-weight: bold;
            }
            .discussion .timestamp {
                font-size: 0.9em;
                font-style: italic;
            }
            .discussion .author a {
                color: black !important;
            }
            .discussion .timestamp a {
                color: inherit !important;
            }
        </style>
        '''

        # Without identifiers there is nothing to look up
        identifiers = utopia.tools.utils.metadata(document, 'identifiers')
        if identifiers is None or len(identifiers) == 0:
            return

        discussions = self._get_discussion(identifiers)

        if discussions is None or not isinstance(discussions, dict):
            return

        for discussion_source in discussions.get('discussions', []):

            comments = []
            mentions = {}

            # Render each comment, oldest first
            for comment in sorted(discussion_source.get('comments', []),
                                  key=lambda c: c['timestamp']):

                comments.append(
                    u'<div class="box commnt limited-height">{}<p></p>{}</div>'
                    .format(self._format_header(comment), comment['content']))

            # De-duplicate mentions by their identifier key
            for mention in discussion_source.get('mentions', []):

                key = self._identifiers_to_key(mention['identifiers'])

                mentions[key] = mention['identifiers']

            # Skip sources with nothing to show
            if not mentions and not comments:
                continue

            html = u'<div class="discussion">'
            a = spineapi.Annotation()

            a['concept'] = 'Collated'
            a['property:name'] = discussion_source['source']['title']
            a['property:sourceDatabase'] = discussion_source['source']['title']
            a['property:sourceDescription'] = '<p>{}</p>'.format(
                discussion_source['source']['description'])
            a['property:sourceIcon'] = discussion_source['source']['icon']

            a['property:description'] = 'Comments related to this article'

            if comments:
                html += u'\n\n'.join(comments)

            if mentions:

                mention_html = u'\n\n'.join([
                    utopia.citation.render(dict(identifiers=citation),
                                           process=True,
                                           links=True)
                    for citation in mentions.itervalues()
                ])

                html += u'<div class="box"><p>This article was mentioned by a comment in:</p>{}</div>\n\n'.format(
                    mention_html)

            html += '</div>'
            # Prepend the stylesheet to the rendered markup
            a['property:html'] = css + html

            document.addAnnotation(a)
    def on_ready_event(self, document):
        '''
        Look up Dryad data packages associated with this document's DOI and,
        if any exist, attach an annotation explaining how to cite both the
        article and its data package.
        '''
        #print "RUNNING DRYAD PLUGIN"
        doi = common.utils.metadata(document, 'doi')

        if doi is not None:

            # see if kend knows about this DOI as a Dryad record
            # NOTE: `ns` is a namespace map defined elsewhere in this module
            response = urllib2.urlopen('https://utopia.cs.man.ac.uk/kend/0.7/define/lookup?database=dryad&term=%s&limit=1000' % doi, timeout=8)
            root = etree.fromstring(response.read())
            dryadShortHandle = root.findtext('kend:group/kend:annotation/kend:properties/property:databaseTerm', namespaces=ns)
            if dryadShortHandle is not None:
                # then we have a dryad short-form doi, so can now safely go to dryad to get the rest

                # Query Dryad's Solr index for datasets referencing this article DOI
                response = urllib2.urlopen('http://datadryad.org/solr/search/select/?q=dc.relation.isreferencedby:%s&fl=dc.identifier,dc.title_ac,dc.identifier.uri,dc.contributor.author,dc.date.issued.year,dc.identifier.citation,dc.description' % doi, timeout=8)
                root = etree.fromstring(response.read())
                #print etree.tostring(root, pretty_print=True, encoding='utf8')
                result = root.find('result')

                if result.attrib['numFound'] != '0':
                    # then we have found some datasets for this article DOI

                    # Fetch the METS metadata for the data package itself
                    packageDetails = urllib2.urlopen('http://datadryad.org/metadata/handle/%s/mets.xml' % dryadShortHandle, timeout=8)
                    root = etree.fromstring(packageDetails.read())
                    #print etree.tostring(root, pretty_print=True, encoding='utf8')

                    # Pick out the package's own DOI from its identifier list
                    identifiers = root.findall('mets:dmdSec/mets:mdWrap/mets:xmlData/dim:dim/dim:field[@element="identifier"]', namespaces=ns)
                    packageDOI = None
                    for identifier in identifiers:
                        if identifier.text.startswith('doi:'):
                            packageDOI = identifier.text[4:]
                            break

                    # Assemble a citation record for the data package
                    contributors = root.findall('mets:dmdSec/mets:mdWrap/mets:xmlData/dim:dim/dim:field[@qualifier="author"]', namespaces=ns)
                    dataCitation = {
                        'year': result.findtext("doc/arr[@name='dc.date.issued.year']/int", namespaces=ns),
                        'title': root.findtext('mets:dmdSec/mets:mdWrap/mets:xmlData/dim:dim/dim:field[@element="title"]', namespaces=ns),
                        'authors': [string.capwords(a.text) for a in contributors],
                        'source': 'Dryad Digital Repository',
                        'doi': packageDOI,
                    }
                    articleCitation = root.findtext('mets:dmdSec/mets:mdWrap/mets:xmlData/dim:dim/dim:field[@qualifier="citation"][@element="identifier"]', namespaces=ns)

                    # Render both citations into the sidebar markup
                    xhtml = '''
                        <p>
                          The data associated with this article are available via Dryad. When using
                          these data, please cite both the article:
                        </p>
                        <div class="box">{0}<br /><a href="http://dx.doi.org/{1}">doi:{1}</a></div>
                        <p>
                          and also the data package:
                        </p>
                        <div class="box">{2}<br /><a href="http://dx.doi.org/{3}">doi:{3}</a></div>
                    '''.format(articleCitation, doi, common.utils.format_citation(dataCitation), dataCitation['doi'])

                    a = spineapi.Annotation()
                    a['concept'] = 'Dryad'
                    a['property:name'] = 'Dryad'
                    a['property:sourceDatabase'] = 'dryad'
                    a['property:sourceDescription'] = '<p><a href="http://datadryad.org/">Dryad</a> is an international repository of data underlying peer-reviewed articles in the basic and applied biosciences.</p>'
                    a['property:description'] = 'Data associated with this article'
                    a['property:xhtml'] = xhtml
                    document.addAnnotation(a)
 def on_filter_event(self, document, data=None):
     '''
     Migrate legacy Definition/DatabaseEntry annotations to their modern form.

     Annotations authored by a specific legacy Utopia account (user 11679)
     that have not yet been migrated (no 'session:legacy' flag) are removed
     and replaced by equivalent volatile annotations whose source database is
     inferred from the identifier URL (PDB, DBpedia/Wikipedia, or the Glick
     glossary). Extents and areas are carried over so highlights survive.
     '''
     for a in document.annotations():
         # Only touch un-migrated annotations from the legacy author account
         if a.get(
                 'author'
         ) == 'http://utopia.cs.manchester.ac.uk/users/11679' and a.get(
                 'concept') in ('Definition', 'DatabaseEntry'
                                ) and 'session:legacy' not in a:
             # Remove the old annotation first; a replacement (if any) is
             # added below based on the identifier's URL prefix
             document.removeAnnotation(a)
             identifier = a.get('property:identifier', '')
             if identifier.startswith('http://bio2rdf.org/pdb:'):
                 # PDB entry
                 a2 = spineapi.Annotation()
                 a2['concept'] = 'DatabaseEntry'
                 a2['author'] = a['author']
                 a2['session:volatile'] = '1'
                 a2['session:legacy'] = '1'
                 a2['property:sourceDatabase'] = 'pdb'
                 a2['property:sourceDescription'] = '<p>The <a href="http://www.rcsb.org/">Protein Data Bank</a> of the Research Collaboratory for Structural Bioinformatics (<a href="http://home.rcsb.org/">RCSB</a>).</p>'
                 a2['property:identifier'] = identifier
                 # PDB codes are the last four characters of the identifier
                 a2['property:description'] = 'PDB entry {0}'.format(
                     identifier[-4:].upper())
                 if 'property:name' in a:
                     # Drop the trailing 11 characters of the legacy name —
                     # presumably a fixed-length suffix; TODO confirm format
                     a2['property:name'] = a['property:name'][:-11]
                 if 'property:imageUrl' in a:
                     a2['property:imageUrl'] = a['property:imageUrl']
                 if 'property:molecularDescription' in a:
                     a2['property:molecularDescription'] = a[
                         'property:molecularDescription']
                 if 'property:webpageUrl' in a:
                     a2['property:webpageUrl'] = a['property:webpageUrl']
                 if 'property:embedded' in a:
                     a2['property:embedded'] = a['property:embedded']
                 # Preserve the on-page highlights of the old annotation
                 for extent in a.extents():
                     a2.addExtent(extent)
                 for area in a.areas():
                     a2.addArea(area)
                 document.addAnnotation(a2)
             if identifier.startswith('http://dbpedia.org/resource/'):
                 # Wikipedia (DBpedia) entry
                 a2 = spineapi.Annotation()
                 a2['concept'] = 'Definition'
                 a2['author'] = a['author']
                 a2['session:volatile'] = '1'
                 a2['session:legacy'] = '1'
                 a2['property:sourceDatabase'] = 'wikipedia'
                 a2['property:sourceDescription'] = '<p>Structured <a href="http://www.wikipedia.org/">Wikipedia</a> information provided by the <a href="http://DBpedia.org/">DBpedia</a> project.</p>'
                 a2['property:description'] = a.get('property:summary',
                                                    'Wikipedia entry')
                 if 'property:name' in a:
                     a2['property:name'] = a['property:name']
                 if 'property:identifier' in a:
                     a2['property:identifier'] = a['property:identifier']
                 if 'property:imageUrl' in a:
                     a2['property:imageUrl'] = a['property:imageUrl']
                 if 'property:summary' in a:
                     a2['property:summary'] = a['property:summary']
                 if 'property:webpageUrl' in a:
                     a2['property:webpageUrl'] = a['property:webpageUrl']
                 # Preserve the on-page highlights of the old annotation
                 for extent in a.extents():
                     a2.addExtent(extent)
                 for area in a.areas():
                     a2.addArea(area)
                 document.addAnnotation(a2)
             if identifier.startswith(
                     'http://www.portlandpress.com/utopia/glick/'):
                 # Glick Glossary entry (original comment wrongly said
                 # "Wikipedia entry")
                 a2 = spineapi.Annotation()
                 a2['concept'] = 'Definition'
                 a2['author'] = a['author']
                 a2['session:volatile'] = '1'
                 a2['session:legacy'] = '1'
                 a2['property:sourceDatabase'] = 'glick'
                 a2['property:sourceDescription'] = '<p>David M. Glick\'s <a href="http://www.portlandpress.com/pp/books/online/glick/search.htm">Glossary of Biochemistry and Molecular Biology</a>.</p><p>Made available by <a href="http://www.portlandpress.com/">Portland Press Limited</a>.</p>'
                 # NOTE: unguarded access — legacy glick annotations are
                 # expected to always carry description and name
                 a2['property:description'] = a[
                     'property:description'] + '<p><em>(Glick Glossary/Portland Press Ltd.)</em></p>'
                 a2['property:name'] = a['property:name']
                 # Preserve the on-page highlights of the old annotation
                 for extent in a.extents():
                     a2.addExtent(extent)
                 for area in a.areas():
                     a2.addArea(area)
                 document.addAnnotation(a2)
    def after_ready_event(self, document):
        '''
        Hyperlink compound and ontology-term mentions in an RSC article.

        Only acts when a previous handler has attached an RSCMetadataLink
        annotation to the document. The article's DOI is looked up via the
        RSC federated search; compound mentions are linked to ChemSpider (or
        an RSC landing page) and ontology terms to the IUPAC Gold Book or the
        RSC ontology pages, as 'Hyperlink' annotations over the matched text.
        '''
        RSC = '{http://www.rsc.org/schema/rscart38}'

        # Get (if present) the RSCMetadataLink annotation
        for annotation in document.annotations():
            if annotation.get('concept') == 'RSCMetadataLink':
                doi = annotation['property:doi'].upper()
                # NOTE(review): the legacy code also read property:rscId and
                # called document.text(), but used neither; both removed

                # Free-text federated search for this DOI
                xmlquery = '<SearchCriteria><SearchTerm><Category>Journal</Category><ContentType>All</ContentType><Criterias><NameValue><Name>FreeText</Name><Value>"%s"</Value></NameValue></Criterias><Source>Utopia</Source></SearchTerm><PageNo>1</PageNo><PageSize>10</PageSize><SortBy>Relevance</SortBy></SearchCriteria>' % doi

                baseurl = 'http://pubs.rsc.org/en/federated/search'
                params = { 'federatedsearchname': 'Utopia',
                           'inputxml': xmlquery }
                url = baseurl + '?%s' % urllib.urlencode(params)
                searchresult = urllib2.urlopen(url, timeout=14).read()
                root = etree.fromstring(searchresult)

                articles = root.findall('./%sarticle' % RSC)

                # The search above can return more than one article, so
                # select out only the one with the correct DOI
                thearticle = None
                articleID = None
                for article in articles:
                    found_doi = article.findtext("./%smetainfo/%smeta[@field='doi']" % (RSC, RSC))
                    if found_doi is None:
                        found_doi = article.findtext("./%sart-admin/%sdoi" % (RSC, RSC))
                    if found_doi is not None and found_doi.upper() == doi:
                        thearticle = article
                        articleIDelem = article.find("./%smetainfo/%smeta[@field='docid']" % (RSC, RSC))
                        if articleIDelem is not None:
                            articleID = articleIDelem.text
                        break

                # if we get back a single valid article...
                if thearticle is not None:
                    compoundsInArticle = []
                    compoundText = {}

                    annotationsInArticle = []
                    annotationText = {}

                    # Collect every compound mentioned in the article body,
                    # remembering the literal text of each mention
                    compnames = thearticle.findall('./%sart-body/%scompname' % (RSC, RSC))
                    for compname in compnames:
                        # This line removes (erroneous?) elements from inside the XML
                        etree.strip_elements(compname, '%scompound' % RSC, with_tail=False)
                        idrefs = compname.attrib['idrefs']
                        compoundsInArticle.append(idrefs)
                        compoundText[idrefs] = etree.tounicode(compname, method='text')

                    # ... and likewise every ontology-term mention
                    annotationnames = thearticle.findall('./%sart-body/%sannref' % (RSC, RSC))
                    for annotationname in annotationnames:
                        # This line removes (erroneous?) elements from inside the XML
                        etree.strip_elements(annotationname, '%sannotation' % RSC, with_tail=False)
                        idrefs = annotationname.attrib['idrefs']
                        annotationsInArticle.append(idrefs)
                        annotationText[idrefs] = etree.tounicode(annotationname, method='text')

                    # Then, for all the compounds that are defined in the
                    # article back matter, hyperlink each body mention
                    compounds = thearticle.findall('./%sart-back/%scompound' % (RSC, RSC))
                    for compound in compounds:
                        compound_id = compound.attrib['id']
                        if compound_id in compoundsInArticle:
                            # If the compound has a CSID, link to ChemSpider;
                            # otherwise use the RSC landing page
                            csid = compound.find("./%slink[@type='CSID']" % RSC)
                            if csid is not None and csid.text is not None:
                                url = 'http://www.chemspider.com/Chemical-Structure.%s.html' % csid.text[5:]
                            else:
                                url = 'http://www.rsc.org/publishing/journals/prospect/cheminfo.asp?XMLID=%s&compoundtext=%s&MSID=%s' % (compound_id[4:], compoundText[compound_id], articleID)

                            options = spineapi.WholeWordsOnly + spineapi.IgnoreCase
                            matches = document.search(compoundText[compound_id], options)
                            hyperlink = spineapi.Annotation()
                            hyperlink['concept'] = 'Hyperlink'
                            hyperlink['property:webpageUrl'] = url
                            for match in matches:
                                hyperlink.addExtent(match)
                            document.addAnnotation(hyperlink)

                    # similarly, for all the ontology annotations
                    annrefs = thearticle.findall('./%sart-back/%sannotation' % (RSC, RSC))
                    for annref in annrefs:
                        annref_id = annref.attrib['id']
                        if annref_id in annotationsInArticle:
                            url = None

                            # get the link id
                            link = annref.findtext('./%slink' % RSC)

                            # 'AU:'-prefixed links live in the IUPAC Gold
                            # Book; everything else gets an RSC ontology page
                            if link is not None:
                                if link[:3] == 'AU:':
                                    url = 'http://goldbook.iupac.org/%s.html' % link[3:]
                                else:
                                    url = 'http://www.rsc.org/publishing/journals/prospect/ontology.asp?id=%s&MSID=%s' % (link, articleID)

                            if url is not None:
                                matches = document.search(annotationText[annref_id], spineapi.IgnoreCase + spineapi.WholeWordsOnly)
                                hyperlink = spineapi.Annotation()
                                hyperlink['concept'] = 'Hyperlink'
                                hyperlink['property:webpageUrl'] = url
                                for match in matches:
                                    hyperlink.addExtent(match)
                                document.addAnnotation(hyperlink)
                break
# Example #57 (score: 0)
    def on_ready_event(self, document):
        '''Fetch information from the Lazarus service'''

        permission = self.get_config('permission', False)
        if permission:
            # If an outline already exists, don't make a new one
            needs_outline = True
            for annotation in document.annotations():
                if annotation.get('concept') == 'OutlineItem':
                    needs_outline = False
                    break

            # The Lazarus server needs to know what this document is
            document_id = utopia.tools.utils.metadata(document,
                                                      'identifiers[utopia]')
            this_doi = utopia.tools.utils.metadata(document,
                                                   'identifiers[doi]')
            if this_doi is not None:
                this_doi = u'doi:' + this_doi

            # Speak to server
            params = {'fingerprint': document.fingerprints()}
            url = '{0}?{1}'.format(laz_docUrl,
                                   urllib.urlencode(params, doseq=True))
            response = urllib2.urlopen(url, timeout=60)
            if response.getcode() == 204:
                request = urllib2.Request(
                    url,
                    data=document.data(),
                    headers={'Content-Type': 'application/pdf'})
                response = urllib2.urlopen(request, timeout=60)
            #response = open('/Users/dave/Desktop/ananiadou_tibtech06.pdf-response.xml', 'r')

            # Create Metadata link annotation
            link = document.newAccList('metadata', 50)
            link['property:sourceDatabase'] = 'lazarus'
            link['property:sourceTitle'] = 'Lazarus'
            link['property:sourceDescription'] = self.sourceDescription
            link['property:sourceIcon'] = utopia.get_plugin_data_as_url(
                'images/lazarus-prefs-logo.png', 'image/png')

            headers = []
            pos = []
            refs = []
            annotations = []
            concepts = {}
            hits = []
            expression_annotations = []
            for kAnnotation in kend.converter.XML.parse(
                    response, kend.model.Document):
                #print kend.converter.XML.serialise(kAnnotation)[0]
                try:
                    annotations.append(
                        utopia.tools.converters.Annotation.kend2spineapi(
                            kAnnotation, document))
                except:
                    pass
            annotations.sort(key=lambda a: int(a.get('structure:order', 0)))
            for sAnnotation in annotations:
                if sAnnotation['concept'] == 'structure_element':
                    role, level = self.getHeaderRole(sAnnotation)
                    if role is not None and needs_outline:
                        while len(pos) < level:
                            pos.append(0)
                        while len(pos) > level:
                            pos.pop()
                        pos[-1] += 1

                        outline = u'.'.join([unicode(i) for i in pos])
                        anchor_name = '#lazarus.outline.{0}'.format(outline)

                        anchor = spineapi.Annotation()
                        anchor['concept'] = 'Anchor'
                        anchor['property:anchor'] = anchor_name
                        anchor.addExtents(sAnnotation.extents())
                        anchor.addAreas(sAnnotation.areas())
                        document.addAnnotation(anchor)

                        header = spineapi.Annotation()
                        header['concept'] = 'OutlineItem'
                        header['property:outlinePosition'] = outline
                        header['property:outlineTitle'] = u' '.join(
                            [e.text() for e in sAnnotation.extents()])
                        header['property:destinationAnchorName'] = anchor_name
                        document.addAnnotation(header)

                        print((u'    ' * level +
                               u'.'.join([unicode(i)
                                          for i in pos]) + u' ' + u' '.join([
                                              e.text()
                                              for e in sAnnotation.extents()
                                          ])).encode('utf8'))
                    elif 'bibitem' in sAnnotation.getAllProperties(
                            'structure:role'):
                        #refs.append(sAnnotation)
                        pass
                elif sAnnotation['concept'] == 'Citation':
                    # Hack to fix a mistake in authors property name
                    if 'property:author' in sAnnotation and not 'property:authors' in sAnnotation:
                        sAnnotation[
                            'property:authors'] = sAnnotation.getAllProperties(
                                'property:author')
                    refs.append(sAnnotation)
                elif sAnnotation['concept'] == 'LazarusConcept':
                    concept_id = sAnnotation.get('property:identifier')
                    if concept_id is not None:
                        sAnnotation['id'] = str(uuid.uuid4())
                        concepts[concept_id] = sAnnotation
                        document.addAnnotation(sAnnotation, 'Lazarus Concept')
                elif sAnnotation['concept'] == 'LazarusConceptHit':
                    hits.append(sAnnotation)
                elif sAnnotation['concept'] == 'LazarusSentenceExpression':
                    expression_annotations.append(sAnnotation)
                else:
                    document.addAnnotation(sAnnotation)

            for ref in refs:
                #print(ref.get('structure:order', '0'))
                pass
            refs = sorted(refs,
                          key=lambda ref: int(ref.get('property:order', '0')))

            for ref in refs:
                #print(ref.get('structure:order', '0'))
                pass
            for ref in refs:
                # Create Bibliography annotations
                #citation = {'unstructured': u' '.join([e.text() for e in ref.extents()])}
                #annotation = utopia.tools.utils.citation_to_annotation(citation)
                #annotation['property:order'] = ref.get('structure:order')
                #annotation.addExtents(ref.extents())
                #annotation.addAreas(ref.areas())
                #document.addAnnotation(annotation, link['scratch'])
                document.addAnnotation(ref, link['scratch'])

            # Now link hits to concepts
            for i, hit in enumerate(hits):
                concept_id = hit.get('property:identifier')
                concept = concepts.get(concept_id)
                if concept is not None:
                    concept_uuid = concept.get('id')
                    hit['property:concept_id'] = concept_uuid

                    identifier = concept.get('property:identifier')
                    name = concept.get('property:name', '???')
                    sources = concept.get('property:externalSources',
                                          'json:[]')
                    if sources.startswith('json:'):
                        sources = json.loads(sources[5:])
                    if 'property:stdInchiKey' in concept:
                        sources.append({
                            'database':
                            ' InchiKey',
                            'identifier':
                            concept['property:stdInchiKey']
                        })
                    if 'property:canonicalSmiles' in concept:
                        sources.append({
                            'database':
                            ' SMILES',
                            'identifier':
                            concept['property:canonicalSmiles']
                        })
                    kind = concept.get('property:kind')
                    kind = self.dbs.get(kind, {}).get('title', kind)
                    links = {}
                    for source in sources:
                        uri = source.get('uri')
                        if 'primary' in source.get('relationship', []):
                            links.setdefault('definition', [])
                            links['definition'].append(u'''
                                <a href="{uri}" title="{uri}">{database}</a>
                            '''.format(**source))
                        elif uri is None:
                            if source.get('database') in (' InchiKey',
                                                          ' SMILES'):
                                links.setdefault('main', [])
                                links['main'].append(u'''
                                    <tr><td>{database}:</td><td>{identifier}</td></tr>
                                '''.format(**source))
                        else:
                            identifier = source.get('identifier')
                            links_category = 'xref'
                            if 'seeAlso' in source.get('relationship',
                                                       []) or uri is None:
                                links_category = 'seeAlso'
                            links.setdefault(links_category, [])
                            if identifier is not None:
                                links[links_category].append(u'''
                                    <a href="{uri}" title="{uri}">{name}...</a> ({identifier})
                                '''.format(**source))
                            else:
                                links[links_category].append(u'''
                                    <a href="{uri}" title="{uri}">{name}...</a>
                                '''.format(**source))

                    style = u'''
                        <style>
                          .lazarus-table tbody {
                            border: none;
                          }
                          .lazarus-table td:first-of-type {
                            text-align: right;
                            font-weight: bold;
                          }
                          .lazarus-table td {
                            vertical-align: top;
                          }
                          .lazarus-table td:first-of-type {
                            white-space: nowrap;
                          }
                          .lazarus-table td:not(:first-of-type) {
                            word-break: break-all;
                          }
                          .lazarus-table tr td {
                            padding-top: 0ex;
                            padding-bottom: 0ex;
                          }
                          .lazarus-table tbody:not(:first-of-type) tr:first-of-type td {
                            padding-top: 1ex;
                          }
                        </style>
                    '''
                    html = u'''
                        <table class="lazarus-table">
                          <tr><td>Name:</td><td>{name}</td></tr>
                    '''.format(**{'name': name})
                    categories = {
                        'xref': 'Related:',
                        'seeAlso': 'See also:',
                        'definition': 'Defined in:'
                    }
                    for links_category in ('main', 'xref', 'seeAlso',
                                           'definition'):
                        links_title = categories.get(links_category)
                        these_links = sorted(
                            list(set(links.get(links_category, []))))
                        if len(these_links) > 0:
                            html += '<tbody>'
                            if links_category != 'main':
                                html += u'<tr><td>{0}</td><td>'.format(
                                    links_title)
                                html += u'<br>'.join(these_links)
                                html += '</td></tr>'
                            else:
                                html += ''.join(these_links)
                            html += '</tbody>'
                    #pprint('------------------------')
                    html += u'''
                        </table>
                    '''
                    #print(html)

                    hasLinks = len(
                        links.get('xref', []) + links.get('seeAlso', [])) > 0

                    ann = spineapi.Annotation()
                    ann['concept'] = 'Collated'
                    ann['property:name'] = u'{0}'.format(name)
                    ann['property:description'] = 'Lazarus Concept'
                    ann['session:semanticTerm'] = name
                    ann['property:html'] = [style, html]
                    ann['property:sourceDescription'] = self.sourceDescription
                    ann['property:sourceIcon'] = utopia.get_plugin_data_as_url(
                        'images/lazarus-prefs-logo.png', 'image/png')
                    ann['session:overlay'] = 'hyperlink'
                    ann['session:color'] = '#880000'
                    count = 0
                    print('====', 7)
                    if 'property:hitFragments' in hit:
                        hitFragments = hit.getAllProperties(
                            'property:hitFragments') or []
                        #pprint(hitFragments)
                        for hitFragment in hitFragments:
                            pre, _, rest = hitFragment.partition('{!')
                            match, _, post = rest.partition('!}')
                            #pprint((pre, match, post))
                            matches = document.findInContext(pre,
                                                             match,
                                                             post,
                                                             fuzzy=True)
                            count += len(matches)
                            ann.addExtents(matches)
                    if hasLinks and count > 0:
                        document.addAnnotation(ann)

            style = u'''
                <style>
                    .lazarus-expression .box {
                        background-color: #FFF0E8;
                        border-color: #EEE0D8;
                    }
                    .lazarus-related {
                        padding-left: 42px;
                        background-image: url(%s);
                        background-repeat: no-repeat;
                        background-position: top left;
                        background-size: 37px 48px;
                        min-height: 53px;
                    }
                    .lazarus-related + .lazarus-related {
                        margin-top: 5px;
                        border-top: 1px dotted #aaa;
                        padding-top: 5px;
                        background-position-y: 5px;
                        min-height: 58px;
                    }
                    .lazarus-sentence {
                        padding-left: 0.5em;
                        color: black;
                    }
                    .lazarus-sentence.negative {
                        border-left: solid 5px #bb0000;
                    }
                    .lazarus-sentence.positive {
                        border-left: solid 5px #008800;
                    }
                    .lazarus-sentence.negative a {
                        color: #bb0000;
                    }
                    .lazarus-sentence.positive a {
                        color: #008800;
                    }
                </style>
            ''' % utopia.get_plugin_data_as_url('images/pdf-page-icon.png',
                                                'image/png')

            expressions = []
            for sAnnotation in expression_annotations:
                exp = sAnnotation.get('property:expressions', 'json:{}')
                if exp.startswith('json:'):
                    exp = json.loads(exp[5:])
                context = sAnnotation.get('property:context')
                if context is not None:
                    if exp.get('negative', False):
                        exp['posneg'] = 'negative'
                    else:
                        exp['posneg'] = 'positive'

                    pprint(context)
                    pprint(exp)

                    matched_context = exp.get('context')
                    matches = []
                    if matched_context is not None:
                        matches = document.search(
                            re.sub(r'\s+', ' ', matched_context))
                        if len(matches) > 0:
                            anchor_id = str(uuid.uuid4())[1:-1]
                            anchor = spineapi.Annotation()
                            anchor['concept'] = 'Anchor'
                            anchor['property:anchor'] = anchor_id
                            anchor.addExtents(matches)
                            document.addAnnotation(anchor)

                            exp.update({
                                'anchor_id': anchor_id,
                                'sentence': context
                            })
                            expressions.append(exp)

            js = u'''
                <script>
                    $(document).on('DOMNodeInserted', function(e) {
                        var element = e.target;
                        $(element).filter('a[target="tab"]').add('a[target="tab"]', element).each(function () {
                            var fragment = $(this).closest('.-papyro-internal-citation').data('citation')['userdef']['first_fragment'];
                            $(this).attr('target', 'pdf; show=highlight; text=[' + encodeURIComponent(fragment) + ']');
                        });
                    });

                    $(function () {
                        var lazarus = {
                            expressions: %s,
                            fingerprints: %s,
                            relUrl: %s
                        };

                        var more_expressions_link = $('#lazarus-expression > p.more').hide();
                        var more_expressions_spinner = $('#lazarus-expression > div.spinner');

                        Spinners.create(more_expressions_spinner);
                        Spinners.play(more_expressions_spinner);

                        var exp_divs = [];
                        var identifiers = [];
                        for (var e = 0; e < lazarus.expressions.length; e++) {
                            var expression = lazarus.expressions[e];
                            var exp_div = $('<div class="box"></div>');
                            exp_div.data('expression', expression);
                            exp_div.hide();
                            exp_divs.push(exp_div);
                            identifiers.push(expression.identifiers);
                        }
                        var params = {
                            fingerprint: lazarus.fingerprints
                        };
                        var url = lazarus.relUrl + '?' + $.param(params, traditional=true);
                        $.ajax({
                            url: url,
                            type: 'POST',
                            dataType: 'json',
                            data: JSON.stringify(identifiers),
                            contentType: "application/json",
                            error: function (xhr, ajaxOptions, thrownError) {
                                console.log(xhr.statusText);
                                console.log(xhr.responseText);
                                console.log(xhr.status);
                                console.log(thrownError);

                                // FIXME do something here
                                Spinners.remove(more_expressions_spinner);
                            },
                            success: function (related) {
                                // Sort related according to the number of articles found
                                related.results.sort(function (l, r) {
                                    var lv = Object.keys(l.related).length;
                                    var rv = Object.keys(r.related).length;
                                    return (lv > rv) ? -1 : (lv < rv) ? 1 : 0;
                                });
                                $.each(related.results, function (idx, result) {
                                    var exp_div = exp_divs[idx];
                                    var expression = exp_div.data('expression');
                                    expression.related = result.related;
                                    delete expression.related[%s];

                                    split = expression.sentence.split(expression.context);
                                    pre = split[0];
                                    pre = pre.replace(/(\w)$/, '$1 ');
                                    pre = pre.replace(/^\s*/, '');
                                    match = expression.context;
                                    post = split[1];
                                    post = post.replace(/^(\w)/, ' $1');
                                    post = post.replace(/\s*$/, '');
                                    expression.pre = pre;
                                    expression.match = match;
                                    expression.post = post;

                                    // Create expression element
                                    exp_div.append('<p class="lazarus-sentence ' + expression.posneg + '">&ldquo;' + expression.pre + '<a target="pdf; show=select; anchor=' + expression.anchor_id + '"><strong>' + expression.match + '</strong></a>' + expression.post + '&rdquo;</p>');
                                    exp_div.data('expression', expression);

                                    $('#lazarus-expression > .content').append(exp_div);

                                    if (Object.keys(expression.related).length > 0) {
                                        var related_div = $('<div class="expandable" title="Related expressions elsewhere"></div>');
                                        var related_div_content = $('<div></div>').appendTo(related_div);
                                        function on_expand() {
                                            related_div.off('papyro:expandable:expand', on_expand);
                                            $.each(expression.related, function (idx, obj) {
                                                fragments = [];
                                                $.each(obj, function (id, obj) {
                                                    fragments.push(obj.context);
                                                });
                                                fragments.join('\\n');
                                                related_div_content.append($('<div class="lazarus-related unprocessed"></div>').append('<p><strong>&ldquo;&hellip;'+fragments+'&hellip;&rdquo;</strong></p>').hide().data('citation', {identifiers:{doi:idx},userdef:{first_fragment:fragments[0]}}));
                                                // .append(utopia.citation.render({identifiers:{doi:idx},first_fragment:fragments[0]}, true, true))
                                            });
                                            expression.related.length = 0; // empty for future

                                            if ($('.lazarus-related.unprocessed', exp_div).length > 0) {
                                                var more = $('<p class="more right"><a class="more">More related articles...</a></p>');
                                                related_div_content.append(more);
                                                function show_five_related(e) {
                                                    e.preventDefault();

                                                    $('.lazarus-related.unprocessed', exp_div).slice(0, 5).each(function (idx, obj) {
                                                        var citation = $(obj).data('citation');
                                                        $(obj).append(utopia.citation.render(citation, true, true));
                                                        $(obj).show().removeClass('unprocessed');
                                                    });
                                                    if ($('.lazarus-related.unprocessed', exp_div).length == 0) {
                                                        more.remove();
                                                    }
                                                }
                                                more.on('click', show_five_related).click();
                                            }
                                        }
                                        related_div.on('papyro:expandable:expand', on_expand);
                                        exp_div.append(related_div);
                                        utopia.processNewContent(related_div);
                                    }
                                });

                                Spinners.remove(more_expressions_spinner);
                                more_expressions_link.show();
                                $('a.more', more_expressions_link).click();
                            }
                        });

                        function append_five(e) {
                            e.preventDefault();

                            // Show the next five
                            $('#lazarus-expression > .content').children().filter(':hidden').slice(0,5).show();

                            // Hide the 'more' link if everything is now visible
                            if ($('#lazarus-expression > .content').children().filter(':hidden').length == 0) {
                                more_expressions_link.hide();
                            }
                        }

                        // Hook up 'more' link
                        $('#lazarus-expression > p.more > a.more').on('click', append_five).click();
                    });
                </script>
            ''' % (json.dumps(expressions), json.dumps(
                document.fingerprints()), json.dumps(laz_docRelUrl),
                   json.dumps(this_doi))
            #print(js.encode('utf8'))

            html = u'''
                <div id="lazarus-expression"><div class="content"></div><div class="spinner"></div><p class="more"><a class="more">More expressions...</a></p></div>
            '''

            if len(expressions) > 0:
                ann = spineapi.Annotation()
                ann['concept'] = 'Collated'
                ann['property:name'] = 'Lazarus Expressions'
                ann['property:description'] = u'Summarizing expression(s)'
                ann['property:html'] = [js, style, html]
                ann['property:sourceDescription'] = self.sourceDescription
                ann['property:sourceIcon'] = utopia.get_plugin_data_as_url(
                    'images/lazarus-prefs-logo.png', 'image/png')
                document.addAnnotation(ann)

        else:  # no permission
            noprompt = self.get_config('noprompt', False)
            if not noprompt:
                annotation = spineapi.Annotation()
                annotation['concept'] = 'Collated'
                params = {
                    'uuid': self.uuid(),
                }
                annotation['property:html'] = utopia.get_plugin_data(
                    'tpl/denied.html').format(**params)
                annotation['property:name'] = 'Lazarus'
                annotation[
                    'property:description'] = 'Lazarus functionality is turned off'
                annotation[
                    'property:sourceDescription'] = self.sourceDescription
                annotation[
                    'property:sourceIcon'] = utopia.get_plugin_data_as_url(
                        'images/lazarus-prefs-logo.png', 'image/png')
                annotation['session:default'] = '1'
                document.addAnnotation(annotation)
    def on_ready_event(self, document):
        '''
        Scrape the publisher's Highwire-hosted full-text HTML page for the
        current document (located via its DOI) and annotate the document with
        the publisher's metadata, bibliography, in-text citation links and
        tables.  Bails out silently unless the DOI resolves to a Highwire page
        that advertises a full-text HTML URL.
        '''

        doi = common.utils.metadata(document, 'doi')
        if doi is not None:
            info = {}

            # Resolve the DOI to find the publisher's website
            response = urllib2.urlopen('http://dx.doi.org/{0}'.format(doi), timeout=8)

            # Parse page to find (if there) the full text URL
            parser = etree.HTMLParser()
            html = etree.parse(response, parser)

            # Only continue if this is a highwire HTML page
            # (Highwire pages carry an 'HW.identifier' meta tag in their head)
            if len(html.xpath("/html/head/meta[@name='HW.identifier']")) == 0:
                return

            # Now make sure we have the full text XHTML
            citation_fulltext_html_url = html.xpath("/html/head/meta[@name='citation_fulltext_html_url']/@content")
            if len(citation_fulltext_html_url) > 0:
                citation_fulltext_html_url = citation_fulltext_html_url[0]

                # Fetch that full text page (if different to the current one)
                if citation_fulltext_html_url != response.geturl():
                    response = urllib2.urlopen(citation_fulltext_html_url, timeout=8)
                    html = etree.parse(response, parser)

                #print etree.tostring(html, pretty_print=True, encoding='utf8')

                # Now parse out the bibliography.
                # info['citations'] keeps document order; info['citations_by_id']
                # indexes the same dicts by their HTML anchor id for the in-text
                # citation pass below.
                info['citations'] = []
                info['citations_by_id'] = {}

                for bibitem in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' ref-list ')]//ol[contains(concat(' ', normalize-space(@class), ' '), ' cit-list ')]/li"):
                    # 'query' is a project helper (defined elsewhere in this
                    # file) — presumably evaluates each XPath relative to
                    # bibitem and returns a dict of the matches; confirm there.
                    citation = query(bibitem, {
                        'id': 'a/@id',
                        'label': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' ref-label ')]/text()",
                        'title': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-article-title ')]/text()",
                        'year': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-date ')]/text()",
                        'publication-title': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-jnl-abbrev ')]/text()",
                        'volume': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-vol ')]/text()",
                        'issue': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-issue ')]/text()",
                        'pagefrom': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-fpage ')]/text()",
                        'pageto': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-lpage ')]/text()",
                        'pmid': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-pmid ')]/text()",
                        'doi': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-doi ')]/text()",
                        'etree': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-metadata ')]",
                    })
                    # Collect authors as 'Surname, Given Names' strings; the
                    # trailing strip guards against empty name parts.
                    authors = []
                    for a in bibitem.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-auth ')]"):
                        surname = a.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-surname ')]/text()")
                        given_names = a.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-given-names ')]/text()")
                        if len(surname) > 0 and len(given_names) > 0:
                            authors.append(u'{0}, {1}'.format(surname[0], given_names[0]).strip(', '))
                    if len(authors) > 0:
                        citation['authors'] = authors
                    # 'contexts' is filled by the in-text citation pass below;
                    # each entry is a (before, label, after) text triple.
                    citation['contexts'] = []
                    citation['displayText'] = common.utils.format_citation(citation)

                    info['citations'].append(citation)
                    info['citations_by_id'][citation['id']] = citation
                    #print citation


                #######################################################################################
                # Parse in-text citations if present
                #
                # For each paragraph we build two parallel stacks:
                #   text_stack — even indices hold plain text runs, odd indices
                #                hold the text of an in-text citation link;
                #   xref_stack — None at even indices; at odd indices, a list of
                #                the citation dicts the link resolves to.
                # Subsequent passes collapse dashes (ranges) and commas (lists)
                # between citations, then extract surrounding context strings.

                min_length = 10   # minimum combined context length worth keeping
                max_length = 20   # characters of context kept on each side
                for paragraph in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' article ')]//p"):
                    text_stack = [paragraph.text or '']
                    xref_stack = [None]
                    for elem in paragraph:
                        if len(elem.xpath('self::a[@class="xref-bibr"]')) > 0:
                            # A citation link: push its text (odd slot) and its
                            # tail text (next even slot).
                            text_stack.append(etree.tostring(elem, method='text', encoding=unicode, with_tail=False))
                            text_stack.append(elem.tail or '')
                            # href is '#<bibitem id>'; strip the leading '#'
                            xref = info['citations_by_id'].get(elem.get('href', '')[1:])
                            if xref is not None:
                                xref_stack += [[xref], None]
                            else:
                                xref_stack += [[], None]
                        elif isinstance(elem, etree._Entity):
                            # 'entities' is presumably a module-level mapping of
                            # entity names to code-point sequences — confirm
                            # where it is defined. elem.text[1:-1] strips the
                            # surrounding '&' and ';'.
                            points = entities.get(elem.text[1:-1])
                            if points is not None:
                                text_stack[-1] += ''.join((unichr(p) for p in points))
                            else:
                                text_stack[-1] += etree.tostring(elem, encoding=unicode)
                        else:
                            if elem.get('position') == 'float':
                                # Floating elements (figures etc.) contribute
                                # only their tail text, not their content.
                                text_stack[-1] += elem.tail or ''
                            else:
                                text_stack[-1] += etree.tostring(elem, method='text', encoding=unicode)
                    # Find and collapse ranges in the text
                    # (walk odd->odd from the end so slice assignments don't
                    # disturb indices still to be visited)
                    for i in xrange(len(xref_stack) - 3, 1, -2):
                        text = text_stack[i].strip()
                        #print len(text), (text in u'-\u2010\u2011\u2012\u2013\u2014\u2015'), u''.join(text_stack[i-1:i+2]).encode('utf8')
                        # if this text is a dash, we need to coalesce the text fragments
                        if len(text) == 1 and text in u'-\u2010\u2011\u2012\u2013\u2014\u2015':
                            text_stack[i-1:i+2] = [u''.join(text_stack[i-1:i+2])]
                            xref_stack[i-1:i+2] = [xref_stack[i-1] + xref_stack[i+1]]
                    #for text in text_stack:
                    #    print text.encode('utf8')
                    # Then make sure we resolve the implied citations
                    # (a collapsed '3-7' pair is expanded to include citations
                    # whose numeric labels fall strictly between the two ends)
                    for i in xrange(1, len(xref_stack), 2):
                        # Get actual cross references
                        xrefs = xref_stack[i]

                        # Expand cross references
                        try:
                            if len(xrefs) == 2:
                                labelfrom = int(xrefs[0].get('label'))
                                labelto = int(xrefs[1].get('label'))
                                candidates = {}
                                midlabels = [unicode(midlabel) for midlabel in xrange(labelfrom+1, labelto)]
                                for candidate in info['citations']:
                                    if candidate.get('label') in midlabels:
                                        candidates[int(candidate.get('label'))] = candidate
                                xrefs[1:-1] = candidates.values()
                        except:
                            # NOTE(review): bare except that immediately
                            # re-raises — effectively a no-op; possibly a
                            # leftover debugging scaffold. Confirm intent.
                            raise
                    # Find and collapse lists in the text
                    # (same backwards odd-index walk as the range collapse)
                    for i in xrange(len(xref_stack) - 3, 1, -2):
                        text = text_stack[i].strip()
                        # if this text is a comma, we need to coalesce the text fragments
                        if len(text) == 1 and text == ',':
                            text_stack[i-1:i+2] = [u''.join(text_stack[i-1:i+2])]
                            xref_stack[i-1:i+2] = [xref_stack[i-1] + xref_stack[i+1]]
                    # Expand citations to include brackets (on both sides)
                    # e.g. '(3)' — move the brackets into the citation label so
                    # the on-page match covers them too.
                    for i in xrange(len(xref_stack) - 2, 0, -2):
                        before = text_stack[i-1].strip()[-1:]
                        text = text_stack[i].strip()
                        after = text_stack[i+1].strip()[:1]
                        # if this text is a comma, we need to coalesce the text fragments
                        #print before.encode('utf'), after.encode('utf')
                        if len(before) > 0 and before in '({[' and len(after) > 0 and after in ')}]':
                            text_stack[i-1] = re.sub(r'[({[](\s*)$', r'\1', text_stack[i-1])
                            text_stack[i+1] = re.sub(r'^(\s*)[)}\]]', r'\1', text_stack[i+1])
                            text_stack[i] = before + text_stack[i] + after
                    #print repr(text_stack)
                    # Record a (before, label, after) context triple on each
                    # cited reference so it can later be located in the PDF.
                    for i in xrange(1, len(xref_stack), 2):
                        # Get context
                        before = u' '.join(text_stack[:i]).strip()
                        label = text_stack[i].strip()
                        after = u' '.join(text_stack[i+1:]).strip()
                        # Strip out extraneous brackets
                        if len(xref_stack[i]) > 1: # Hack to differentiate single / multiple citations
                                                   # as multiple numbers tend not to have spaces between them
                            label = re.sub(ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,])\s?[({[]?', r'\1', label)
                        else:
                            label = re.sub(ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,]\s?)[({[]?', r'\1', label)
                        # Normalise context (collapse whitespace, clamp length)
                        before = re.sub(r'\s+', ' ', before)[-max_length:].strip()
                        label = re.sub(r'\s+', ' ', label)
                        after = re.sub(r'\s+', ' ', after)[:max_length].strip()
                        #print (before.encode('utf8'), label.encode('utf8'), after.encode('utf8'))
                        if len(before + after) > min_length:
                            for xref in xref_stack[i]:
                                xref['contexts'].append((before, label, after))
                        #print xref_stack[i]

                #######################################################################################
                # Parse tables if present
                # Each inline table callout links to a standalone table page;
                # fetch it and keep the table's XML and plain-text caption.

                info['tables'] = {}
                for table_url in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' table-inline ')]/div[contains(concat(' ', normalize-space(@class), ' '), ' callout ')]//li[1]/a/@href"):
                    table_url = urlparse.urljoin(citation_fulltext_html_url, table_url)
                    #print table_url
                    response = urllib2.urlopen(table_url, timeout=8)
                    table_html = etree.parse(response, parser)
                    for table_expansion in table_html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' table-expansion ')]"):
                        id = table_expansion.get('id')
                        table = {}
                        table['xml'] = table_expansion.xpath('.//table[1]')[0]
                        table['caption_raw'] = table_expansion.xpath(".//span[contains(concat(' ', normalize-space(@class), ' '), ' caption-title ')][1]")[0]
                        if 'caption' not in table and 'caption_raw' in table:
                            table['caption'] = table['caption_raw']
                        if 'caption' in table:
                            table['caption'] = re.sub(r'\s+', ' ', etree.tostring(table['caption'], method='text', encoding=unicode).strip())
                        if 'xml' in table: table['xml'] = etree.tostring(table['xml'], encoding='utf8')
                        info['tables'][id] = table

                        #print table









            #print info
            if info is not None and len(info) > 0:
                # Enrich citation information with identifiers from PMC
                # (map PMID -> our citation id so EFetch results can be matched
                # back to the right citation)
                parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8')
                pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation))
                if len(pmids) > 0:
                    pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                    for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                        #print etree.tostring(idList)
                        pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                        if pmid in pmids:
                            citation = info['citations_by_id'][pmids[pmid]]
                            # Copy DOI / PMCID / PII onto the citation, but
                            # never overwrite identifiers we already parsed.
                            for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                                id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                                if key_name not in citation and id is not None:
                                    citation[key_name] = id
                                    #print 'KEY', key_name, id

                # Generate sensible titles / descriptions / icons?
                # NOTE(review): in this code path info only ever gains the keys
                # 'citations', 'citations_by_id' and 'tables', so these two
                # lookups always fall back to their defaults — confirm whether
                # 'publication-title'/'publisher' were meant to be set above.
                journalTitle = info.get('publication-title', '')
                journalTitleSuffix = ''
                publisher = info.get('publisher', 'the publisher')
                if len(journalTitle) > 0:
                    journalTitleSuffix = ' ({0})'.format(journalTitle)

                # Create Metadata link annotation (accumulation list the
                # following annotations are attached to via link['scratch'])
                link = document.newAccList('metadata', 90)
                link['property:sourceIcon'] = ''
                link['property:sourceTitle'] = publisher
                link['property:sourceDescription'] = '''
                    <p>This information was provided by {0}{1}.</p>
                    '''.format(publisher, journalTitleSuffix)

                # Create Metadata annotation
                # (self.keys is presumably the list of metadata property names
                # this plugin exposes — defined elsewhere on the class)
                annotation = spineapi.Annotation()
                annotation['concept'] = 'DocumentMetadata'
                for k in self.keys:
                    v = info.get(k)
                    if v is not None:
                        annotation['property:{0}'.format(k)] = v
                document.addAnnotation(annotation, link['scratch'])

                # Create Bibliography annotations
                for citation in info.get('citations', []):
                    annotation = spineapi.Annotation()
                    annotation['concept'] = 'DocumentReference'
                    for k in self.keys:
                        v = citation.get(k)
                        if v is not None:
                            annotation['property:{0}'.format(k)] = v
                    document.addAnnotation(annotation, link['scratch'])

                #######################################################################################
                # Apply parsed data to document

                # Citations: turn each recorded in-text context into a
                # ForwardCitation annotation anchored to the matching PDF text.
                for citation in info['citations']:
                    # Find cross refs
                    for pre, label, post in citation.get('contexts', []):
                        #print (pre, label, post)
                        matches = document.findInContext(pre, label, post)
                        #print matches
                        if len(matches) > 0:
                            try:
                                annotation = spineapi.Annotation()
                                annotation['concept'] = 'ForwardCitation'
                                annotation['property:state'] = 'found'
                                if 'title' in citation:
                                    annotation['property:title'] = citation['title']
                                if 'id' in citation:
                                    annotation['property:bibid'] = citation['id']
                                # Known full-text PDF locations for PLoS and PMC
                                # (PMC takes precedence when both apply)
                                if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                    citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                                if 'pmcid' in citation:
                                    citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                                for k in ('displayText', 'label', 'pdf', 'pmid', 'pmc', 'pii', 'doi', 'first_author_surname', 'year', 'journal', 'volume', 'page_from'):
                                    if k in citation:
                                        annotation['property:{0}'.format(k)] = citation[k]
                                #print annotation.get('property:label'), annotation.get('property:pdf')
                                for match in matches:
                                    annotation.addExtent(match)
                                document.addAnnotation(annotation, link['scratch'])
                                #print citation
                            except:
                                # NOTE(review): the bare re-raise makes the
                                # following 'pass' unreachable — the intended
                                # best-effort swallow never happens. Confirm
                                # which behavior is wanted.
                                raise
                                pass # FIXME

                # Tables: locate each table's caption in the PDF (fuzzily) and
                # attach the table XML as an uploadable annotation.
                for id, table in info.get('tables', {}).iteritems():
                    if 'caption' in table and 'xml' in table:
                        # 'fuzz' is a project helper (defined elsewhere) —
                        # presumably builds a fuzzy-match regex from the caption.
                        regex = fuzz(table['caption'], strict = True)
                        #print regex

                        # convert oasis tables to plain XHTML <table> markup
                        ns = {'oasis': 'http://docs.oasis-open.org/ns/oasis-exchange/table'}
                        xml = etree.fromstring(table['xml'])
                        if xml.tag == '{{{0}}}table'.format(ns['oasis']):
                            for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns):
                                # Map column names to their 1-based positions so
                                # namest/nameend spans can become colspan counts.
                                columns = {}
                                for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns):
                                    columns[colspec.get('colname')] = int(colspec.get('colnum'))
                                for section in tgroup.xpath('./oasis:thead|./oasis:tbody', namespaces=ns):
                                    isHead = (section.tag == '{{{0}}}thead'.format(ns['oasis']))
                                    for row in section.xpath('./oasis:row', namespaces=ns):
                                        for entry in row.xpath('./oasis:entry', namespaces=ns):
                                            colname = entry.get('colname')
                                            colst = entry.get('namest')
                                            colend = entry.get('nameend')
                                            if colst is not None and colend is not None:
                                                colspan = columns[colend] - columns[colst] + 1
                                            else:
                                                colspan = 1
                                            if colspan > 1:
                                                entry.set('colspan', unicode(colspan))
                                            # morerows=N means the cell spans N
                                            # additional rows below this one
                                            morerows = entry.get('morerows')
                                            if morerows is not None:
                                                rowspan = int(morerows) + 1
                                            else:
                                                rowspan = 1
                                            if rowspan > 1:
                                                entry.set('rowspan', unicode(rowspan))
                                            entry.tag = 'td'
                                        row.tag = 'tr'
                                    if isHead:
                                        section.tag = 'thead'
                                    else:
                                        section.tag = 'tbody'
                                    # re-parent the section directly under the
                                    # (soon to be renamed) table element
                                    xml.append(section)
                                xml.tag = 'table'
                                #print etree.tostring(xml, pretty_print=True, encoding='utf8')
                                table['xml'] = etree.tostring(xml, encoding='utf8')

                        # Only annotate when the caption matches exactly one
                        # place in the document — ambiguity means we skip.
                        matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) == 1:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Table'
                            annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                            annotation['session:volatile'] = '1'
                            annotation.addExtent(matches[0])
                            document.addAnnotation(annotation, link['scratch'])
                        else:
                            print '*********** failed to match table:', id
    def on_ready_event(self, document):

        # Only send if the DOI has a Springer prefix
        doi = utopialib.utils.metadata(document, 'identifiers[doi]')
        if doi is not None and doi[:7] in registrants:

            annotation = spineapi.Annotation()
            annotation['concept'] = 'PublisherIdentity'
            if False and doi.startswith(
                    '10.1186/'):  # This turns out not to be reliable
                annotation['property:logo'] = utopia.get_plugin_data_as_url(
                    'images/gigascience_logo.png', 'image/png')
                annotation['property:title'] = 'Giga Science'
                annotation[
                    'property:webpageUrl'] = 'http://www.gigasciencejournal.com/'
            else:
                annotation['property:logo'] = utopia.get_plugin_data_as_url(
                    'images/logo.png', 'image/png')
                annotation['property:title'] = 'Springer'
                annotation['property:webpageUrl'] = 'http://www.springer.com/'
            document.addAnnotation(annotation, 'PublisherMetadata')

            # Make a request to the utopia ext web service
            url = 'https://utopia.cs.manchester.ac.uk/ext/springer/nlm?{0}'
            url = url.format(urllib.urlencode({'doi': doi}))
            try:
                nlm = urllib2.urlopen(url, timeout=8).read()
            except (urllib2.URLError, socket.timeout):
                return

            info = utopialib.nlm.parse(nlm)
            if info is not None and len(info) > 0:

                # Enrich citation information with identifiers from PMC
                parser = etree.XMLParser(ns_clean=True,
                                         recover=True,
                                         remove_blank_text=True,
                                         encoding='utf8')
                pmids = dict(((citation['pmid'], citation['id'])
                              for citation in info['citations']
                              if 'pmid' in citation and 'id' in citation))
                if len(pmids) > 0:
                    pubmed_abstracts = etree.fromstring(
                        utopialib.eutils.efetch(id=','.join(pmids.keys()),
                                                retmode='xml',
                                                rettype='abstract'), parser)
                    for idList in pubmed_abstracts.xpath(
                            'PubmedArticle/PubmedData/ArticleIdList'):
                        #print etree.tostring(idList)
                        pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                        if pmid in pmids:
                            citation = info['citations_by_id'][pmids[pmid]]
                            for key_name, id_name in (('doi', 'doi'), ('pmcid',
                                                                       'pmc'),
                                                      ('pii', 'pii')):
                                id = idList.findtext(
                                    'ArticleId[@IdType="{0}"]'.format(id_name))
                                if key_name not in citation and id is not None:
                                    citation[key_name] = id
                                    #print 'KEY', key_name, id

                # Create Metadata link annotation
                link = document.newAccList('metadata', 100)
                link['property:sourceIcon'] = utopia.get_plugin_data_as_url(
                    'images/annotation_icon.png', 'image/png')
                link['property:sourceTitle'] = 'Springer'
                link['property:sourceDescription'] = '''
                    <p><a href="http://www.springer.com/">Springer</a> publishing company.</p>
                    '''

                # Create Metadata annotation
                annotation = utopialib.utils.citation_to_annotation(
                    info.get('self', {}), 'DocumentMetadata')
                document.addAnnotation(annotation, link['scratch'])

                # Create Bibliography annotations
                for citation in info.get('citations', []):
                    annotation = utopialib.utils.citation_to_annotation(
                        citation)
                    document.addAnnotation(annotation, link['scratch'])

                #######################################################################################
                # Apply parsed data to document

                # Citations
                for citation in info['citations']:
                    # Find cross refs
                    for pre, label, post in citation.get('contexts', []):
                        matches = document.findInContext(pre, label, post)
                        #print matches
                        if len(matches) > 0:
                            try:
                                annotation = utopialib.utils.citation_to_annotation(
                                    citation, concept='ForwardCitation')
                                if 'doi' in citation and citation[
                                        'doi'].startswith('10.1371/'):
                                    citation[
                                        'pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format(
                                            'info:doi/{0}'.format(
                                                citation['doi']))
                                if 'pmcid' in citation:
                                    citation[
                                        'pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(
                                            citation['pmcid'])
                                for match in matches:
                                    annotation.addExtent(match)
                                document.addAnnotation(annotation,
                                                       link['scratch'])
                            except:
                                raise

                for id, table in info.get('tables', {}).iteritems():
                    if 'caption' in table and 'xml' in table:
                        regex = fuzz(table['caption'], strict=True)
                        print regex
                        matches = document.search(
                            regex, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) == 1:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Table'
                            annotation[
                                'session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(
                                    table['xml'])
                            annotation['session:volatile'] = '1'
                            annotation.addExtent(matches[0])
                            document.addAnnotation(annotation, link['scratch'])
                        else:
                            print '*********** failed to match table:', id
 def on_filter_event(self, document, data = None):
     '''
     Migrate legacy annotations authored by the Utopia service account
     (users/11679) to their modern equivalents. Each matching Definition or
     DatabaseEntry annotation without a 'session:legacy' flag is removed and
     replaced by a rebuilt annotation tagged 'session:legacy' (so it is not
     reprocessed), preserving its extents and areas. Three identifier
     families are handled: PDB entries, DBpedia/Wikipedia entries, and Glick
     glossary entries.
     '''
     for a in document.annotations():
         # Only legacy annotations from the service account, not yet migrated
         if a.get('author') == 'http://utopia.cs.manchester.ac.uk/users/11679' and a.get('concept') in ('Definition', 'DatabaseEntry') and 'session:legacy' not in a:
             document.removeAnnotation(a)
             identifier = a.get('property:identifier', '')
             if identifier.startswith('http://bio2rdf.org/pdb:'):
                 # PDB entry
                 a2 = spineapi.Annotation()
                 a2['concept'] = 'DatabaseEntry'
                 a2['author'] = a['author']
                 a2['session:volatile'] = '1'
                 a2['session:legacy'] = '1'
                 a2['property:sourceDatabase'] = 'pdb'
                 a2['property:sourceDescription'] = '<p>The <a href="http://www.rcsb.org/">Protein Data Bank</a> of the Research Collaboratory for Structural Bioinformatics (<a href="http://home.rcsb.org/">RCSB</a>).</p>'
                 a2['property:identifier'] = identifier
                 # The PDB code is the last four characters of the bio2rdf URI
                 a2['property:description'] = 'PDB entry {0}'.format(identifier[-4:].upper())
                 if 'property:name' in a:
                     # Drops the last 11 characters of the old name —
                     # presumably a fixed suffix; TODO confirm against the
                     # legacy annotation format
                     a2['property:name'] = a['property:name'][:-11]
                 if 'property:imageUrl' in a:
                     a2['property:imageUrl'] = a['property:imageUrl']
                 if 'property:molecularDescription' in a:
                     a2['property:molecularDescription'] = a['property:molecularDescription']
                 if 'property:webpageUrl' in a:
                     a2['property:webpageUrl'] = a['property:webpageUrl']
                 # Carry over where the annotation anchors in the document
                 for extent in a.extents():
                     a2.addExtent(extent)
                 for area in a.areas():
                     a2.addArea(area)
                 document.addAnnotation(a2)
             if identifier.startswith('http://dbpedia.org/resource/'):
                 # Wikipedia entry (via DBpedia)
                 a2 = spineapi.Annotation()
                 a2['concept'] = 'Definition'
                 a2['author'] = a['author']
                 a2['session:volatile'] = '1'
                 a2['session:legacy'] = '1'
                 a2['property:sourceDatabase'] = 'wikipedia'
                 a2['property:sourceDescription'] = '<p>Structured <a href="http://www.wikipedia.org/">Wikipedia</a> information provided by the <a href="http://DBpedia.org/">DBpedia</a> project.</p>'
                 # Fall back to a generic description if no summary was stored
                 a2['property:description'] = a.get('property:summary', 'Wikipedia entry')
                 if 'property:name' in a:
                     a2['property:name'] = a['property:name']
                 if 'property:identifier' in a:
                     a2['property:identifier'] = a['property:identifier']
                 if 'property:imageUrl' in a:
                     a2['property:imageUrl'] = a['property:imageUrl']
                 if 'property:summary' in a:
                     a2['property:summary'] = a['property:summary']
                 if 'property:webpageUrl' in a:
                     a2['property:webpageUrl'] = a['property:webpageUrl']
                 # Carry over where the annotation anchors in the document
                 for extent in a.extents():
                     a2.addExtent(extent)
                 for area in a.areas():
                     a2.addArea(area)
                 document.addAnnotation(a2)
             if identifier.startswith('http://www.portlandpress.com/utopia/glick/'):
                 # Glick glossary entry (Portland Press)
                 a2 = spineapi.Annotation()
                 a2['concept'] = 'Definition'
                 a2['author'] = a['author']
                 a2['session:volatile'] = '1'
                 a2['session:legacy'] = '1'
                 a2['property:sourceDatabase'] = 'glick'
                 a2['property:sourceDescription'] = '<p>David M. Glick\'s <a href="http://www.portlandpress.com/pp/books/online/glick/search.htm">Glossary of Biochemistry and Molecular Biology</a>.</p><p>Made available by <a href="http://www.portlandpress.com/">Portland Press Limited</a>.</p>'
                 # Unlike the branches above, name/description are accessed
                 # unconditionally here — a KeyError would propagate if a
                 # legacy Glick annotation lacked them
                 a2['property:description'] = a['property:description'] + '<p><em>(Glick Glossary/Portland Press Ltd.)</em></p>'
                 a2['property:name'] = a['property:name']
                 # Carry over where the annotation anchors in the document
                 for extent in a.extents():
                     a2.addExtent(extent)
                 for area in a.areas():
                     a2.addArea(area)
                 document.addAnnotation(a2)