def on_activate_event(self, document):
    """Annotate NeuroSynth terms in the document with hyperlinks.

    Scans the document for the term(s) in `regex`, creates one annotation
    per distinct matched text linking to its NeuroSynth feature page,
    attaches every match as an extent, then adds all annotations to the
    document exactly once.
    """
    regex = r'(functional abnormalities)'
    # Scan the document for some regular expression
    matches = document.search(regex,
        spineapi.RegExp + spineapi.WholeWordsOnly)

    to_add = {}  # Maps sanitised match text -> its annotation

    for match in matches:
        # Sanitise matches found in document for dict keys
        match_text = match.text().lower().strip()
        match_text_quoted = urllib.quote(match_text)

        # Has same text already been annotated?
        annotation = to_add.get(match_text, None)

        if annotation is None:
            # If no, create new annotation
            annotation = spineapi.Annotation()
            annotation['concept'] = 'NeuroSynthAnnotation'
            annotation['property:name'] = match_text
            annotation['property:description'] = 'Link to NeuroSynth'
            annotation['property:webpageUrl'] = 'http://beta.neurosynth.org/features/{0}/'.format(match_text_quoted)
            annotation['session:overlay'] = 'hyperlink'
            annotation['session:color'] = '#00AA00' # Green
            to_add[match_text] = annotation

        # Add the match to the annotation, in any case
        # (annotation is always non-None here: it was just created if missing)
        annotation.addExtent(match)

    # BUG FIX: this call was previously inside the loop, re-adding the whole
    # annotation set to the document once per match. Add them exactly once.
    document.addAnnotations(to_add.values())
# Example #2
    def on_activate_event(self, document):
        """Annotate the document with ScienceWISE ontology definitions.

        Sends the full document text (plus an MD5 checksum) to the
        ScienceWISE web service, then turns each returned term into a
        single annotation covering all in-context matches of that term.
        """
        text = document.text().encode('utf8')
        text_hash = hashlib.md5(text).hexdigest()

        url = 'http://beta.sciencewise.info/api/utopia'
        payload = urllib.urlencode({'text': text, 'chksum': text_hash})
        response = urllib2.urlopen(url, payload, timeout=8).read()
        results = json.loads(response)
        annotations = []

        for result in results:
            # Each result carries the term, its textual context and links
            before = result.get('context', {}).get('before', '')
            term = result.get('value', '')
            after = result.get('context', {}).get('after', '')
            link = result.get('link')
            definitions = [(definition.get('url'), definition.get('title'))
                           for definition in result.get('definitions', [])]

            # Only annotate non-empty, linked terms. (The original also
            # summed the context lengths, which is redundant once the term
            # is known to be non-empty.)
            if len(term) > 0 and link is not None:
                matches = document.findInContext(before, term, after)
                if len(matches) > 0:
                    annotation = spineapi.Annotation()
                    annotation['concept'] = 'ScienceWISE'
                    annotation['property:webpageUrl'] = link
                    annotation['property:term'] = term
                    annotation['property:name'] = 'Definitions of {0}'.format(
                        term)
                    annotation[
                        'property:description'] = 'ScienceWISE ontology definitions'
                    annotation['property:sourceDatabase'] = 'sciencewise'
                    # BUG FIX: corrected 'phycists' -> 'physicists' in the
                    # user-visible source description.
                    annotation[
                        'property:sourceDescription'] = '<p><a href="http://sciencewise.info/">ScienceWISE</a> provides physicists with article annotation and scientific bookmarking.</p>'
                    for url, title in definitions:
                        annotation.insertProperty('property:definitions',
                                                  '{0} {1}'.format(url, title))
                    for match in matches:
                        annotation.addExtent(match)
                    annotations.append(annotation)

        if len(annotations) > 0:
            document.addAnnotations(annotations)
    def on_activate_event(self, document):
        """Annotate the document with ScienceWISE ontology definitions.

        Posts the full document text (with an MD5 checksum) to the
        ScienceWISE service and converts each returned term into one
        annotation spanning every in-context occurrence of that term.
        """
        text = document.text().encode('utf8')
        text_hash = hashlib.md5(text).hexdigest()

        url = 'http://beta.sciencewise.info/api/utopia'
        payload = urllib.urlencode({ 'text': text, 'chksum': text_hash })
        response = urllib2.urlopen(url, payload, timeout=8).read()
        results = json.loads(response)
        annotations = []

        for result in results:
            before = result.get('context', {}).get('before', '')
            term = result.get('value', '')
            after = result.get('context', {}).get('after', '')
            link = result.get('link')
            definitions = [(d.get('url'), d.get('title')) for d in result.get('definitions', [])]

            # Only annotate non-empty, linked terms. (The original also summed
            # the context lengths, redundant once the term is non-empty.)
            if len(term) > 0 and link is not None:
                matches = document.findInContext(before, term, after)
                if len(matches) > 0:
                    annotation = spineapi.Annotation()
                    annotation['concept'] = 'ScienceWISE'
                    annotation['property:webpageUrl'] = link
                    annotation['property:term'] = term
                    annotation['property:name'] = 'Definitions of {0}'.format(term)
                    annotation['property:description'] = 'ScienceWISE ontology definitions'
                    annotation['property:sourceDatabase'] = 'sciencewise'
                    # BUG FIX: corrected 'phycists' -> 'physicists' in the
                    # user-visible source description.
                    annotation['property:sourceDescription'] = '<p><a href="http://sciencewise.info/">ScienceWISE</a> provides physicists with article annotation and scientific bookmarking.</p>'
                    for url, title in definitions:
                        annotation.insertProperty('property:definitions', '{0} {1}'.format(url, title))
                    for match in matches:
                        annotation.addExtent(match)
                    annotations.append(annotation)

        if len(annotations) > 0:
            document.addAnnotations(annotations)
    def on_activate_event(self, document):
        """Annotate the document with Reflect (reflect.ws) entity links.

        The document text is batched into fragments of at most
        maxTextFragmentSize characters and sent to the Reflect service;
        each recognised item is then searched for in the document and
        annotated with a hyperlink resolving its entity identifiers.
        """
        ns = {'r': 'Reflect'}

        maxTextFragmentSize = 1000000
        textFragments = []
        seenItemNames = set()
        # Entity types returned by Reflect that are deliberately ignored
        ignoredEntityTypes = [-11]

        # Retrieve the full text of the document, split into fragments;
        # remember the first page of each fragment so the later search can
        # start from it
        for page in document.pages():
            pageText = re.sub(r'\s+', r' ', page.pageText())
            if len(textFragments) == 0 or len(textFragments[-1][0]) + len(
                    pageText) > maxTextFragmentSize:
                textFragments.append([pageText, page])
            else:
                textFragments[-1][0] = textFragments[-1][0] + ' ' + pageText

        for text, page in textFragments:
            # Package it as URL encoded form encoding
            payload = 'document=%s' % urllib.quote(text.encode('utf8'))
            # Send it off to the reflect server
            response = urllib2.urlopen("http://reflect.ws/REST/GetEntities",
                                       payload,
                                       timeout=8)
            # Parse response
            root = etree.fromstring(response.read(), self.parser)

            reflections = {}
            annotations = {}

            for item in root.xpath('//r:item', namespaces=ns):
                nameElement = item.find('{%s}name' % ns['r'])
                # ROBUSTNESS FIX: skip malformed items without a <name>
                # child (etree.tostring(None) would raise)
                if nameElement is None:
                    continue
                itemName = etree.tostring(nameElement,
                                          method="text",
                                          encoding=unicode,
                                          with_tail=False).lower().strip()
                if itemName not in seenItemNames:
                    for entity in item.xpath('.//r:entity', namespaces=ns):
                        entityType = entity.findtext('{%s}type' % ns['r'])
                        if entityType is not None:
                            entityType = int(entityType)
                        if entityType not in ignoredEntityTypes:
                            entityIdentifier = entity.findtext(
                                '{%s}identifier' % ns['r'])
                            reflections.setdefault(itemName, set()).add(
                                (entityType, entityIdentifier))

            # ROBUSTNESS FIX: with no reflections the alternation would be
            # '()', a regex matching the empty string at every position
            if len(reflections) == 0:
                continue

            # For each match, create an annotation that the UI will handle later
            regex = '(%s)' % '|'.join(
                [re.escape(key) for key in reflections.iterkeys()])
            matches = document.search(regex,
                                      IgnoreCase + WholeWordsOnly + RegExp,
                                      start=page)
            for match in matches:
                # Only consider matches that begin on a word boundary
                if match.begin().wordArea()[1] == 0:
                    itemName = match.text().lower().strip()
                    annotation = annotations.get(itemName, None)
                    if annotation is None and itemName in reflections:
                        annotation = Annotation()
                        annotation['concept'] = 'Reflection'
                        annotation['property:webpageUrl'] = \
                            'http://reflect.ws/fcgi-bin/solveAmbig.fcgi?entities=%s' % \
                            ';'.join(['%d.%s' % (t, id) for (t, id) in reflections[itemName]])
                        annotation['property:name'] = itemName
                        annotation['session:overlay'] = 'hyperlink'
                        annotation['session:color'] = '#0A0'
                        annotations[itemName] = annotation
                        seenItemNames.add(itemName)
                    if annotation is not None:
                        annotation.addExtent(match)
                    else:
                        print "ERROR: matched '%s' but could not find in reflections map" % itemName.encode(
                            'utf8')
                        print reflections.keys()

            document.addAnnotations(annotations.values())
# Example #5
    def after_ready_event(self, document):
        """Collate accumulated metadata into displayable annotations.

        For each kind of result (bibliography citations, in-text
        citations, tables, document metadata), selects the first metadata
        accumulation list that yielded matches and publishes those
        annotations on the document.
        """
        print 'Formatting metadata'

        # Find highest matching metadata accumulation list for references
        source = None
        for accListLink in document.getAccLists('metadata'):
            matches = document.annotationsIf({'concept': 'Citation'},
                                             accListLink['scratch'])
            if len(matches) > 0:
                print 'Selected for [Citation] list %s with rank %s' % (
                    accListLink['scratch'], repr(accListLink.get('rank', 0)))
                source = accListLink
                # Render the sorted citations into an HTML bibliography.
                # NOTE(review): sortfn is defined elsewhere in this module.
                bibliography = list(matches)
                bibliography.sort(key=sortfn)
                rt = ''
                for annotation in bibliography:
                    citation = utopia.tools.utils.citation_from_annotation(
                        annotation)
                    rt += utopia.citation.render(citation, links=True)

                if len(bibliography) > 0:
                    # Create Metadata link annotation
                    link = document.newAccList('citation_list')
                    link['property:list_name'] = 'Bibliography'
                    document.addAnnotations(bibliography, link['scratch'])

                if len(rt) > 0:
                    # Wrap the rendered bibliography in a displayable annotation
                    references = spineapi.Annotation()
                    references['displayBibliography'] = rt
                    references['concept'] = 'BibliographyMetadata'
                    references['property:identifier'] = '#bibliography'
                    references['property:name'] = 'Bibliography'
                    references['displayName'] = 'Bibliography'
                    references['displayRelevance'] = '800'
                    if accListLink is not None:
                        # Carry provenance properties over from the source list
                        for i in ('sourceIcon', 'sourceTitle',
                                  'sourceDescription', 'sourceDatabase'):
                            k = 'property:{0}'.format(i)
                            if k in accListLink:
                                references[k] = accListLink[k]
                        references[
                            'property:description'] = 'From ' + accListLink[
                                'property:sourceTitle']
                    document.addAnnotation(references)
                break
        if source is None:
            print 'No metadata found'

        # Find highest matching metadata accumulation list for in-text citations
        for accListLink in document.getAccLists('metadata'):
            matches = document.annotationsIf({'concept': 'ForwardCitation'},
                                             accListLink['scratch'])
            if len(matches) > 0:
                print 'Selected for [ForwardCitation] list %s with rank %s' % (
                    accListLink['scratch'], repr(accListLink.get('rank', 0)))
                document.addAnnotations(matches)
                break

        # Find highest matching metadata accumulation list for tables
        # (comment fixed: this loop handles Table annotations, not citations)
        for accListLink in document.getAccLists('metadata'):
            matches = document.annotationsIf({'concept': 'Table'},
                                             accListLink['scratch'])
            if len(matches) > 0:
                print 'Selected for [Table] list %s with rank %s' % (
                    accListLink['scratch'], repr(accListLink.get('rank', 0)))
                document.addAnnotations(matches)
                break

        # Promote the document metadata annotation (if any) from the chosen
        # source list so the UI displays it prominently
        metadata = None
        if source is not None:
            for annotation in document.annotations(source['scratch']):
                if annotation.get('concept') == 'DocumentMetadata':
                    metadata = annotation
            if metadata:
                metadata['displayName'] = 'Document Information'
                metadata['displayRelevance'] = '1000'
                document.addAnnotation(metadata, 'Document Metadata')
    def on_activate_event(self, document):
        """Annotate the document with Reflect (reflect.ws) entity links.

        Batches the document text into fragments (at most
        maxTextFragmentSize characters each), submits them to the Reflect
        service, then searches the document for every recognised item and
        annotates matches with an entity-resolving hyperlink.
        """
        ns = {'r': 'Reflect'}

        maxTextFragmentSize = 1000000
        textFragments = []
        seenItemNames = set()
        # Entity types returned by Reflect that are deliberately ignored
        ignoredEntityTypes = [-11]

        # Retrieve the full text of the document, split into fragments;
        # keep the first page of each fragment so searching can start there
        for page in document.pages():
            pageText = re.sub(r'\s+', r' ', page.pageText())
            if len(textFragments) == 0 or len(textFragments[-1][0]) + len(pageText) > maxTextFragmentSize:
                textFragments.append([pageText, page])
            else:
                textFragments[-1][0] = textFragments[-1][0] + ' ' + pageText

        for text, page in textFragments:
            # Package it as URL encoded form encoding
            payload = 'document=%s' % urllib.quote(text.encode('utf8'))
            # Send it off to the reflect server
            response = urllib2.urlopen("http://reflect.ws/REST/GetEntities", payload, timeout=8)
            # Parse response
            root = etree.fromstring(response.read(), self.parser)

            reflections = {}
            annotations = {}

            for item in root.xpath('//r:item', namespaces = ns):
                itemName = item.findtext('{%s}name' % ns['r'])
                # ROBUSTNESS FIX: findtext returns None when the <name>
                # child is missing; skip such items instead of crashing
                if itemName is None:
                    continue
                itemName = itemName.lower().strip()
                if itemName not in seenItemNames:
                    for entity in item.xpath('.//r:entity', namespaces = ns):
                        entityType = entity.findtext('{%s}type' % ns['r'])
                        if entityType is not None:
                            entityType = int(entityType)
                        if entityType not in ignoredEntityTypes:
                            entityIdentifier = entity.findtext('{%s}identifier' % ns['r'])
                            reflections.setdefault(itemName, set()).add((entityType, entityIdentifier))

            # ROBUSTNESS FIX: with no reflections the alternation would be
            # '()', a regex matching the empty string at every position
            if len(reflections) == 0:
                continue

            # For each match, create an annotation that the UI will handle later
            regex = '(%s)' % '|'.join([re.escape(key) for key in reflections.iterkeys()])
            matches = document.search(regex, IgnoreCase + WholeWordsOnly + RegExp, start = page)
            for match in matches:
                # Only consider matches beginning on a word boundary
                if match.begin().wordArea()[1] == 0:
                    itemName = match.text().lower().strip()
                    annotation = annotations.get(itemName, None)
                    if annotation is None and itemName in reflections:
                        annotation = Annotation()
                        annotation['concept'] = 'Reflection'
                        annotation['property:webpageUrl'] = \
                            'http://reflect.ws/fcgi-bin/solveAmbig.fcgi?entities=%s' % \
                            ';'.join(['%d.%s' % (t, id) for (t, id) in reflections[itemName]])
                        annotation['property:name'] = itemName
                        annotation['session:overlay'] = 'hyperlink'
                        annotation['session:color'] = '#0A0'
                        annotations[itemName] = annotation
                        seenItemNames.add(itemName)
                    if annotation is not None:
                        annotation.addExtent(match)
                    else:
                        print "ERROR: matched '%s' but could not find in reflections map" % itemName.encode('utf8')
                        print reflections.keys()

            document.addAnnotations(annotations.values())
class _GenericAnnotator(utopia.document.Annotator):
    """Annotates text with urls pointing to a website.

    Derived classes should specialize with the list of terms, and
    generating URL per given term
    """

    _name = None

    def get_terms(self):
        raise NotImplementedError

    def get_url(self, term):
        raise NotImplementedError

    def get_terms_regex(self):
        terms = self.get_terms()
        _debug("Got %d terms" % len(terms))
        r = r'(%s)' % '|'.join(terms)
        _debug("IS IT THERE: %s" % ("functional abnormalities" in r))
        _debug(r)
        return r  # re.compile("functional abnormalities")
        #return re.compile("(functional abnormalities|structural abnormalities|temporoparietal junction|ventrolateral prefrontal)")

    @utopia.document.buffer  # Wrap/buffer the function
    #def on_ready_event(self, document):
    def on_activate_event(self, document):
        _debug('activate base')
        # Scan the document for some regular expression
        matches = document.search(
            self.get_terms_regex(),
            spineapi.RegExp + spineapi.WholeWordsOnly + spineapi.IgnoreCase)

        to_add = {}  # Dictionary of annotations to add

        try:
            for match in matches:
                _debug("Match %s" % str(match))
                # Sanitise matches found in document for dict keys
                match_text = match.text().lower().strip()

                # Has same text already been annotated?
                annotation = to_add.get(match_text, None)

                if annotation is None:
                    # If no, create new annotation
                    annotation = spineapi.Annotation()
                    annotation['concept'] = 'Annotation %s' % self._name
                    annotation['property:name'] = match_text
                    annotation[
                        'property:description'] = 'Link to %s' % self._name
                    annotation['property:webpageUrl'] = self.get_url(
                        match_text)
                    annotation['session:overlay'] = 'hyperlink'
                    annotation['session:color'] = '#00AA00'  # Green
                    to_add[match_text] = annotation

                if annotation is not None:
                    # Add the match to the annotation, in any case
                    _debug("Added %s" % str(annotation))
                    annotation.addExtent(match)
        except Exception, e:
            _debug("ERROR: %s" % str(e))

        # Finally, add the annotations to the document
        document.addAnnotations(to_add.values())
        _debug('finished activate base')