def on_activate_event(self, document):
    """Annotate NeuroSynth terms in the document with hyperlinks.

    Searches the document for the term regex (whole words only) and
    builds one hyperlink annotation per distinct matched text, pointing
    at the corresponding NeuroSynth feature page. Every occurrence of
    the same text is attached to the same annotation as an extent.
    """
    pattern = r'(functional abnormalities)'
    # Scan the document for some regular expression
    hits = document.search(pattern, spineapi.RegExp + spineapi.WholeWordsOnly)
    pending = {}  # sanitised match text -> annotation to add
    for hit in hits:
        # Sanitise matched text so it can serve as a dictionary key
        key = hit.text().lower().strip()
        existing = pending.get(key)
        if existing is None:
            # First occurrence of this text: create a new annotation
            existing = spineapi.Annotation()
            existing['concept'] = 'NeuroSynthAnnotation'
            existing['property:name'] = key
            existing['property:description'] = 'Link to NeuroSynth'
            existing['property:webpageUrl'] = 'http://beta.neurosynth.org/features/{0}/'.format(urllib.quote(key))
            existing['session:overlay'] = 'hyperlink'
            existing['session:color'] = '#00AA00'  # Green
            pending[key] = existing
        # Record this occurrence on the (new or pre-existing) annotation
        existing.addExtent(hit)
    # Finally, add the collected annotations to the document
    document.addAnnotations(pending.values())
def on_activate_event(self, document):
    """Annotate the document with ScienceWISE ontology definitions.

    Posts the document's full UTF-8 text (plus an MD5 checksum) to the
    ScienceWISE API, then creates one hyperlink annotation per returned
    term that can be located in the document via its surrounding
    before/after context. Collected definition URLs are attached as
    repeated 'property:definitions' entries.
    """
    text = document.text().encode('utf8')
    text_hash = hashlib.md5(text).hexdigest()
    url = 'http://beta.sciencewise.info/api/utopia'
    payload = urllib.urlencode({'text': text, 'chksum': text_hash})
    # 8-second timeout guards against a slow/unreachable service
    response = urllib2.urlopen(url, payload, timeout=8).read()
    results = json.loads(response)
    annotations = []
    for result in results:
        before = result.get('context', {}).get('before', '')
        term = result.get('value', '')
        after = result.get('context', {}).get('after', '')
        link = result.get('link')
        definitions = [(definition.get('url'), definition.get('title'))
                       for definition in result.get('definitions', [])]
        # Only annotate terms that are non-empty, have some context, and a link
        if len(term) > 0 and len(before) + len(term) + len(after) > 0 and link is not None:
            matches = document.findInContext(before, term, after)
            if len(matches) > 0:
                annotation = spineapi.Annotation()
                annotation['concept'] = 'ScienceWISE'
                annotation['property:webpageUrl'] = link
                annotation['property:term'] = term
                annotation['property:name'] = 'Definitions of {0}'.format(term)
                annotation['property:description'] = 'ScienceWISE ontology definitions'
                annotation['property:sourceDatabase'] = 'sciencewise'
                # FIX: corrected typo 'phycists' -> 'physicists' in user-visible text
                annotation['property:sourceDescription'] = '<p><a href="http://sciencewise.info/">ScienceWISE</a> provides physicists with article annotation and scientific bookmarking.</p>'
                for url, title in definitions:
                    annotation.insertProperty('property:definitions', '{0} {1}'.format(url, title))
                for match in matches:
                    annotation.addExtent(match)
                annotations.append(annotation)
    if len(annotations) > 0:
        document.addAnnotations(annotations)
def on_activate_event(self, document):
    """Annotate the document with ScienceWISE ontology definitions.

    Sends the document's UTF-8 text and its MD5 checksum to the
    ScienceWISE service; each result whose term can be re-found in the
    document (using its before/after context) becomes a hyperlink
    annotation carrying the term's definition links.
    """
    text = document.text().encode('utf8')
    text_hash = hashlib.md5(text).hexdigest()
    url = 'http://beta.sciencewise.info/api/utopia'
    payload = urllib.urlencode({
        'text': text,
        'chksum': text_hash
    })
    # Bounded wait on the remote service
    response = urllib2.urlopen(url, payload, timeout=8).read()
    results = json.loads(response)
    annotations = []
    for result in results:
        before = result.get('context', {}).get('before', '')
        term = result.get('value', '')
        after = result.get('context', {}).get('after', '')
        link = result.get('link')
        definitions = [(definition.get('url'), definition.get('title'))
                       for definition in result.get('definitions', [])]
        # Skip results without a term, without any context, or without a link
        if len(term) > 0 and len(before) + len(term) + len(after) > 0 and link is not None:
            matches = document.findInContext(before, term, after)
            if len(matches) > 0:
                annotation = spineapi.Annotation()
                annotation['concept'] = 'ScienceWISE'
                annotation['property:webpageUrl'] = link
                annotation['property:term'] = term
                annotation['property:name'] = 'Definitions of {0}'.format(term)
                annotation['property:description'] = 'ScienceWISE ontology definitions'
                annotation['property:sourceDatabase'] = 'sciencewise'
                # FIX: corrected typo 'phycists' -> 'physicists' in user-visible text
                annotation['property:sourceDescription'] = '<p><a href="http://sciencewise.info/">ScienceWISE</a> provides physicists with article annotation and scientific bookmarking.</p>'
                for url, title in definitions:
                    annotation.insertProperty('property:definitions', '{0} {1}'.format(url, title))
                for match in matches:
                    annotation.addExtent(match)
                annotations.append(annotation)
    if len(annotations) > 0:
        document.addAnnotations(annotations)
def on_activate_event(self, document):
    """Annotate the document using the Reflect entity-recognition service.

    The document text is sent page-by-page (batched into fragments of up
    to ~1MB) to reflect.ws; recognised entity names are then re-located
    in the document and turned into hyperlink annotations pointing at
    Reflect's disambiguation page.
    """
    ns = {'r': 'Reflect'}  # XML namespace map for the Reflect response
    maxTextFragmentSize = 1000000  # cap on characters per service request
    textFragments = []
    seenItemNames = set()  # names already annotated, across fragments
    ignoredEntityTypes = [-11]  # entity type codes to discard
    # Retrieve the full text of the document, split into fragments
    for page in document.pages():
        # Collapse runs of whitespace to single spaces
        pageText = re.sub(r'\s+', r' ', page.pageText())
        if len(textFragments) == 0 or len(textFragments[-1][0]) + len(pageText) > maxTextFragmentSize:
            # Start a new fragment, remembering the page it starts on
            textFragments.append([pageText, page])
        else:
            # Append this page's text onto the current fragment
            textFragments[-1][0] = textFragments[-1][0] + ' ' + pageText
    for text, page in textFragments:
        # Package it as URL encoded form encoding
        payload = 'document=%s' % urllib.quote(text.encode('utf8'))
        # Send it off to the reflect server
        response = urllib2.urlopen("http://reflect.ws/REST/GetEntities", payload, timeout=8)
        # Parse response
        root = etree.fromstring(response.read(), self.parser)
        reflections = {}  # lower-cased item name -> set of (type, identifier)
        annotations = {}  # lower-cased item name -> annotation being built
        for item in root.xpath('//r:item', namespaces=ns):
            # Extract the item's name as plain text (may contain markup)
            itemName = etree.tostring(item.find('{%s}name' % ns['r']), method="text", encoding=unicode, with_tail=False).lower().strip()
            if itemName not in seenItemNames:
                for entity in item.xpath('.//r:entity', namespaces=ns):
                    entityType = entity.findtext('{%s}type' % ns['r'])
                    if entityType is not None:
                        entityType = int(entityType)
                        if entityType not in ignoredEntityTypes:
                            entityIdentifier = entity.findtext('{%s}identifier' % ns['r'])
                            if itemName not in reflections:
                                reflections[itemName] = set()
                            reflections[itemName].add((entityType, entityIdentifier))
        # For each match, create an annotation that the UI will handle later
        regex = '(%s)' % '|'.join([re.escape(key) for key in reflections.iterkeys()])
        matches = document.search(regex, IgnoreCase + WholeWordsOnly + RegExp, start=page)
        for match in matches:
            # Only accept matches that begin at a word boundary
            if match.begin().wordArea()[1] == 0:
                itemName = match.text().lower().strip()
                annotation = annotations.get(itemName, None)
                if annotation is None and itemName in reflections:
                    annotation = Annotation()
                    annotation['concept'] = 'Reflection'
                    # Link to Reflect's disambiguation page listing all entities
                    annotation['property:webpageUrl'] = \
                        'http://reflect.ws/fcgi-bin/solveAmbig.fcgi?entities=%s' % \
                        ';'.join(['%d.%s' % (t, id) for (t, id) in reflections[itemName]])
                    annotation['property:name'] = itemName
                    annotation['session:overlay'] = 'hyperlink'
                    annotation['session:color'] = '#0A0'
                    annotations[itemName] = annotation
                    seenItemNames.add(itemName)
                if annotation is not None:
                    annotation.addExtent(match)
                else:
                    # Match without a corresponding reflections entry; log it
                    print "ERROR: matched '%s' but could not find in reflections map" % itemName.encode('utf8')
                    print reflections.keys()
        # NOTE(review): annotations is rebuilt per fragment, so this add
        # appears to belong inside the fragment loop — confirm nesting
        document.addAnnotations(annotations.values())
def after_ready_event(self, document):
    """Format accumulated metadata into displayable annotations.

    Picks the first metadata accumulation list containing Citation
    annotations, renders a bibliography from it, then promotes
    ForwardCitation, Table and DocumentMetadata annotations from the
    accumulation lists into the document for display.
    """
    print 'Formatting metadata'

    # Find highest matching metadata accumulation list for references
    source = None
    for accListLink in document.getAccLists('metadata'):
        matches = document.annotationsIf({'concept': 'Citation'}, accListLink['scratch'])
        if len(matches) > 0:
            print 'Selected for [Citation] list %s with rank %s' % (accListLink['scratch'], repr(accListLink.get('rank', 0)))
            source = accListLink
            bibliography = list(matches)
            bibliography.sort(key=sortfn)
            # Render each citation into a single rich-text bibliography string
            rt = ''
            for annotation in bibliography:
                citation = utopia.tools.utils.citation_from_annotation(annotation)
                rt += utopia.citation.render(citation, links=True)
            if len(bibliography) > 0:
                # Create Metadata link annotation
                link = document.newAccList('citation_list')
                link['property:list_name'] = 'Bibliography'
                document.addAnnotations(bibliography, link['scratch'])
            if len(rt) > 0:
                # Build the displayable bibliography annotation
                references = spineapi.Annotation()
                references['displayBibliography'] = rt
                references['concept'] = 'BibliographyMetadata'
                references['property:identifier'] = '#bibliography'
                references['property:name'] = 'Bibliography'
                references['displayName'] = 'Bibliography'
                references['displayRelevance'] = '800'
                if accListLink is not None:
                    # Copy source attribution properties from the chosen list
                    for i in ('sourceIcon', 'sourceTitle', 'sourceDescription', 'sourceDatabase'):
                        k = 'property:{0}'.format(i)
                        if k in accListLink:
                            references[k] = accListLink[k]
                    references['property:description'] = 'From ' + accListLink['property:sourceTitle']
                document.addAnnotation(references)
            break
    if source is None:
        print 'No metadata found'

    # Find highest matching metadata accumulation list for in-text citations
    for accListLink in document.getAccLists('metadata'):
        matches = document.annotationsIf({'concept': 'ForwardCitation'}, accListLink['scratch'])
        if len(matches) > 0:
            print 'Selected for [ForwardCitation] list %s with rank %s' % (accListLink['scratch'], repr(accListLink.get('rank', 0)))
            document.addAnnotations(matches)
            break

    # Find highest matching metadata accumulation list for tables
    for accListLink in document.getAccLists('metadata'):
        matches = document.annotationsIf({'concept': 'Table'}, accListLink['scratch'])
        if len(matches) > 0:
            print 'Selected for [Table] list %s with rank %s' % (accListLink['scratch'], repr(accListLink.get('rank', 0)))
            document.addAnnotations(matches)
            break

    # Promote the document metadata annotation from the chosen source list
    metadata = None
    if source is not None:
        for annotation in document.annotations(source['scratch']):
            if annotation.get('concept') == 'DocumentMetadata':
                metadata = annotation
    if metadata:
        metadata['displayName'] = 'Document Information'
        metadata['displayRelevance'] = '1000'
        document.addAnnotation(metadata, 'Document Metadata')
def on_activate_event(self, document):
    """Annotate the document using the Reflect entity-recognition service.

    Batches the document's page text into fragments of up to ~1MB, posts
    each fragment to reflect.ws, then searches the document for the
    returned entity names and attaches hyperlink annotations pointing at
    Reflect's disambiguation page.
    """
    ns = {'r': 'Reflect'}  # XML namespace map for the Reflect response
    maxTextFragmentSize = 1000000  # cap on characters per service request
    textFragments = []
    seenItemNames = set()  # names already annotated, across fragments
    ignoredEntityTypes = [-11]  # entity type codes to discard
    # Retrieve the full text of the document, split into fragments
    for page in document.pages():
        # Collapse runs of whitespace to single spaces
        pageText = re.sub(r'\s+', r' ', page.pageText())
        if len(textFragments) == 0 or len(textFragments[-1][0]) + len(pageText) > maxTextFragmentSize:
            # Start a new fragment, remembering the page it starts on
            textFragments.append([pageText, page])
        else:
            # Append this page's text onto the current fragment
            textFragments[-1][0] = textFragments[-1][0] + ' ' + pageText
    for text, page in textFragments:
        # Package it as URL encoded form encoding
        payload = 'document=%s' % urllib.quote(text.encode('utf8'))
        # Send it off to the reflect server
        response = urllib2.urlopen("http://reflect.ws/REST/GetEntities", payload, timeout=8)
        # Parse response
        root = etree.fromstring(response.read(), self.parser)
        reflections = {}  # lower-cased item name -> set of (type, identifier)
        annotations = {}  # lower-cased item name -> annotation being built
        for item in root.xpath('//r:item', namespaces = ns):
            itemName = item.findtext('{%s}name' % ns['r']).lower().strip()
            if itemName not in seenItemNames:
                for entity in item.xpath('.//r:entity', namespaces = ns):
                    entityType = entity.findtext('{%s}type' % ns['r'])
                    if entityType is not None:
                        entityType = int(entityType)
                        if entityType not in ignoredEntityTypes:
                            entityIdentifier = entity.findtext('{%s}identifier' % ns['r'])
                            if itemName not in reflections:
                                reflections[itemName] = set()
                            reflections[itemName].add((entityType, entityIdentifier))
        # For each match, create an annotation that the UI will handle later
        regex = '(%s)' % '|'.join([re.escape(key) for key in reflections.iterkeys()])
        matches = document.search(regex, IgnoreCase + WholeWordsOnly + RegExp, start = page)
        for match in matches:
            # Only accept matches that begin at a word boundary
            if match.begin().wordArea()[1] == 0:
                itemName = match.text().lower().strip()
                annotation = annotations.get(itemName, None)
                if annotation is None and itemName in reflections:
                    annotation = Annotation()
                    annotation['concept'] = 'Reflection'
                    # Link to Reflect's disambiguation page listing all entities
                    annotation['property:webpageUrl'] = \
                        'http://reflect.ws/fcgi-bin/solveAmbig.fcgi?entities=%s' % \
                        ';'.join(['%d.%s' % (t, id) for (t, id) in reflections[itemName]])
                    annotation['property:name'] = itemName
                    annotation['session:overlay'] = 'hyperlink'
                    annotation['session:color'] = '#0A0'
                    annotations[itemName] = annotation
                    seenItemNames.add(itemName)
                if annotation is not None:
                    annotation.addExtent(match)
                else:
                    # Match without a corresponding reflections entry; log it
                    print "ERROR: matched '%s' but could not find in reflections map" % itemName.encode('utf8')
                    print reflections.keys()
        # NOTE(review): annotations is rebuilt per fragment, so this add
        # appears to belong inside the fragment loop — confirm nesting
        document.addAnnotations(annotations.values())
class _GenericAnnotator(utopia.document.Annotator): """Annotates text with urls pointing to a website. Derived classes should specialize with the list of terms, and generating URL per given term """ _name = None def get_terms(self): raise NotImplementedError def get_url(self, term): raise NotImplementedError def get_terms_regex(self): terms = self.get_terms() _debug("Got %d terms" % len(terms)) r = r'(%s)' % '|'.join(terms) _debug("IS IT THERE: %s" % ("functional abnormalities" in r)) _debug(r) return r # re.compile("functional abnormalities") #return re.compile("(functional abnormalities|structural abnormalities|temporoparietal junction|ventrolateral prefrontal)") @utopia.document.buffer # Wrap/buffer the function #def on_ready_event(self, document): def on_activate_event(self, document): _debug('activate base') # Scan the document for some regular expression matches = document.search( self.get_terms_regex(), spineapi.RegExp + spineapi.WholeWordsOnly + spineapi.IgnoreCase) to_add = {} # Dictionary of annotations to add try: for match in matches: _debug("Match %s" % str(match)) # Sanitise matches found in document for dict keys match_text = match.text().lower().strip() # Has same text already been annotated? 
annotation = to_add.get(match_text, None) if annotation is None: # If no, create new annotation annotation = spineapi.Annotation() annotation['concept'] = 'Annotation %s' % self._name annotation['property:name'] = match_text annotation[ 'property:description'] = 'Link to %s' % self._name annotation['property:webpageUrl'] = self.get_url( match_text) annotation['session:overlay'] = 'hyperlink' annotation['session:color'] = '#00AA00' # Green to_add[match_text] = annotation if annotation is not None: # Add the match to the annotation, in any case _debug("Added %s" % str(annotation)) annotation.addExtent(match) except Exception, e: _debug("ERROR: %s" % str(e)) # Finally, add the annotations to the document document.addAnnotations(to_add.values()) _debug('finished activate base')