def resolve(self, citations, document=None):
    citation = {}
    pubmed_id = utopia.citation.pick_from(citations, 'identifiers[pubmed]', None, record_in=citation)
    if pubmed_id is None:
        doi = utopia.citation.pick_from(citations, 'identifiers[doi]', None, record_in=citation)
        if doi is not None:
            pubmed_id = utopia.tools.pubmed.identify(doi, 'doi')
            if pubmed_id is not None:
                citation['identifiers'] = {'pubmed': pubmed_id}
    if pubmed_id is None:
        title = utopia.citation.pick_from(citations, 'title', None, record_in=citation)
        if title is not None:
            title = title.strip(' .')
            pubmed_results = utopia.tools.pubmed.search(title)
            pubmed_title = pubmed_results.get('title', '').strip(' .')
            if len(pubmed_title) > 0:
                matched = False
                pubmed_pmid = pubmed_results.get('identifiers', {}).get('pubmed')
                # Fuzzy match
                if re.sub(r'[^\w]+', ' ', title).strip().lower() == re.sub(r'[^\w]+', ' ', pubmed_title).strip().lower():
                    matched = True
                elif document is not None:
                    # Accept the PubMed title over the scraped title, if present in the document
                    matches = document.findInContext('', pubmed_title, '')  # Fuzzy match
                    if len(matches) > 0:
                        matched = True
                        pubmed_title = matches[0].text()
                if matched:
                    citation.update(pubmed_results)
                    citation['title'] = pubmed_title
    return citation
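# The fuzzy title comparison above boils down to a simple normalisation. A
# standalone sketch of that check follows; the helper name is ours, not part
# of the plugin API.
import re

def titles_match(a, b):
    """Collapse runs of non-word characters to single spaces and compare
    case-insensitively, as the resolver does above."""
    norm = lambda s: re.sub(r'[^\w]+', ' ', s).strip().lower()
    return norm(a) == norm(b)

assert titles_match('The Cell: a review.', 'the cell -- A Review')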
def on_activate_event(self, document):
    text = document.text().encode('utf8')
    text_hash = hashlib.md5(text).hexdigest()
    url = 'http://beta.sciencewise.info/api/utopia'
    payload = urllib.urlencode({'text': text, 'chksum': text_hash})
    response = urllib2.urlopen(url, payload, timeout=8).read()
    results = json.loads(response)
    annotations = []
    for result in results:
        before = result.get('context', {}).get('before', '')
        term = result.get('value', '')
        after = result.get('context', {}).get('after', '')
        link = result.get('link')
        definitions = []
        for definition in result.get('definitions', []):
            definitions.append((definition.get('url'), definition.get('title')))
        if len(term) > 0 and len(before) + len(term) + len(after) > 0 and link is not None:
            matches = document.findInContext(before, term, after)
            if len(matches) > 0:
                annotation = spineapi.Annotation()
                annotation['concept'] = 'ScienceWISE'
                annotation['property:webpageUrl'] = link
                annotation['property:term'] = term
                annotation['property:name'] = 'Definitions of {0}'.format(term)
                annotation['property:description'] = 'ScienceWISE ontology definitions'
                annotation['property:sourceDatabase'] = 'sciencewise'
                annotation['property:sourceDescription'] = '<p><a href="http://sciencewise.info/">ScienceWISE</a> provides physicists with article annotation and scientific bookmarking.</p>'
                for url, title in definitions:
                    annotation.insertProperty('property:definitions', '{0} {1}'.format(url, title))
                for match in matches:
                    annotation.addExtent(match)
                annotations.append(annotation)
    if len(annotations) > 0:
        document.addAnnotations(annotations)
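# For reference, the parsing loop above implies ScienceWISE response entries
# shaped roughly as below. Only the keys are grounded in the .get() calls;
# the values here are invented for illustration.
example_result = {
    'value': 'renormalization',
    'context': {'before': 'applying the usual ', 'after': ' procedure to'},
    'link': 'http://sciencewise.info/...',
    'definitions': [
        {'url': 'http://sciencewise.info/...', 'title': 'Renormalization'},
    ],
}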
def on_ready_event(self, document):
    '''Fetch information from the Lazarus service'''
    permission = self.get_config('permission', False)
    if permission:
        # If an outline already exists, don't make a new one
        needs_outline = True
        for annotation in document.annotations():
            if annotation.get('concept') == 'OutlineItem':
                needs_outline = False
                break

        # The Lazarus server needs to know what this document is
        document_id = utopia.tools.utils.metadata(document, 'identifiers[utopia]')
        this_doi = utopia.tools.utils.metadata(document, 'identifiers[doi]')
        if this_doi is not None:
            this_doi = u'doi:' + this_doi

        # Speak to server; a 204 means the server has no data for this
        # fingerprint yet, so upload the PDF itself for processing
        params = {'fingerprint': document.fingerprints()}
        url = '{0}?{1}'.format(laz_docUrl, urllib.urlencode(params, doseq=True))
        response = urllib2.urlopen(url, timeout=60)
        if response.getcode() == 204:
            request = urllib2.Request(url, data=document.data(),
                                      headers={'Content-Type': 'application/pdf'})
            response = urllib2.urlopen(request, timeout=60)

        # Create Metadata link annotation
        link = document.newAccList('metadata', 50)
        link['property:sourceDatabase'] = 'lazarus'
        link['property:sourceTitle'] = 'Lazarus'
        link['property:sourceDescription'] = self.sourceDescription
        link['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/lazarus-prefs-logo.png', 'image/png')

        headers = []
        pos = []
        refs = []
        annotations = []
        concepts = {}
        hits = []
        expression_annotations = []
        for kAnnotation in kend.converter.XML.parse(response, kend.model.Document):
            try:
                annotations.append(utopia.tools.converters.Annotation.kend2spineapi(kAnnotation, document))
            except:
                pass
        annotations.sort(key=lambda a: int(a.get('structure:order', 0)))
        for sAnnotation in annotations:
            if sAnnotation['concept'] == 'structure_element':
                role, level = self.getHeaderRole(sAnnotation)
                if role is not None and needs_outline:
                    # Maintain the running outline position (e.g. 2.1.3)
                    while len(pos) < level:
                        pos.append(0)
                    while len(pos) > level:
                        pos.pop()
                    pos[-1] += 1
                    outline = u'.'.join([unicode(i) for i in pos])
                    anchor_name = '#lazarus.outline.{0}'.format(outline)
                    anchor = spineapi.Annotation()
                    anchor['concept'] = 'Anchor'
                    anchor['property:anchor'] = anchor_name
                    anchor.addExtents(sAnnotation.extents())
                    anchor.addAreas(sAnnotation.areas())
                    document.addAnnotation(anchor)
                    header = spineapi.Annotation()
                    header['concept'] = 'OutlineItem'
                    header['property:outlinePosition'] = outline
                    header['property:outlineTitle'] = u' '.join([e.text() for e in sAnnotation.extents()])
                    header['property:destinationAnchorName'] = anchor_name
                    document.addAnnotation(header)
                elif 'bibitem' in sAnnotation.getAllProperties('structure:role'):
                    #refs.append(sAnnotation)
                    pass
            elif sAnnotation['concept'] == 'Citation':
                # Hack to fix a mistake in authors property name
                if 'property:author' in sAnnotation and not 'property:authors' in sAnnotation:
                    sAnnotation['property:authors'] = sAnnotation.getAllProperties('property:author')
                refs.append(sAnnotation)
            elif sAnnotation['concept'] == 'LazarusConcept':
                concept_id = sAnnotation.get('property:identifier')
                if concept_id is not None:
                    sAnnotation['id'] = str(uuid.uuid4())
                    concepts[concept_id] = sAnnotation
                document.addAnnotation(sAnnotation, 'Lazarus Concept')
            elif sAnnotation['concept'] == 'LazarusConceptHit':
                hits.append(sAnnotation)
            elif sAnnotation['concept'] == 'LazarusSentenceExpression':
                expression_annotations.append(sAnnotation)
            else:
                document.addAnnotation(sAnnotation)

        # Create Bibliography annotations
        refs = sorted(refs, key=lambda ref: int(ref.get('property:order', '0')))
        for ref in refs:
            document.addAnnotation(ref, link['scratch'])

        # Now link hits to concepts
        for i, hit in enumerate(hits):
            concept_id = hit.get('property:identifier')
            concept = concepts.get(concept_id)
            if concept is not None:
                concept_uuid = concept.get('id')
                hit['property:concept_id'] = concept_uuid
                identifier = concept.get('property:identifier')
                name = concept.get('property:name', '???')
                sources = concept.get('property:externalSources', 'json:[]')
                if sources.startswith('json:'):
                    sources = json.loads(sources[5:])
                if 'property:stdInchiKey' in concept:
                    sources.append({'database': ' InchiKey',
                                    'identifier': concept['property:stdInchiKey']})
                if 'property:canonicalSmiles' in concept:
                    sources.append({'database': ' SMILES',
                                    'identifier': concept['property:canonicalSmiles']})
                kind = concept.get('property:kind')
                kind = self.dbs.get(kind, {}).get('title', kind)
                links = {}
                for source in sources:
                    uri = source.get('uri')
                    if 'primary' in source.get('relationship', []):
                        links.setdefault('definition', [])
                        links['definition'].append(u'<a href="{uri}" title="{uri}">{database}</a>'.format(**source))
                    elif uri is None:
                        if source.get('database') in (' InchiKey', ' SMILES'):
                            links.setdefault('main', [])
                            links['main'].append(u'<tr><td>{database}:</td><td>{identifier}</td></tr>'.format(**source))
                    else:
                        identifier = source.get('identifier')
                        links_category = 'xref'
                        if 'seeAlso' in source.get('relationship', []) or uri is None:
                            links_category = 'seeAlso'
                        links.setdefault(links_category, [])
                        if identifier is not None:
                            links[links_category].append(u'<a href="{uri}" title="{uri}">{name}...</a> ({identifier})'.format(**source))
                        else:
                            links[links_category].append(u'<a href="{uri}" title="{uri}">{name}...</a>'.format(**source))
                style = u'''
                    <style>
                      .lazarus-table tbody { border: none; }
                      .lazarus-table td:first-of-type { text-align: right; font-weight: bold; }
                      .lazarus-table td { vertical-align: top; }
                      .lazarus-table td:first-of-type { white-space: nowrap; }
                      .lazarus-table td:not(:first-of-type) { word-break: break-all; }
                      .lazarus-table tr td { padding-top: 0ex; padding-bottom: 0ex; }
                      .lazarus-table tbody:not(:first-of-type) tr:first-of-type td { padding-top: 1ex; }
                    </style>
                '''
                html = u'''
                    <table class="lazarus-table">
                    <tr><td>Name:</td><td>{name}</td></tr>
                '''.format(**{'name': name})
                categories = {'xref': 'Related:', 'seeAlso': 'See also:', 'definition': 'Defined in:'}
                for links_category in ('main', 'xref', 'seeAlso', 'definition'):
                    links_title = categories.get(links_category)
                    these_links = sorted(list(set(links.get(links_category, []))))
                    if len(these_links) > 0:
                        html += '<tbody>'
                        if links_category != 'main':
                            html += u'<tr><td>{0}</td><td>'.format(links_title)
                            html += u'<br>'.join(these_links)
                            html += '</td></tr>'
                        else:
                            html += ''.join(these_links)
                        html += '</tbody>'
                html += u'''
                    </table>
                '''
                hasLinks = len(links.get('xref', []) + links.get('seeAlso', [])) > 0

                ann = spineapi.Annotation()
                ann['concept'] = 'Collated'
                ann['property:name'] = u'{0}'.format(name)
                ann['property:description'] = 'Lazarus Concept'
                ann['session:semanticTerm'] = name
                ann['property:html'] = [style, html]
                ann['property:sourceDescription'] = self.sourceDescription
                ann['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/lazarus-prefs-logo.png', 'image/png')
                ann['session:overlay'] = 'hyperlink'
                ann['session:color'] = '#880000'
                count = 0
                if 'property:hitFragments' in hit:
                    hitFragments = hit.getAllProperties('property:hitFragments') or []
                    for hitFragment in hitFragments:
                        # Fragments arrive as 'pre {! match !} post'
                        pre, _, rest = hitFragment.partition('{!')
                        match, _, post = rest.partition('!}')
                        matches = document.findInContext(pre, match, post, fuzzy=True)
                        count += len(matches)
                        ann.addExtents(matches)
                if hasLinks and count > 0:
                    document.addAnnotation(ann)

        style = u'''
            <style>
              .lazarus-expression .box {
                background-color: #FFF0E8;
                border-color: #EEE0D8;
              }
              .lazarus-related {
                padding-left: 42px;
                background-image: url(%s);
                background-repeat: no-repeat;
                background-position: top left;
                background-size: 37px 48px;
                min-height: 53px;
              }
              .lazarus-related + .lazarus-related {
                margin-top: 5px;
                border-top: 1px dotted #aaa;
                padding-top: 5px;
                background-position-y: 5px;
                min-height: 58px;
              }
              .lazarus-sentence {
                padding-left: 0.5em;
                color: black;
              }
              .lazarus-sentence.negative { border-left: solid 5px #bb0000; }
              .lazarus-sentence.positive { border-left: solid 5px #008800; }
              .lazarus-sentence.negative a { color: #bb0000; }
              .lazarus-sentence.positive a { color: #008800; }
            </style>
        ''' % utopia.get_plugin_data_as_url('images/pdf-page-icon.png', 'image/png')

        expressions = []
        for sAnnotation in expression_annotations:
            exp = sAnnotation.get('property:expressions', 'json:{}')
            if exp.startswith('json:'):
                exp = json.loads(exp[5:])
            context = sAnnotation.get('property:context')
            if context is not None:
                if exp.get('negative', False):
                    exp['posneg'] = 'negative'
                else:
                    exp['posneg'] = 'positive'
                matched_context = exp.get('context')
                matches = []
                if matched_context is not None:
                    matches = document.search(re.sub(r'\s+', ' ', matched_context))
                if len(matches) > 0:
                    anchor_id = str(uuid.uuid4())[1:-1]
                    anchor = spineapi.Annotation()
                    anchor['concept'] = 'Anchor'
                    anchor['property:anchor'] = anchor_id
                    anchor.addExtents(matches)
                    document.addAnnotation(anchor)
                    exp.update({'anchor_id': anchor_id, 'sentence': context})
                    expressions.append(exp)

        js = u'''
            <script>
              $(document).on('DOMNodeInserted', function(e) {
                  var element = e.target;
                  $(element).filter('a[target="tab"]').add('a[target="tab"]', element).each(function () {
                      var fragment = $(this).closest('.-papyro-internal-citation').data('citation')['userdef']['first_fragment'];
                      $(this).attr('target', 'pdf; show=highlight; text=[' + encodeURIComponent(fragment) + ']');
                  });
              });
              $(function () {
                  var lazarus = {
                      expressions: %s,
                      fingerprints: %s,
                      relUrl: %s
                  };
                  var more_expressions_link = $('#lazarus-expression > p.more').hide();
                  var more_expressions_spinner = $('#lazarus-expression > div.spinner');
                  Spinners.create(more_expressions_spinner);
                  Spinners.play(more_expressions_spinner);
                  var exp_divs = [];
                  var identifiers = [];
                  for (var e = 0; e < lazarus.expressions.length; e++) {
                      var expression = lazarus.expressions[e];
                      var exp_div = $('<div class="box"></div>');
                      exp_div.data('expression', expression);
                      exp_div.hide();
                      exp_divs.push(exp_div);
                      identifiers.push(expression.identifiers);
                  }
                  var params = { fingerprint: lazarus.fingerprints };
                  var url = lazarus.relUrl + '?' + $.param(params, traditional=true);
                  $.ajax({
                      url: url,
                      type: 'POST',
                      dataType: 'json',
                      data: JSON.stringify(identifiers),
                      contentType: "application/json",
                      error: function (xhr, ajaxOptions, thrownError) {
                          console.log(xhr.statusText);
                          console.log(xhr.responseText);
                          console.log(xhr.status);
                          console.log(thrownError);
                          // FIXME do something here
                          Spinners.remove(more_expressions_spinner);
                      },
                      success: function (related) {
                          // Sort related according to the number of articles found
                          related.results.sort(function (l, r) {
                              var lv = Object.keys(l.related).length;
                              var rv = Object.keys(r.related).length;
                              return (lv > rv) ? -1 : (lv < rv) ? 1 : 0;
                          });
                          $.each(related.results, function (idx, result) {
                              var exp_div = exp_divs[idx];
                              var expression = exp_div.data('expression');
                              expression.related = result.related;
                              delete expression.related[%s];
                              split = expression.sentence.split(expression.context);
                              pre = split[0];
                              pre = pre.replace(/(\w)$/, '$1 ');
                              pre = pre.replace(/^\s*/, '');
                              match = expression.context;
                              post = split[1];
                              post = post.replace(/^(\w)/, ' $1');
                              post = post.replace(/\s*$/, '');
                              expression.pre = pre;
                              expression.match = match;
                              expression.post = post;
                              // Create expression element
                              exp_div.append('<p class="lazarus-sentence ' + expression.posneg + '">“' + expression.pre + '<a target="pdf; show=select; anchor=' + expression.anchor_id + '"><strong>' + expression.match + '</strong></a>' + expression.post + '”</p>');
                              exp_div.data('expression', expression);
                              $('#lazarus-expression > .content').append(exp_div);
                              if (Object.keys(expression.related).length > 0) {
                                  var related_div = $('<div class="expandable" title="Related expressions elsewhere"></div>');
                                  var related_div_content = $('<div></div>').appendTo(related_div);
                                  function on_expand() {
                                      related_div.off('papyro:expandable:expand', on_expand);
                                      $.each(expression.related, function (idx, obj) {
                                          fragments = [];
                                          $.each(obj, function (id, obj) {
                                              fragments.push(obj.context);
                                          });
                                          fragments.join('\\n');
                                          related_div_content.append($('<div class="lazarus-related unprocessed"></div>').append('<p><strong>“…'+fragments+'…”</strong></p>').hide().data('citation', {identifiers:{doi:idx},userdef:{first_fragment:fragments[0]}}));
                                      });
                                      expression.related.length = 0; // empty for future
                                      if ($('.lazarus-related.unprocessed', exp_div).length > 0) {
                                          var more = $('<p class="more right"><a class="more">More related articles...</a></p>');
                                          related_div_content.append(more);
                                          function show_five_related(e) {
                                              e.preventDefault();
                                              $('.lazarus-related.unprocessed', exp_div).slice(0, 5).each(function (idx, obj) {
                                                  var citation = $(obj).data('citation');
                                                  $(obj).append(utopia.citation.render(citation, true, true));
                                                  $(obj).show().removeClass('unprocessed');
                                              });
                                              if ($('.lazarus-related.unprocessed', exp_div).length == 0) {
                                                  more.remove();
                                              }
                                          }
                                          more.on('click', show_five_related).click();
                                      }
                                  }
                                  related_div.on('papyro:expandable:expand', on_expand);
                                  exp_div.append(related_div);
                                  utopia.processNewContent(related_div);
                              }
                          });
                          Spinners.remove(more_expressions_spinner);
                          more_expressions_link.show();
                          $('a.more', more_expressions_link).click();
                      }
                  });
                  function append_five(e) {
                      e.preventDefault();
                      // Show the next five
                      $('#lazarus-expression > .content').children().filter(':hidden').slice(0,5).show();
                      // Hide the 'more' link if everything is now visible
                      if ($('#lazarus-expression > .content').children().filter(':hidden').length == 0) {
                          more_expressions_link.hide();
                      }
                  }
                  // Hook up 'more' link
                  $('#lazarus-expression > p.more > a.more').on('click', append_five).click();
              });
            </script>
        ''' % (json.dumps(expressions), json.dumps(document.fingerprints()),
               json.dumps(laz_docRelUrl), json.dumps(this_doi))

        html = u'''
            <div id="lazarus-expression"><div class="content"></div><div class="spinner"></div><p class="more"><a class="more">More expressions...</a></p></div>
        '''
        if len(expressions) > 0:
            ann = spineapi.Annotation()
            ann['concept'] = 'Collated'
            ann['property:name'] = 'Lazarus Expressions'
            ann['property:description'] = u'Summarizing expression(s)'
            ann['property:html'] = [js, style, html]
            ann['property:sourceDescription'] = self.sourceDescription
            ann['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/lazarus-prefs-logo.png', 'image/png')
            document.addAnnotation(ann)
    else:
        # No permission: prompt the user (unless they have opted out)
        noprompt = self.get_config('noprompt', False)
        if not noprompt:
            annotation = spineapi.Annotation()
            annotation['concept'] = 'Collated'
            params = {'uuid': self.uuid()}
            annotation['property:html'] = utopia.get_plugin_data('tpl/denied.html').format(**params)
            annotation['property:name'] = 'Lazarus'
            annotation['property:description'] = 'Lazarus functionality is turned off'
            annotation['property:sourceDescription'] = self.sourceDescription
            annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/lazarus-prefs-logo.png', 'image/png')
            annotation['session:default'] = '1'
            document.addAnnotation(annotation)
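# The hit fragments parsed above follow a 'pre {! match !} post' convention;
# a self-contained check of that partitioning logic (fragment text invented):
fragment = 'binding of {!calmodulin!} to the receptor'
pre, _, rest = fragment.partition('{!')
match, _, post = rest.partition('!}')
assert (pre, match, post) == ('binding of ', 'calmodulin', ' to the receptor')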
def on_ready_event(self, document):
    volume, page = None, None

    # Only send if the DOI has a Portland prefix
    doi = utopialib.utils.metadata(document, 'identifiers[doi]')
    if doi is not None and doi[:7] in registrants:
        crossref_unixref = utopialib.utils.metadata(document, 'raw_crossref_unixref')
        if crossref_unixref is not None:
            # Parse CrossRef redirect URL
            dom = etree.fromstring(crossref_unixref.encode('utf8'))
            resource = dom.findtext('doi_record/crossref/journal/journal_article/doi_data/resource')
            if resource is not None:
                match = self.resourceRegExp.match(resource)
                if match is not None:
                    volume, page = match.groups()

        ### FIXME What information should be shown? Portland? BJ?
        #annotation = spineapi.Annotation()
        #annotation['concept'] = 'PublisherIdentity'
        #annotation['property:logo'] = utopia.get_plugin_data_as_url('images/logo.png', 'image/png')
        #annotation['property:title'] = 'Portland Press Limited'
        #annotation['property:webpageUrl'] = 'http://www.portlandpress.com/'
        #document.addAnnotation(annotation, 'PublisherMetadata')

    # If this document was resolved, off we go to fetch the NLM
    if None not in (volume, page):
        # Make a request to the utopia ext web service
        url = 'https://utopia.cs.manchester.ac.uk/ext/portland/nlm?{0}'
        url = url.format(urllib.urlencode({'volume': volume, 'page': page}))
        try:
            nlm = urllib2.urlopen(url, timeout=8).read()
        except (urllib2.URLError, socket.timeout):
            # Bail out quietly if the service is unreachable
            return

        info = utopialib.nlm.parse(nlm)
        if info is not None and len(info) > 0:
            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True, recover=True,
                                     remove_blank_text=True, encoding='utf8')
            pmids = dict(((citation['pmid'], citation['id'])
                          for citation in info['citations']
                          if 'pmid' in citation and 'id' in citation))
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(utopialib.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info['citations_by_id'][pmids[pmid]]
                        for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                            id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id

            # Create Metadata link annotation
            link = document.newAccList('metadata', 100)
            link['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/biochemj.png', 'image/png')
            link['property:sourceTitle'] = 'Portland'
            link['property:sourceDescription'] = '''
                <p><a href="http://www.portlandpress.com/">Portland Press Limited</a>.</p>
            '''

            # Create Metadata annotation
            annotation = utopialib.utils.citation_to_annotation(info.get('self', {}), 'DocumentMetadata')
            document.addAnnotation(annotation, link['scratch'])

            # Create Bibliography annotations
            for citation in info.get('citations', []):
                annotation = utopialib.utils.citation_to_annotation(citation)
                document.addAnnotation(annotation, link['scratch'])

            ###################################################################
            # Apply parsed data to document

            # Citations: find cross refs
            for citation in info['citations']:
                for pre, label, post in citation.get('contexts', []):
                    matches = document.findInContext(pre, label, post)
                    if len(matches) > 0:
                        annotation = utopialib.utils.citation_to_annotation(citation, concept='ForwardCitation')
                        if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                            citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                        if 'pmcid' in citation:
                            citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                        for match in matches:
                            annotation.addExtent(match)
                        document.addAnnotation(annotation, link['scratch'])

            # Tables
            for id, table in info.get('tables', {}).iteritems():
                if 'caption' in table and 'xml' in table:
                    regex = fuzz(table['caption'], strict=True)
                    matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation['concept'] = 'Table'
                        annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                        annotation['session:volatile'] = '1'
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link['scratch'])
                    else:
                        print '*********** failed to match table:', id
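# fuzz() is defined elsewhere in these plugins. Given that its result is fed
# to document.search() with spineapi.RegExp + spineapi.IgnoreCase, a plausible
# sketch is below; the whitespace handling and the meaning of strict=True are
# assumptions, not the plugin's actual implementation.
import re

def fuzz(text, strict=False):
    # Escape each token, then let any whitespace run match any other
    pattern = r'\s+'.join(re.escape(token) for token in text.split())
    if strict:
        pattern = r'\b' + pattern + r'\b'  # assumed: anchor to word boundaries
    return pattern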
def on_ready_event(self, document):
    doi = utopialib.utils.metadata(document, 'identifiers[doi]')
    if doi is not None:
        info = {}

        # Resolve the DOI to find the publisher's website
        response = urllib2.urlopen('http://dx.doi.org/{0}'.format(doi), timeout=8)

        # Parse page to find (if there) the full text URL
        parser = etree.HTMLParser()
        html = etree.parse(response, parser)

        # Only continue if this is a highwire HTML page
        if len(html.xpath("/html/head/meta[@name='HW.identifier']")) == 0:
            return

        # Now make sure we have the full text XHTML
        citation_fulltext_html_url = html.xpath("/html/head/meta[@name='citation_fulltext_html_url']/@content")
        if len(citation_fulltext_html_url) > 0:
            citation_fulltext_html_url = citation_fulltext_html_url[0]

            # Fetch that full text page (if different to the current one)
            if citation_fulltext_html_url != response.geturl():
                response = urllib2.urlopen(citation_fulltext_html_url, timeout=8)
                html = etree.parse(response, parser)

            # Now parse out the bibliography
            info['citations'] = []
            info['citations_by_id'] = {}
            for bibitem in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' ref-list ')]//ol[contains(concat(' ', normalize-space(@class), ' '), ' cit-list ')]/li"):
                citation = query(bibitem, {
                    'id': 'a/@id',
                    'label': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' ref-label ')]/text()",
                    'title': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-article-title ')]/text()",
                    'year': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-date ')]/text()",
                    'publication-title': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-jnl-abbrev ')]/text()",
                    'volume': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-vol ')]/text()",
                    'issue': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-issue ')]/text()",
                    'pagefrom': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-fpage ')]/text()",
                    'pageto': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-lpage ')]/text()",
                    'pmid': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-pmid ')]/text()",
                    'doi': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-doi ')]/text()",
                    'etree': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-metadata ')]",
                })
                authors = []
                for a in bibitem.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-auth ')]"):
                    surname = a.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-surname ')]/text()")
                    given_names = a.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-given-names ')]/text()")
                    if len(surname) > 0 and len(given_names) > 0:
                        authors.append(u'{0}, {1}'.format(surname[0], given_names[0]).strip(', '))
                if len(authors) > 0:
                    citation['authors'] = authors
                citation['contexts'] = []
                citation['displayText'] = utopia.citation.format(citation)
                info['citations'].append(citation)
                info['citations_by_id'][citation['id']] = citation

            ###################################################################
            # Parse in-text citations if present

            min_length = 10
            max_length = 20
            for paragraph in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' article ')]//p"):
                text_stack = [paragraph.text or '']
                xref_stack = [None]
                for elem in paragraph:
                    if len(elem.xpath('self::a[@class="xref-bibr"]')) > 0:
                        text_stack.append(etree.tostring(elem, method='text', encoding=unicode, with_tail=False))
                        text_stack.append(elem.tail or '')
                        xref = info['citations_by_id'].get(elem.get('href', '')[1:])
                        if xref is not None:
                            xref_stack += [[xref], None]
                        else:
                            xref_stack += [[], None]
                    elif isinstance(elem, etree._Entity):
                        points = entities.get(elem.text[1:-1])
                        if points is not None:
                            text_stack[-1] += ''.join((unichr(p) for p in points))
                        else:
                            text_stack[-1] += etree.tostring(elem, encoding=unicode)
                    else:
                        if elem.get('position') == 'float':
                            text_stack[-1] += elem.tail or ''
                        else:
                            text_stack[-1] += etree.tostring(elem, method='text', encoding=unicode)

                # Find and collapse ranges in the text
                for i in xrange(len(xref_stack) - 3, 1, -2):
                    text = text_stack[i].strip()
                    # if this text is a dash, we need to coalesce the text fragments
                    if len(text) == 1 and text in u'-\u2010\u2011\u2012\u2013\u2014\u2015':
                        text_stack[i - 1:i + 2] = [u''.join(text_stack[i - 1:i + 2])]
                        xref_stack[i - 1:i + 2] = [xref_stack[i - 1] + xref_stack[i + 1]]

                # Then make sure we resolve the implied citations
                for i in xrange(1, len(xref_stack), 2):
                    # Get actual cross references
                    xrefs = xref_stack[i]
                    # Expand cross references (a range of two labels implies
                    # all the citations whose labels fall between them)
                    if len(xrefs) == 2:
                        labelfrom = int(xrefs[0].get('label'))
                        labelto = int(xrefs[1].get('label'))
                        candidates = {}
                        midlabels = [unicode(midlabel) for midlabel in xrange(labelfrom + 1, labelto)]
                        for candidate in info['citations']:
                            if candidate.get('label') in midlabels:
                                candidates[int(candidate.get('label'))] = candidate
                        xrefs[1:-1] = candidates.values()

                # Find and collapse lists in the text
                for i in xrange(len(xref_stack) - 3, 1, -2):
                    text = text_stack[i].strip()
                    # if this text is a comma, we need to coalesce the text fragments
                    if len(text) == 1 and text == ',':
                        text_stack[i - 1:i + 2] = [u''.join(text_stack[i - 1:i + 2])]
                        xref_stack[i - 1:i + 2] = [xref_stack[i - 1] + xref_stack[i + 1]]

                # Expand citations to include brackets (on both sides)
                for i in xrange(len(xref_stack) - 2, 0, -2):
                    before = text_stack[i - 1].strip()[-1:]
                    text = text_stack[i].strip()
                    after = text_stack[i + 1].strip()[:1]
                    if len(before) > 0 and before in '({[' and len(after) > 0 and after in ')}]':
                        text_stack[i - 1] = re.sub(r'[({[](\s*)$', r'\1', text_stack[i - 1])
                        text_stack[i + 1] = re.sub(r'^(\s*)[)}\]]', r'\1', text_stack[i + 1])
                        text_stack[i] = before + text_stack[i] + after

                for i in xrange(1, len(xref_stack), 2):
                    # Get context
                    before = u' '.join(text_stack[:i]).strip()
                    label = text_stack[i].strip()
                    after = u' '.join(text_stack[i + 1:]).strip()
                    # Strip out extraneous brackets
                    if len(xref_stack[i]) > 1:
                        # Hack to differentiate single / multiple citations,
                        # as multiple numbers tend not to have spaces between them
                        label = re.sub(ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,])\s?[({[]?', r'\1', label)
                    else:
                        label = re.sub(ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,]\s?)[({[]?', r'\1', label)
                    # Normalise context
                    before = re.sub(r'\s+', ' ', before)[-max_length:].strip()
                    label = re.sub(r'\s+', ' ', label)
                    after = re.sub(r'\s+', ' ', after)[:max_length].strip()
                    if len(before + after) > min_length:
                        for xref in xref_stack[i]:
                            xref['contexts'].append((before, label, after))

            ###################################################################
            # Parse tables if present

            info['tables'] = {}
            for table_url in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' table-inline ')]/div[contains(concat(' ', normalize-space(@class), ' '), ' callout ')]//li[1]/a/@href"):
                table_url = urlparse.urljoin(citation_fulltext_html_url, table_url)
                response = urllib2.urlopen(table_url, timeout=8)
                table_html = etree.parse(response, parser)
                for table_expansion in table_html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' table-expansion ')]"):
                    id = table_expansion.get('id')
                    table = {}
                    table['xml'] = table_expansion.xpath('.//table[1]')[0]
                    table['caption_raw'] = table_expansion.xpath(".//span[contains(concat(' ', normalize-space(@class), ' '), ' caption-title ')][1]")[0]
                    if 'caption' not in table and 'caption_raw' in table:
                        table['caption'] = table['caption_raw']
                    if 'caption' in table:
                        table['caption'] = re.sub(r'\s+', ' ', etree.tostring(table['caption'], method='text', encoding=unicode).strip())
                    if 'xml' in table:
                        table['xml'] = etree.tostring(table['xml'], encoding='utf8')
                    info['tables'][id] = table

        if info is not None and len(info) > 0:
            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True, recover=True,
                                     remove_blank_text=True, encoding='utf8')
            pmids = dict(((citation['pmid'], citation['id'])
                          for citation in info['citations']
                          if 'pmid' in citation and 'id' in citation))
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(utopialib.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info['citations_by_id'][pmids[pmid]]
                        for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                            id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id

            # Generate sensible titles / descriptions / icons?
            journalTitle = info.get('publication-title', '')
            journalTitleSuffix = ''
            publisher = info.get('publisher', 'the publisher')
            if len(journalTitle) > 0:
                journalTitleSuffix = ' ({0})'.format(journalTitle)

            # Create Metadata link annotation
            link = document.newAccList('metadata', 90)
            link['property:sourceIcon'] = ''
            link['property:sourceTitle'] = publisher
            link['property:sourceDescription'] = '''
                <p>This information was provided by {0}{1}.</p>
            '''.format(publisher, journalTitleSuffix)

            # Create Metadata annotation
            annotation = utopialib.utils.citation_to_annotation(info.get('self', {}), 'DocumentMetadata')
            document.addAnnotation(annotation, link['scratch'])

            # Create Bibliography annotations
            for citation in info.get('citations', []):
                annotation = utopialib.utils.citation_to_annotation(citation)
                document.addAnnotation(annotation, link['scratch'])

            ###################################################################
            # Apply parsed data to document

            # Citations: find cross refs
            for citation in info['citations']:
                for pre, label, post in citation.get('contexts', []):
                    matches = document.findInContext(pre, label, post)
                    if len(matches) > 0:
                        annotation = utopialib.utils.citation_to_annotation(citation, concept='ForwardCitation')
                        if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                            citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                        if 'pmcid' in citation:
                            citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                        for match in matches:
                            annotation.addExtent(match)
                        document.addAnnotation(annotation, link['scratch'])

            # Tables (converting OASIS markup to XHTML where needed)
            for id, table in info.get('tables', {}).iteritems():
                if 'caption' in table and 'xml' in table:
                    regex = fuzz(table['caption'], strict=True)

                    # convert oasis tables
                    ns = {'oasis': 'http://docs.oasis-open.org/ns/oasis-exchange/table'}
                    xml = etree.fromstring(table['xml'])
                    if xml.tag == '{{{0}}}table'.format(ns['oasis']):
                        for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns):
                            columns = {}
                            for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns):
                                columns[colspec.get('colname')] = int(colspec.get('colnum'))
                            for section in tgroup.xpath('./oasis:thead|./oasis:tbody', namespaces=ns):
                                isHead = (section.tag == '{{{0}}}thead'.format(ns['oasis']))
                                for row in section.xpath('./oasis:row', namespaces=ns):
                                    for entry in row.xpath('./oasis:entry', namespaces=ns):
                                        colname = entry.get('colname')
                                        colst = entry.get('namest')
                                        colend = entry.get('nameend')
                                        if colst is not None and colend is not None:
                                            colspan = columns[colend] - columns[colst] + 1
                                        else:
                                            colspan = 1
                                        if colspan > 1:
                                            entry.set('colspan', unicode(colspan))
                                        morerows = entry.get('morerows')
                                        if morerows is not None:
                                            rowspan = int(morerows) + 1
                                        else:
                                            rowspan = 1
                                        if rowspan > 1:
                                            entry.set('rowspan', unicode(rowspan))
                                        entry.tag = 'td'
                                    row.tag = 'tr'
                                if isHead:
                                    section.tag = 'thead'
                                else:
                                    section.tag = 'tbody'
                                xml.append(section)
                        xml.tag = 'table'
                        table['xml'] = etree.tostring(xml, encoding='utf8')

                    matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation['concept'] = 'Table'
                        annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                        annotation['session:volatile'] = '1'
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link['scratch'])
                    else:
                        print '*********** failed to match table:', id
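# query() is another helper not shown here. Its call site above suggests it
# maps field names to the first XPath hit on an element; a sketch under that
# assumption ('etree' keeping the matched element rather than text is also
# assumed):
def query(element, xpaths):
    result = {}
    for field, xpath in xpaths.items():
        hits = element.xpath(xpath)
        if len(hits) > 0:
            hit = hits[0]
            # lxml returns 'smart strings' for text()/@attr results
            result[field] = hit.strip() if isinstance(hit, basestring) else hit
    return result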
def on_ready_event(self, document):
    # See if we have any publishers' NLM hosted for this DOI
    doi = common.utils.metadata(document, 'doi')
    if doi is not None:
        info = None
        try:
            url = 'https://utopia.cs.manchester.ac.uk/ext/hosted/nlm?'
            url += urllib.urlencode({'doi': doi.encode('utf8')})
            nlm = urllib2.urlopen(url, timeout=8).read()
            info = common.nlm.parse(nlm)
        except (urllib2.URLError, socket.timeout):
            # info will remain None
            pass

        if info is not None and len(info) > 0:
            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True, recover=True,
                                     remove_blank_text=True, encoding='utf8')
            pmids = dict(((citation['pmid'], citation['id'])
                          for citation in info['citations']
                          if 'pmid' in citation and 'id' in citation))
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info['citations_by_id'][pmids[pmid]]
                        for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                            id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id

            # Generate sensible titles / descriptions / icons?
            journalTitle = info.get('publication-title', '')
            journalTitleSuffix = ''
            publisher = info.get('publisher', 'the publisher')
            if len(journalTitle) > 0:
                journalTitleSuffix = ' ({0})'.format(journalTitle)

            # Create Metadata link annotation
            link = document.newAccList('metadata', 100)
            link['property:sourceIcon'] = ''
            link['property:sourceTitle'] = publisher
            link['property:sourceDescription'] = '''
                <p>This information was provided by {0}{1}.</p>
            '''.format(publisher, journalTitleSuffix)

            # Publisher identity
            if doi[:8] in ('10.1104/', '10.1105/'):
                annotation = spineapi.Annotation()
                annotation['concept'] = 'PublisherIdentity'
                logo = utopia.get_plugin_data_as_url('images/aspb_logo.png', 'image/png')
                webpageUrl = 'http://www.aspb.org/'
                title = publisher
                if doi.startswith('10.1104/'):
                    logo = utopia.get_plugin_data_as_url('images/pp_logo.png', 'image/png')
                    title = journalTitle
                    webpageUrl = 'http://www.plantphysiol.org/'
                elif doi.startswith('10.1105/'):
                    logo = utopia.get_plugin_data_as_url('images/tpc_logo.png', 'image/png')
                    title = journalTitle
                    webpageUrl = 'http://www.plantcell.org/'
                annotation['property:logo'] = logo
                annotation['property:title'] = title
                annotation['property:webpageUrl'] = webpageUrl
                document.addAnnotation(annotation, 'PublisherMetadata')
                link['property:sourceIcon'] = logo
                link['property:sourceTitle'] = title

            # Create Metadata annotation
            annotation = spineapi.Annotation()
            annotation['concept'] = 'DocumentMetadata'
            for k in self.keys:
                v = info.get(k)
                if v is not None:
                    annotation['property:{0}'.format(k)] = v
            document.addAnnotation(annotation, link['scratch'])

            # Create Bibliography annotations
            for citation in info.get('citations', []):
                annotation = spineapi.Annotation()
                annotation['concept'] = 'DocumentReference'
                for k in self.keys:
                    v = citation.get(k)
                    if v is not None:
                        annotation['property:{0}'.format(k)] = v
                document.addAnnotation(annotation, link['scratch'])

            ###################################################################
            # Apply parsed data to document

            # Citations: find cross refs
            for citation in info['citations']:
                for pre, label, post in citation.get('contexts', []):
                    matches = document.findInContext(pre, label, post)
                    if len(matches) > 0:
                        annotation = spineapi.Annotation()
                        annotation['concept'] = 'ForwardCitation'
                        annotation['property:state'] = 'found'
                        if 'title' in citation:
                            annotation['property:title'] = citation['title']
                        if 'id' in citation:
                            annotation['property:bibid'] = citation['id']
                        if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                            citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                        if 'pmcid' in citation:
                            citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                        for k in ('displayText', 'label', 'pdf', 'pmid', 'pmc', 'pii', 'doi', 'first_author_surname', 'year', 'journal', 'volume', 'page_from'):
                            if k in citation:
                                annotation['property:{0}'.format(k)] = citation[k]
                        for match in matches:
                            annotation.addExtent(match)
                        document.addAnnotation(annotation, link['scratch'])

            # Tables (converting OASIS markup to XHTML where needed)
            for id, table in info.get('tables', {}).iteritems():
                if 'caption' in table and 'xml' in table:
                    regex = fuzz(table['caption'], strict=True)

                    # convert oasis tables
                    ns = {'oasis': 'http://docs.oasis-open.org/ns/oasis-exchange/table'}
                    xml = etree.fromstring(table['xml'])
                    if xml.tag == '{{{0}}}table'.format(ns['oasis']):
                        for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns):
                            columns = {}
                            for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns):
                                columns[colspec.get('colname')] = int(colspec.get('colnum'))
                            for section in tgroup.xpath('./oasis:thead|./oasis:tbody', namespaces=ns):
                                isHead = (section.tag == '{{{0}}}thead'.format(ns['oasis']))
                                for row in section.xpath('./oasis:row', namespaces=ns):
                                    for entry in row.xpath('./oasis:entry', namespaces=ns):
                                        colname = entry.get('colname')
                                        colst = entry.get('namest')
                                        colend = entry.get('nameend')
                                        if colst is not None and colend is not None:
                                            colspan = columns[colend] - columns[colst] + 1
                                        else:
                                            colspan = 1
                                        if colspan > 1:
                                            entry.set('colspan', unicode(colspan))
                                        morerows = entry.get('morerows')
                                        if morerows is not None:
                                            rowspan = int(morerows) + 1
                                        else:
                                            rowspan = 1
                                        if rowspan > 1:
                                            entry.set('rowspan', unicode(rowspan))
                                        entry.tag = 'td'
                                    row.tag = 'tr'
                                if isHead:
                                    section.tag = 'thead'
                                else:
                                    section.tag = 'tbody'
                                xml.append(section)
                        xml.tag = 'table'
                        table['xml'] = etree.tostring(xml, encoding='utf8')

                    matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation['concept'] = 'Table'
                        annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                        annotation['session:volatile'] = '1'
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link['scratch'])
                    else:
                        print '*********** failed to match table:', id
def on_ready_event(self, document):
    info = utopialib.nlm.parse(utopialib.utils.metadata(document, 'raw_pmc_nlm'))
    if info is not None and len(info) > 0:
        # Enrich citation information with identifiers from PMC
        parser = etree.XMLParser(ns_clean=True, recover=True,
                                 remove_blank_text=True, encoding='utf8')
        pmids = dict(((citation['pmid'], citation['id'])
                      for citation in info['citations']
                      if 'pmid' in citation and 'id' in citation))
        if len(pmids) > 0:
            pubmed_abstracts = etree.fromstring(utopialib.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
            for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                if pmid in pmids:
                    citation = info['citations_by_id'][pmids[pmid]]
                    for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                        id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                        if key_name not in citation and id is not None:
                            citation[key_name] = id

        # Create Metadata link annotation
        link = document.newAccList('metadata', 50)
        link['property:sourceDatabase'] = 'pmc'
        link['property:sourceTitle'] = 'PubMed Central'
        link['property:sourceDescription'] = '<p><a href="http://www.ncbi.nlm.nih.gov/pmc/">PubMed Central</a> is the U.S. National Institutes of Health (NIH) digital archive of biomedical and life sciences journal literature.</p>'

        # Create Metadata annotation
        annotation = utopialib.utils.citation_to_annotation(info.get('self', {}), 'DocumentMetadata')
        document.addAnnotation(annotation, link['scratch'])

        # Create Bibliography annotations
        for citation in info.get('citations', []):
            annotation = utopialib.utils.citation_to_annotation(citation)
            document.addAnnotation(annotation, link['scratch'])

        # Citations: find cross refs
        for citation in info['citations']:
            for pre, label, post in citation.get('contexts', []):
                matches = document.findInContext(pre, label, post)
                if len(matches) > 0:
                    annotation = utopialib.utils.citation_to_annotation(citation, concept='ForwardCitation')
                    if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                        citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                    if 'pmcid' in citation:
                        citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                    for match in matches:
                        annotation.addExtent(match)
                    document.addAnnotation(annotation, link['scratch'])

        # Tables
        for id, table in info.get('tables', {}).iteritems():
            if 'caption' in table and 'xml' in table:
                regex = fuzz(table['caption'], strict=True)
                matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                if len(matches) == 1:
                    annotation = spineapi.Annotation()
                    annotation['concept'] = 'Table'
                    annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                    annotation.addExtent(matches[0])
                    document.addAnnotation(annotation, link['scratch'])
                else:
                    print '*********** failed to match table:', id
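# The enrichment step above walks EFetch abstract XML of roughly this shape
# (trimmed; identifier values invented):
#
#   <PubmedArticleSet>
#     <PubmedArticle>
#       <PubmedData>
#         <ArticleIdList>
#           <ArticleId IdType="pubmed">12345678</ArticleId>
#           <ArticleId IdType="doi">10.1000/xyz123</ArticleId>
#           <ArticleId IdType="pmc">PMC1234567</ArticleId>
#         </ArticleIdList>
#       </PubmedData>
#     </PubmedArticle>
#   </PubmedArticleSet>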
def on_ready_event(self, document):
    # Only send if the DOI has a Springer prefix
    doi = utopialib.utils.metadata(document, 'identifiers[doi]')
    if doi is not None and doi[:7] in registrants:
        annotation = spineapi.Annotation()
        annotation['concept'] = 'PublisherIdentity'
        if False and doi.startswith('10.1186/'):  # This turns out not to be reliable
            annotation['property:logo'] = utopia.get_plugin_data_as_url('images/gigascience_logo.png', 'image/png')
            annotation['property:title'] = 'Giga Science'
            annotation['property:webpageUrl'] = 'http://www.gigasciencejournal.com/'
        else:
            annotation['property:logo'] = utopia.get_plugin_data_as_url('images/logo.png', 'image/png')
            annotation['property:title'] = 'Springer'
            annotation['property:webpageUrl'] = 'http://www.springer.com/'
        document.addAnnotation(annotation, 'PublisherMetadata')

        # Make a request to the utopia ext web service
        url = 'https://utopia.cs.manchester.ac.uk/ext/springer/nlm?{0}'
        url = url.format(urllib.urlencode({'doi': doi}))
        try:
            nlm = urllib2.urlopen(url, timeout=8).read()
        except (urllib2.URLError, socket.timeout):
            return

        info = utopialib.nlm.parse(nlm)
        if info is not None and len(info) > 0:
            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True, recover=True,
                                     remove_blank_text=True, encoding='utf8')
            pmids = dict(((citation['pmid'], citation['id'])
                          for citation in info['citations']
                          if 'pmid' in citation and 'id' in citation))
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(utopialib.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info['citations_by_id'][pmids[pmid]]
                        for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                            id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id

            # Create Metadata link annotation
            link = document.newAccList('metadata', 100)
            link['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/annotation_icon.png', 'image/png')
            link['property:sourceTitle'] = 'Springer'
            link['property:sourceDescription'] = '''
                <p><a href="http://www.springer.com/">Springer</a> publishing company.</p>
            '''

            # Create Metadata annotation
            annotation = utopialib.utils.citation_to_annotation(info.get('self', {}), 'DocumentMetadata')
            document.addAnnotation(annotation, link['scratch'])

            # Create Bibliography annotations
            for citation in info.get('citations', []):
                annotation = utopialib.utils.citation_to_annotation(citation)
                document.addAnnotation(annotation, link['scratch'])

            ###################################################################
            # Apply parsed data to document

            # Citations: find cross refs
            for citation in info['citations']:
                for pre, label, post in citation.get('contexts', []):
                    matches = document.findInContext(pre, label, post)
                    if len(matches) > 0:
                        annotation = utopialib.utils.citation_to_annotation(citation, concept='ForwardCitation')
                        if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                            citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                        if 'pmcid' in citation:
                            citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                        for match in matches:
                            annotation.addExtent(match)
                        document.addAnnotation(annotation, link['scratch'])

            # Tables
            for id, table in info.get('tables', {}).iteritems():
                if 'caption' in table and 'xml' in table:
                    regex = fuzz(table['caption'], strict=True)
                    matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation['concept'] = 'Table'
                        annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                        annotation['session:volatile'] = '1'
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link['scratch'])
                    else:
                        print '*********** failed to match table:', id
def on_ready_event(self, document):
    doi = utopia.tools.utils.metadata(document, 'identifiers[doi]')
    if doi is not None:
        match = self.splitRegEx.match(doi)
        if match is not None:
            articleNumber = match.group('number')
            annotation = spineapi.Annotation()
            annotation['concept'] = 'PublisherIdentity'
            annotation['property:logo'] = utopia.get_plugin_data_as_url('images/logo.png', 'image/png')
            annotation['property:title'] = 'eLife'
            annotation['property:webpageUrl'] = 'http://www.elifesciences.org/'
            document.addAnnotation(annotation, 'PublisherMetadata')

            # Turn all the DOIs that are sub-DOIs of this document into links
            regex = r'{0}\.\d+'.format(re.escape(doi))
            for match in document.search(regex, spineapi.RegExp):
                url = 'http://dx.doi.org/{0}'.format(match.text())
                annotation = spineapi.Annotation()
                annotation['concept'] = 'Hyperlink'
                annotation['property:webpageUrl'] = url
                annotation['session:volatile'] = '1'
                annotation.addExtent(match)
                document.addAnnotation(annotation)

            # Try to get the NLM directly from eLife
            url = 'http://elife.elifesciences.org/elife-source-xml/10.7554/eLife.{0}'
            url = url.format(articleNumber)
            try:
                nlm = urllib2.urlopen(url, timeout=8).read()
            except (urllib2.URLError, socket.timeout):
                return

            info = utopia.tools.nlm.parse(nlm)
            if info is not None and len(info) > 0:
                # Enrich citation information with identifiers from PMC
                parser = etree.XMLParser(ns_clean=True, recover=True,
                                         remove_blank_text=True, encoding='utf8')
                pmids = dict(((citation['pmid'], citation['id'])
                              for citation in info['citations']
                              if 'pmid' in citation and 'id' in citation))
                if len(pmids) > 0:
                    pubmed_abstracts = etree.fromstring(utopia.tools.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                    for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                        pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                        if pmid in pmids:
                            citation = info['citations_by_id'][pmids[pmid]]
                            for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                                id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                                if key_name not in citation and id is not None:
                                    citation[key_name] = id

                # Create Metadata link annotation
                link = document.newAccList('metadata', 100)
                link['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/annotation_icon.png', 'image/png')
                link['property:sourceTitle'] = 'eLife'
                link['property:sourceDescription'] = '''
                    <p>The <a href="http://www.elifesciences.org/">eLife</a> open access publishing platform.</p>
                '''

                # Create Metadata annotation
                annotation = utopia.tools.utils.citation_to_annotation(info.get('self', {}), 'DocumentMetadata')
                document.addAnnotation(annotation, link['scratch'])

                # Create Bibliography annotations
                for citation in info.get('citations', []):
                    annotation = utopia.tools.utils.citation_to_annotation(citation)
                    document.addAnnotation(annotation, link['scratch'])

                ###############################################################
                # Apply parsed data to document

                # Citations: find cross refs
                for citation in info['citations']:
                    for pre, label, post in citation.get('contexts', []):
                        matches = document.findInContext(pre, label, post)
                        if len(matches) > 0:
                            annotation = utopia.tools.utils.citation_to_annotation(citation, concept='ForwardCitation')
                            if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                            if 'pmcid' in citation:
                                citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                            for match in matches:
                                annotation.addExtent(match)
                            document.addAnnotation(annotation, link['scratch'])

                # Tables
                for id, table in info.get('tables', {}).iteritems():
                    if 'caption' in table and 'xml' in table:
                        regex = fuzz(table['caption'], strict=True)
                        matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) == 1:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Table'
                            annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                            annotation['session:volatile'] = '1'
                            annotation.addExtent(matches[0])
                            document.addAnnotation(annotation, link['scratch'])
                        else:
                            print '*********** failed to match table:', id
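# How the sub-DOI linking above behaves: component DOIs (figures, videos,
# supplements) extend the article DOI with a numeric suffix. The DOI values
# here are illustrative.
import re
doi = '10.7554/eLife.00013'
regex = r'{0}\.\d+'.format(re.escape(doi))
assert re.match(regex, '10.7554/eLife.00013.005')
assert not re.match(regex, '10.7554/eLife.00014')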
def on_ready_event(self, document):
    volume, page = None, None

    # Only send if the DOI has a Portland prefix
    doi = common.utils.metadata(document, 'doi')
    if doi is not None and doi[:7] in registrants:
        crossref_unixref = common.utils.metadata(document, 'raw_crossref_unixref')
        if crossref_unixref is not None:
            # Parse CrossRef redirect URL
            dom = etree.fromstring(crossref_unixref.encode('utf8'))
            resource = dom.findtext('doi_record/crossref/journal/journal_article/doi_data/resource')
            if resource is not None:
                match = self.resourceRegExp.match(resource)
                if match is not None:
                    volume, page = match.groups()

    ### FIXME What information should be shown? Portland? BJ?
    #annotation = spineapi.Annotation()
    #annotation['concept'] = 'PublisherIdentity'
    #annotation['property:logo'] = utopia.get_plugin_data_as_url('images/logo.png', 'image/png')
    #annotation['property:title'] = 'Portland Press Limited'
    #annotation['property:webpageUrl'] = 'http://www.portlandpress.com/'
    #document.addAnnotation(annotation, 'PublisherMetadata')

    # If this document was resolved, off we go to fetch the NLM
    if None not in (volume, page):
        # Make a request to the utopia ext web service
        url = 'https://utopia.cs.manchester.ac.uk/ext/portland/nlm?{0}'
        url = url.format(urllib.urlencode({'volume': volume, 'page': page}))
        try:
            nlm = urllib2.urlopen(url, timeout=8).read()
        except (urllib2.URLError, socket.timeout):
            return

        info = common.nlm.parse(nlm)
        if info is not None and len(info) > 0:
            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8')
            pmids = dict((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation)
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                    #print etree.tostring(idList)
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info['citations_by_id'][pmids[pmid]]
                        for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                            id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id
                                #print 'KEY', key_name, id

            # Create Metadata link annotation
            link = document.newAccList('metadata', 100)
            link['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/biochemj.png', 'image/png')
            link['property:sourceTitle'] = 'Portland'
            link['property:sourceDescription'] = '''
                <p><a href="http://www.portlandpress.com/">Portland Press Limited</a>.</p>
            '''

            # Create Metadata annotation
            annotation = spineapi.Annotation()
            annotation['concept'] = 'DocumentMetadata'
            for k in self.keys:
                v = info.get(k)
                if v is not None:
                    annotation['property:{0}'.format(k)] = v
            document.addAnnotation(annotation, link['scratch'])

            # Create Bibliography annotations
            for citation in info.get('citations', []):
                annotation = spineapi.Annotation()
                annotation['concept'] = 'DocumentReference'
                for k in self.keys:
                    v = citation.get(k)
                    if v is not None:
                        annotation['property:{0}'.format(k)] = v
                document.addAnnotation(annotation, link['scratch'])

            #######################################################################################
            # Apply parsed data to document

            # Citations
            for citation in info['citations']:
                # Find cross refs
                for pre, label, post in citation.get('contexts', []):
                    matches = document.findInContext(pre, label, post)
                    #print matches
                    if len(matches) > 0:
                        try:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'ForwardCitation'
                            annotation['property:state'] = 'found'
                            if 'title' in citation:
                                annotation['property:title'] = citation['title']
                            if 'id' in citation:
                                annotation['property:bibid'] = citation['id']
                            if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                            if 'pmcid' in citation:
                                citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                            for k in ('displayText', 'label', 'pdf', 'pmid', 'pmc', 'pii', 'doi', 'first_author_surname', 'year', 'journal', 'volume', 'page_from'):
                                if k in citation:
                                    annotation['property:{0}'.format(k)] = citation[k]
                            #print annotation.get('property:label'), annotation.get('property:pdf')
                            for match in matches:
                                annotation.addExtent(match)
                            document.addAnnotation(annotation, link['scratch'])
                            #print citation
                        except:
                            raise  # FIXME

            # Tables
            for id, table in info.get('tables', {}).iteritems():
                if 'caption' in table and 'xml' in table:
                    regex = fuzz(table['caption'], strict=True)
                    #print regex
                    matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation['concept'] = 'Table'
                        annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                        annotation['session:volatile'] = '1'
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link['scratch'])
                    else:
                        print '*********** failed to match table:', id
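# -- Illustrative sketch -------------------------------------------------------
# self.resourceRegExp is defined elsewhere in this plugin; the code above only
# needs it to yield (volume, page) groups from the CrossRef resource URL. A
# hypothetical pattern of that shape (the URL layout below is invented purely
# for illustration, not Portland's real scheme):
import re
resourceRegExp = re.compile(r'.*/content/(\d+)/(\d+)$')
match = resourceRegExp.match('http://www.biochemj.org/content/430/1')
if match is not None:
    volume, page = match.groups()  # ('430', '1')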
def on_ready_event(self, document):
    # See if we have any publishers' NLM hosted for this DOI
    doi = utopialib.utils.metadata(document, 'identifiers[doi]')
    #print '----- DOI', doi
    if doi is not None:
        info = None
        try:
            url = 'https://utopia.cs.manchester.ac.uk/ext/hosted/nlm?'
            url += urllib.urlencode({'doi': doi.encode('utf8')})
            nlm = urllib2.urlopen(url, timeout=8).read()
            info = utopialib.nlm.parse(nlm)
        except (urllib2.URLError, socket.timeout):
            # info will remain None
            pass
        #print info

        if info is not None and len(info) > 0:
            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8')
            pmids = dict((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation)
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(utopialib.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                    #print etree.tostring(idList)
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info['citations_by_id'][pmids[pmid]]
                        for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                            id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id
                                #print 'KEY', key_name, id

            # Generate sensible titles / descriptions / icons?
            journalTitle = info.get('publication-title', '')
            journalTitleSuffix = ''
            publisher = info.get('publisher', 'the publisher')
            if len(journalTitle) > 0:
                journalTitleSuffix = ' ({0})'.format(journalTitle)

            # Create Metadata link annotation
            link = document.newAccList('metadata', 100)
            link['property:sourceIcon'] = ''
            link['property:sourceTitle'] = publisher
            link['property:sourceDescription'] = '''
                <p>This information was provided by {0}{1}.</p>
            '''.format(publisher, journalTitleSuffix)

            # Publisher identity
            if doi[:8] in ('10.1104/', '10.1105/'):
                annotation = spineapi.Annotation()
                annotation['concept'] = 'PublisherIdentity'
                logo = utopia.get_plugin_data_as_url('images/aspb_logo.png', 'image/png')
                webpageUrl = 'http://www.aspb.org/'
                title = publisher
                #print '====', publisher, '---', journalTitle, '---', webpageUrl
                if doi.startswith('10.1104/'):
                    logo = utopia.get_plugin_data_as_url('images/pp_logo.png', 'image/png')
                    title = journalTitle
                    webpageUrl = 'http://www.plantphysiol.org/'
                elif doi.startswith('10.1105/'):
                    logo = utopia.get_plugin_data_as_url('images/tpc_logo.png', 'image/png')
                    title = journalTitle
                    webpageUrl = 'http://www.plantcell.org/'
                annotation['property:logo'] = logo
                annotation['property:title'] = title
                annotation['property:webpageUrl'] = webpageUrl
                document.addAnnotation(annotation, 'PublisherMetadata')

                link['property:sourceIcon'] = logo
                link['property:sourceTitle'] = title

            # Create Metadata annotation
            annotation = utopialib.utils.citation_to_annotation(info.get('self', {}), 'DocumentMetadata')
            document.addAnnotation(annotation, link['scratch'])

            # Create Bibliography annotations
            for citation in info.get('citations', []):
                annotation = utopialib.utils.citation_to_annotation(citation)
                document.addAnnotation(annotation, link['scratch'])

            #######################################################################################
            # Apply parsed data to document

            # Citations
            for citation in info['citations']:
                # Find cross refs
                for pre, label, post in citation.get('contexts', []):
                    matches = document.findInContext(pre, label, post)
                    #print matches
                    if len(matches) > 0:
                        try:
                            annotation = utopialib.utils.citation_to_annotation(citation, concept='ForwardCitation')
                            if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                            if 'pmcid' in citation:
                                citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                            for match in matches:
                                annotation.addExtent(match)
                            document.addAnnotation(annotation, link['scratch'])
                        except:
                            raise  # FIXME

            # Tables
            for id, table in info.get('tables', {}).iteritems():
                if 'caption' in table and 'xml' in table:
                    regex = fuzz(table['caption'], strict=True)
                    #print regex

                    # convert oasis tables
                    ns = {'oasis': 'http://docs.oasis-open.org/ns/oasis-exchange/table'}
                    xml = etree.fromstring(table['xml'])
                    if xml.tag == '{{{0}}}table'.format(ns['oasis']):
                        for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns):
                            columns = {}
                            for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns):
                                columns[colspec.get('colname')] = int(colspec.get('colnum'))
                            for section in tgroup.xpath('./oasis:thead|./oasis:tbody', namespaces=ns):
                                isHead = (section.tag == '{{{0}}}thead'.format(ns['oasis']))
                                for row in section.xpath('./oasis:row', namespaces=ns):
                                    for entry in row.xpath('./oasis:entry', namespaces=ns):
                                        colname = entry.get('colname')
                                        colst = entry.get('namest')
                                        colend = entry.get('nameend')
                                        if colst is not None and colend is not None:
                                            colspan = columns[colend] - columns[colst] + 1
                                        else:
                                            colspan = 1
                                        if colspan > 1:
                                            entry.set('colspan', unicode(colspan))
                                        morerows = entry.get('morerows')
                                        if morerows is not None:
                                            rowspan = int(morerows) + 1
                                        else:
                                            rowspan = 1
                                        if rowspan > 1:
                                            entry.set('rowspan', unicode(rowspan))
                                        entry.tag = 'td'
                                    row.tag = 'tr'
                                if isHead:
                                    section.tag = 'thead'
                                else:
                                    section.tag = 'tbody'
                                xml.append(section)
                        xml.tag = 'table'
                        #print etree.tostring(xml, pretty_print=True, encoding='utf8')
                        table['xml'] = etree.tostring(xml, encoding='utf8')

                    matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation['concept'] = 'Table'
                        annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                        annotation['session:volatile'] = '1'
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link['scratch'])
                    else:
                        print '*********** failed to match table:', id
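# -- Illustrative sketch -------------------------------------------------------
# The OASIS span arithmetic above in isolation: an entry running from colspec
# 'c1' to colspec 'c2' gets colspan 2, and morerows="1" means the cell covers
# two rows. A tiny self-contained demonstration of the same mapping:
from lxml import etree

OASIS = 'http://docs.oasis-open.org/ns/oasis-exchange/table'
src = ('<table xmlns="{0}"><tgroup cols="2">'
       '<colspec colname="c1" colnum="1"/><colspec colname="c2" colnum="2"/>'
       '<tbody><row><entry namest="c1" nameend="c2" morerows="1">cell</entry></row></tbody>'
       '</tgroup></table>').format(OASIS)
xml = etree.fromstring(src)
ns = {'oasis': OASIS}
columns = dict((c.get('colname'), int(c.get('colnum')))
               for c in xml.xpath('//oasis:colspec', namespaces=ns))
entry = xml.xpath('//oasis:entry', namespaces=ns)[0]
colspan = columns[entry.get('nameend')] - columns[entry.get('namest')] + 1  # 2
rowspan = int(entry.get('morerows')) + 1                                    # 2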
def on_ready_event(self, document):
    doi = common.utils.metadata(document, 'doi')
    if doi is not None:
        info = {}

        # Resolve the DOI to find the publisher's website
        response = urllib2.urlopen('http://dx.doi.org/{0}'.format(doi), timeout=8)

        # Parse page to find (if there) the full text URL
        parser = etree.HTMLParser()
        html = etree.parse(response, parser)

        # Only continue if this is a highwire HTML page
        if len(html.xpath("/html/head/meta[@name='HW.identifier']")) == 0:
            return

        # Now make sure we have the full text XHTML
        citation_fulltext_html_url = html.xpath("/html/head/meta[@name='citation_fulltext_html_url']/@content")
        if len(citation_fulltext_html_url) > 0:
            citation_fulltext_html_url = citation_fulltext_html_url[0]

            # Fetch that full text page (if different to the current one)
            if citation_fulltext_html_url != response.geturl():
                response = urllib2.urlopen(citation_fulltext_html_url, timeout=8)
                html = etree.parse(response, parser)
                #print etree.tostring(html, pretty_print=True, encoding='utf8')

            # Now parse out the bibliography
            info['citations'] = []
            info['citations_by_id'] = {}
            for bibitem in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' ref-list ')]//ol[contains(concat(' ', normalize-space(@class), ' '), ' cit-list ')]/li"):
                citation = query(bibitem, {
                    'id': 'a/@id',
                    'label': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' ref-label ')]/text()",
                    'title': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-article-title ')]/text()",
                    'year': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-date ')]/text()",
                    'publication-title': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-jnl-abbrev ')]/text()",
                    'volume': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-vol ')]/text()",
                    'issue': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-issue ')]/text()",
                    'pagefrom': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-fpage ')]/text()",
                    'pageto': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-lpage ')]/text()",
                    'pmid': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-pmid ')]/text()",
                    'doi': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-doi ')]/text()",
                    'etree': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-metadata ')]",
                })
                authors = []
                for a in bibitem.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-auth ')]"):
                    surname = a.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-surname ')]/text()")
                    given_names = a.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-given-names ')]/text()")
                    if len(surname) > 0 and len(given_names) > 0:
                        authors.append(u'{0}, {1}'.format(surname[0], given_names[0]).strip(', '))
                if len(authors) > 0:
                    citation['authors'] = authors
                citation['contexts'] = []
                citation['displayText'] = common.utils.format_citation(citation)
                info['citations'].append(citation)
                info['citations_by_id'][citation['id']] = citation
                #print citation

            #######################################################################################
            # Parse in-text citations if present

            min_length = 10
            max_length = 20
            for paragraph in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' article ')]//p"):
                text_stack = [paragraph.text or '']
                xref_stack = [None]
                for elem in paragraph:
                    if len(elem.xpath('self::a[@class="xref-bibr"]')) > 0:
                        text_stack.append(etree.tostring(elem, method='text', encoding=unicode, with_tail=False))
                        text_stack.append(elem.tail or '')
                        xref = info['citations_by_id'].get(elem.get('href', '')[1:])
                        if xref is not None:
                            xref_stack += [[xref], None]
                        else:
                            xref_stack += [[], None]
                    elif isinstance(elem, etree._Entity):
                        points = entities.get(elem.text[1:-1])
                        if points is not None:
                            text_stack[-1] += ''.join((unichr(p) for p in points))
                        else:
                            text_stack[-1] += etree.tostring(elem, encoding=unicode)
                    else:
                        if elem.get('position') == 'float':
                            text_stack[-1] += elem.tail or ''
                        else:
                            text_stack[-1] += etree.tostring(elem, method='text', encoding=unicode)

                # Find and collapse ranges in the text
                for i in xrange(len(xref_stack) - 3, 1, -2):
                    text = text_stack[i].strip()
                    #print len(text), (text in u'-\u2010\u2011\u2012\u2013\u2014\u2015'), u''.join(text_stack[i-1:i+2]).encode('utf8')
                    # if this text is a dash, we need to coalesce the text fragments
                    if len(text) == 1 and text in u'-\u2010\u2011\u2012\u2013\u2014\u2015':
                        text_stack[i-1:i+2] = [u''.join(text_stack[i-1:i+2])]
                        xref_stack[i-1:i+2] = [xref_stack[i-1] + xref_stack[i+1]]
                #for text in text_stack:
                #    print text.encode('utf8')

                # Then make sure we resolve the implied citations
                for i in xrange(1, len(xref_stack), 2):
                    # Get actual cross references
                    xrefs = xref_stack[i]
                    # Expand cross references
                    try:
                        if len(xrefs) == 2:
                            labelfrom = int(xrefs[0].get('label'))
                            labelto = int(xrefs[1].get('label'))
                            candidates = {}
                            midlabels = [unicode(midlabel) for midlabel in xrange(labelfrom+1, labelto)]
                            for candidate in info['citations']:
                                if candidate.get('label') in midlabels:
                                    candidates[int(candidate.get('label'))] = candidate
                            # Keep the expanded range in label order (a plain
                            # dict would yield them in arbitrary order)
                            xrefs[1:-1] = [candidates[label] for label in sorted(candidates)]
                    except:
                        raise

                # Find and collapse lists in the text
                for i in xrange(len(xref_stack) - 3, 1, -2):
                    text = text_stack[i].strip()
                    # if this text is a comma, we need to coalesce the text fragments
                    if len(text) == 1 and text == ',':
                        text_stack[i-1:i+2] = [u''.join(text_stack[i-1:i+2])]
                        xref_stack[i-1:i+2] = [xref_stack[i-1] + xref_stack[i+1]]

                # Expand citations to include brackets (on both sides)
                for i in xrange(len(xref_stack) - 2, 0, -2):
                    before = text_stack[i-1].strip()[-1:]
                    text = text_stack[i].strip()
                    after = text_stack[i+1].strip()[:1]
                    # if the citation is wrapped in brackets, absorb them into the label
                    #print before.encode('utf'), after.encode('utf')
                    if len(before) > 0 and before in '({[' and len(after) > 0 and after in ')}]':
                        text_stack[i-1] = re.sub(r'[({[](\s*)$', r'\1', text_stack[i-1])
                        text_stack[i+1] = re.sub(r'^(\s*)[)}\]]', r'\1', text_stack[i+1])
                        text_stack[i] = before + text_stack[i] + after
                #print repr(text_stack)

                for i in xrange(1, len(xref_stack), 2):
                    # Get context
                    before = u' '.join(text_stack[:i]).strip()
                    label = text_stack[i].strip()
                    after = u' '.join(text_stack[i+1:]).strip()
                    # Strip out extraneous brackets
                    if len(xref_stack[i]) > 1:
                        # Hack to differentiate single / multiple citations
                        # as multiple numbers tend not to have spaces between them
                        label = re.sub(ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,])\s?[({[]?', r'\1', label)
                    else:
                        label = re.sub(ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,]\s?)[({[]?', r'\1', label)
                    # Normalise context
                    before = re.sub(r'\s+', ' ', before)[-max_length:].strip()
                    label = re.sub(r'\s+', ' ', label)
                    after = re.sub(r'\s+', ' ', after)[:max_length].strip()
                    #print (before.encode('utf8'), label.encode('utf8'), after.encode('utf8'))
                    if len(before + after) > min_length:
                        for xref in xref_stack[i]:
                            xref['contexts'].append((before, label, after))
                            #print xref_stack[i]

            #######################################################################################
            # Parse tables if present
            info['tables'] = {}
            for table_url in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' table-inline ')]/div[contains(concat(' ', normalize-space(@class), ' '), ' callout ')]//li[1]/a/@href"):
                table_url = urlparse.urljoin(citation_fulltext_html_url, table_url)
                #print table_url
                response = urllib2.urlopen(table_url, timeout=8)
                table_html = etree.parse(response, parser)
                for table_expansion in table_html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' table-expansion ')]"):
                    id = table_expansion.get('id')
                    table = {}
                    table['xml'] = table_expansion.xpath('.//table[1]')[0]
                    table['caption_raw'] = table_expansion.xpath(".//span[contains(concat(' ', normalize-space(@class), ' '), ' caption-title ')][1]")[0]
                    if 'caption' not in table and 'caption_raw' in table:
                        table['caption'] = table['caption_raw']
                    if 'caption' in table:
                        table['caption'] = re.sub(r'\s+', ' ', etree.tostring(table['caption'], method='text', encoding=unicode).strip())
                    if 'xml' in table:
                        table['xml'] = etree.tostring(table['xml'], encoding='utf8')
                    info['tables'][id] = table
                    #print table

        #print info

        if info is not None and len(info) > 0:
            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8')
            pmids = dict((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation)
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                    #print etree.tostring(idList)
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info['citations_by_id'][pmids[pmid]]
                        for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                            id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id
                                #print 'KEY', key_name, id

            # Generate sensible titles / descriptions / icons?
            journalTitle = info.get('publication-title', '')
            journalTitleSuffix = ''
            publisher = info.get('publisher', 'the publisher')
            if len(journalTitle) > 0:
                journalTitleSuffix = ' ({0})'.format(journalTitle)

            # Create Metadata link annotation
            link = document.newAccList('metadata', 90)
            link['property:sourceIcon'] = ''
            link['property:sourceTitle'] = publisher
            link['property:sourceDescription'] = '''
                <p>This information was provided by {0}{1}.</p>
            '''.format(publisher, journalTitleSuffix)

            # Create Metadata annotation
            annotation = spineapi.Annotation()
            annotation['concept'] = 'DocumentMetadata'
            for k in self.keys:
                v = info.get(k)
                if v is not None:
                    annotation['property:{0}'.format(k)] = v
            document.addAnnotation(annotation, link['scratch'])

            # Create Bibliography annotations
            for citation in info.get('citations', []):
                annotation = spineapi.Annotation()
                annotation['concept'] = 'DocumentReference'
                for k in self.keys:
                    v = citation.get(k)
                    if v is not None:
                        annotation['property:{0}'.format(k)] = v
                document.addAnnotation(annotation, link['scratch'])

            #######################################################################################
            # Apply parsed data to document

            # Citations
            for citation in info['citations']:
                # Find cross refs
                for pre, label, post in citation.get('contexts', []):
                    #print (pre, label, post)
                    matches = document.findInContext(pre, label, post)
                    #print matches
                    if len(matches) > 0:
                        try:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'ForwardCitation'
                            annotation['property:state'] = 'found'
                            if 'title' in citation:
                                annotation['property:title'] = citation['title']
                            if 'id' in citation:
                                annotation['property:bibid'] = citation['id']
                            if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                            if 'pmcid' in citation:
                                citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                            for k in ('displayText', 'label', 'pdf', 'pmid', 'pmc', 'pii', 'doi', 'first_author_surname', 'year', 'journal', 'volume', 'page_from'):
                                if k in citation:
                                    annotation['property:{0}'.format(k)] = citation[k]
                            #print annotation.get('property:label'), annotation.get('property:pdf')
                            for match in matches:
                                annotation.addExtent(match)
                            document.addAnnotation(annotation, link['scratch'])
                            #print citation
                        except:
                            raise  # FIXME

            # Tables
            for id, table in info.get('tables', {}).iteritems():
                if 'caption' in table and 'xml' in table:
                    regex = fuzz(table['caption'], strict=True)
                    #print regex

                    # convert oasis tables
                    ns = {'oasis': 'http://docs.oasis-open.org/ns/oasis-exchange/table'}
                    xml = etree.fromstring(table['xml'])
                    if xml.tag == '{{{0}}}table'.format(ns['oasis']):
                        for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns):
                            columns = {}
                            for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns):
                                columns[colspec.get('colname')] = int(colspec.get('colnum'))
                            for section in tgroup.xpath('./oasis:thead|./oasis:tbody', namespaces=ns):
                                isHead = (section.tag == '{{{0}}}thead'.format(ns['oasis']))
                                for row in section.xpath('./oasis:row', namespaces=ns):
                                    for entry in row.xpath('./oasis:entry', namespaces=ns):
                                        colname = entry.get('colname')
                                        colst = entry.get('namest')
                                        colend = entry.get('nameend')
                                        if colst is not None and colend is not None:
                                            colspan = columns[colend] - columns[colst] + 1
                                        else:
                                            colspan = 1
                                        if colspan > 1:
                                            entry.set('colspan', unicode(colspan))
                                        morerows = entry.get('morerows')
                                        if morerows is not None:
                                            rowspan = int(morerows) + 1
                                        else:
                                            rowspan = 1
                                        if rowspan > 1:
                                            entry.set('rowspan', unicode(rowspan))
                                        entry.tag = 'td'
                                    row.tag = 'tr'
                                if isHead:
                                    section.tag = 'thead'
                                else:
                                    section.tag = 'tbody'
                                xml.append(section)
                        xml.tag = 'table'
                        #print etree.tostring(xml, pretty_print=True, encoding='utf8')
                        table['xml'] = etree.tostring(xml, encoding='utf8')

                    matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation['concept'] = 'Table'
                        annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                        annotation['session:volatile'] = '1'
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link['scratch'])
                    else:
                        print '*********** failed to match table:', id
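# -- Illustrative sketch -------------------------------------------------------
# query() above maps a dict of XPath expressions over each bibliography item
# and keeps the first hit for each key. Its real definition is elsewhere in the
# codebase; a minimal version consistent with how it is used above might be:
def query_sketch(element, xpaths):
    result = {}
    for key, xpath in xpaths.iteritems():
        found = element.xpath(xpath)
        if len(found) > 0:
            # text() expressions yield strings; element expressions (like the
            # 'etree' key above) yield the matched element itself
            result[key] = found[0]
    return result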
def on_ready_event(self, document):
    info = common.nlm.parse(common.utils.metadata(document, "raw_pmc_nlm"))
    if info is not None and len(info) > 0:
        # Enrich citation information with identifiers from PMC
        parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding="utf8")
        pmids = dict((citation["pmid"], citation["id"]) for citation in info["citations"] if "pmid" in citation and "id" in citation)
        if len(pmids) > 0:
            pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=",".join(pmids.keys()), retmode="xml", rettype="abstract"), parser)
            for idList in pubmed_abstracts.xpath("PubmedArticle/PubmedData/ArticleIdList"):
                #print etree.tostring(idList)
                pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                if pmid in pmids:
                    citation = info["citations_by_id"][pmids[pmid]]
                    for key_name, id_name in (("doi", "doi"), ("pmcid", "pmc"), ("pii", "pii")):
                        id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                        if key_name not in citation and id is not None:
                            citation[key_name] = id

        # Create Metadata link annotation
        link = document.newAccList("metadata", 50)
        link["property:sourceDatabase"] = "pmc"
        link["property:sourceTitle"] = "PubMed Central"
        link["property:sourceDescription"] = '<p><a href="http://www.ncbi.nlm.nih.gov/pmc/">PubMed Central</a> is the U.S. National Institutes of Health (NIH) digital archive of biomedical and life sciences journal literature.</p>'

        # Create Metadata annotation
        annotation = spineapi.Annotation()
        annotation["concept"] = "DocumentMetadata"
        for k in self.keys:
            v = info.get(k)
            if v is not None:
                annotation["property:{0}".format(k)] = v
        document.addAnnotation(annotation, link["scratch"])

        # Create Bibliography annotations
        for citation in info.get("citations", []):
            annotation = spineapi.Annotation()
            annotation["concept"] = "DocumentReference"
            for k in self.keys:
                v = citation.get(k)
                if v is not None:
                    annotation["property:{0}".format(k)] = v
            document.addAnnotation(annotation, link["scratch"])

        # Citations
        for citation in info["citations"]:
            # Find cross refs
            for pre, label, post in citation.get("contexts", []):
                matches = document.findInContext(pre, label, post)
                #print matches
                if len(matches) > 0:
                    try:
                        annotation = spineapi.Annotation()
                        annotation["concept"] = "ForwardCitation"
                        annotation["property:state"] = "found"
                        if "title" in citation:
                            annotation["property:title"] = citation["title"]
                        if "id" in citation:
                            annotation["property:bibid"] = citation["id"]
                        if "doi" in citation and citation["doi"].startswith("10.1371/"):
                            citation["pdf"] = "http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF".format("info:doi/{0}".format(citation["doi"]))
                        if "pmcid" in citation:
                            citation["pdf"] = "http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/".format(citation["pmcid"])
                        #print citation
                        for k in self.keys + ("authors", "pdf", "first_author_surname"):
                            if k in citation:
                                annotation["property:{0}".format(k)] = citation[k]
                        #print annotation.get('property:label'), annotation.get('property:pdf')
                        for match in matches:
                            annotation.addExtent(match)
                        document.addAnnotation(annotation, link["scratch"])
                        #print citation
                    except:
                        raise  # FIXME

        # Tables
        for id, table in info.get("tables", {}).iteritems():
            if "caption" in table and "xml" in table:
                regex = fuzz(table["caption"], strict=True)
                matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                if len(matches) == 1:
                    annotation = spineapi.Annotation()
                    annotation["concept"] = "Table"
                    annotation["session:upload_files"] = "data:application/xml;name=data.xml;base64,%s" % base64.standard_b64encode(table["xml"])
                    annotation.addExtent(matches[0])
                    document.addAnnotation(annotation, link["scratch"])
                else:
                    print "*********** failed to match table:", id
def on_ready_event(self, document):
    doi = common.utils.metadata(document, "doi", "")
    match = self.splitRegEx.match(doi)
    if match is not None:
        articleNumber = match.group("number")

        annotation = spineapi.Annotation()
        annotation["concept"] = "PublisherIdentity"
        annotation["property:logo"] = utopia.get_plugin_data_as_url("images/logo.png", "image/png")
        annotation["property:title"] = "eLife"
        annotation["property:webpageUrl"] = "http://www.elifesciences.org/"
        document.addAnnotation(annotation, "PublisherMetadata")

        # Turn all the DOIs that are sub-DOIs of this document into links
        regex = r"{0}\.\d+".format(re.escape(doi))
        for match in document.search(regex, spineapi.RegExp):
            url = "http://dx.doi.org/{0}".format(match.text())
            annotation = spineapi.Annotation()
            annotation["concept"] = "Hyperlink"
            annotation["property:webpageUrl"] = url
            annotation["session:volatile"] = "1"
            annotation.addExtent(match)
            document.addAnnotation(annotation)

        # Try to get the NLM directly from eLife
        url = "http://elife.elifesciences.org/elife-source-xml/10.7554/eLife.{0}"
        url = url.format(articleNumber)
        try:
            nlm = urllib2.urlopen(url, timeout=8).read()
        except (urllib2.URLError, socket.timeout):
            return

        info = common.nlm.parse(nlm)
        if info is not None and len(info) > 0:
            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding="utf8")
            pmids = dict((citation["pmid"], citation["id"]) for citation in info["citations"] if "pmid" in citation and "id" in citation)
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=",".join(pmids.keys()), retmode="xml", rettype="abstract"), parser)
                for idList in pubmed_abstracts.xpath("PubmedArticle/PubmedData/ArticleIdList"):
                    #print etree.tostring(idList)
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info["citations_by_id"][pmids[pmid]]
                        for key_name, id_name in (("doi", "doi"), ("pmcid", "pmc"), ("pii", "pii")):
                            id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id
                                #print 'KEY', key_name, id

            # Create Metadata link annotation
            link = document.newAccList("metadata", 100)
            link["property:sourceIcon"] = utopia.get_plugin_data_as_url("images/annotation_icon.png", "image/png")
            link["property:sourceTitle"] = "eLife"
            link["property:sourceDescription"] = """
                <p>The <a href="http://www.elifesciences.org/">eLife</a> open access publishing platform.</p>
            """

            # Create Metadata annotation
            annotation = spineapi.Annotation()
            annotation["concept"] = "DocumentMetadata"
            for k in self.keys:
                v = info.get(k)
                if v is not None:
                    annotation["property:{0}".format(k)] = v
            document.addAnnotation(annotation, link["scratch"])

            # Create Bibliography annotations
            for citation in info.get("citations", []):
                annotation = spineapi.Annotation()
                annotation["concept"] = "DocumentReference"
                for k in self.keys:
                    v = citation.get(k)
                    if v is not None:
                        annotation["property:{0}".format(k)] = v
                document.addAnnotation(annotation, link["scratch"])

            #######################################################################################
            # Apply parsed data to document

            # Citations
            for citation in info["citations"]:
                # Find cross refs
                for pre, label, post in citation.get("contexts", []):
                    matches = document.findInContext(pre, label, post)
                    #print matches
                    if len(matches) > 0:
                        try:
                            annotation = spineapi.Annotation()
                            annotation["concept"] = "ForwardCitation"
                            annotation["property:state"] = "found"
                            if "title" in citation:
                                annotation["property:title"] = citation["title"]
                            if "id" in citation:
                                annotation["property:bibid"] = citation["id"]
                            if "doi" in citation and citation["doi"].startswith("10.1371/"):
                                citation["pdf"] = "http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF".format("info:doi/{0}".format(citation["doi"]))
                            if "pmcid" in citation:
                                citation["pdf"] = "http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/".format(citation["pmcid"])
                            for k in ("displayText", "label", "pdf", "pmid", "pmc", "pii", "doi", "first_author_surname", "year", "journal", "volume", "page_from"):
                                if k in citation:
                                    annotation["property:{0}".format(k)] = citation[k]
                            #print annotation.get('property:label'), annotation.get('property:pdf')
                            for match in matches:
                                annotation.addExtent(match)
                            document.addAnnotation(annotation, link["scratch"])
                            #print citation
                        except:
                            raise  # FIXME

            # Tables
            for id, table in info.get("tables", {}).iteritems():
                if "caption" in table and "xml" in table:
                    regex = fuzz(table["caption"], strict=True)
                    #print regex
                    matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation["concept"] = "Table"
                        annotation["session:upload_files"] = "data:application/xml;name=data.xml;base64,%s" % base64.standard_b64encode(table["xml"])
                        annotation["session:volatile"] = "1"
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link["scratch"])
                    else:
                        print "*********** failed to match table:", id
def on_load_event(self, document):
    # Keep track of errors so that we can inform the user
    def add_error(component, method, category=None, message=None, exception=None):
        if exception is not None:
            if isinstance(exception, urllib2.URLError) and isinstance(exception.reason, socket.timeout):
                exception = exception.reason
            if isinstance(exception, socket.timeout):
                category = "timeout"
                message = "The server did not respond"
            elif isinstance(exception, urllib2.HTTPError):
                category = "server"
                message = unicode(getattr(exception, "reason", "The server did not respond as expected"))
            elif isinstance(exception, urllib2.URLError):
                category = "connection"
                message = unicode(getattr(exception, "reason", "The server could not be found"))
        error = spineapi.Annotation()
        error["concept"] = "Error"
        error["property:component"] = component
        error["property:method"] = method
        error["property:category"] = category
        if message is not None:
            error["property:message"] = message
        document.addAnnotation(error, "errors.metadata")

    def add_success(component, method):
        error = spineapi.Annotation()
        error["concept"] = "Success"
        error["property:component"] = component
        error["property:method"] = method
        error["property:category"] = "success"
        document.addAnnotation(error, "errors.metadata")

    metadata = {"scraped": {}, "arxiv": {}, "pubmed": {}, "pmc": {}, "crossref": {}, "utopia": {}}

    authors = []
    publication = None
    volume = None
    issue = None
    year = None
    pages = None

    #################################################################################
    # Scrape DOI and title

    doi = common.doi.scrape(document)
    metadata["scraped"]["doi"] = doi
    print "scraper: doi:", (doi and doi.encode("utf8"))

    title = common.title.scrape(document)
    metadata["scraped"]["title"] = title
    print "scraper: title:", (title and title.encode("utf8"))

    #################################################################################
    # Scrape arXiv ID

    arxivid = common.arxiv.scrape(document)
    if arxivid is not None:
        metadata["scraped"]["arxivid"] = arxivid
        try:
            arxiv_results = common.arxiv.resolve(arxivid)
            if arxiv_results is not None:
                arxiv_results.update({":whence": "arxiv", ":weight": 10})
                common.utils.store_metadata(document, **arxiv_results)
        except Exception as e:
            add_error("ArXiv", "resolve", exception=e)
            traceback.print_exc()
        else:
            add_success("ArXiv", "resolve")

    #################################################################################
    # Fold in the CrossRef data

    issn = common.utils.metadata(document, "issn")
    if title is not None or doi is not None:
        if doi is None:
            try:
                xref_results = common.crossref.search(title)
                if len(xref_results) == 1:
                    xref_title = xref_results[0].get("title")
                    if xref_title is not None:
                        print "crossref: resolved title:", xref_title.encode("utf8")
                        # Accept the crossref title if present in the document (do magic dash pattern thing)
                        xref_title = re.sub(ur"[^-\u002D\u007E\u00AD\u058A\u05BE\u1400\u1806\u2010-\u2015\u2053\u207B\u208B\u2212\u2E17\u2E3A\u2E3B\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", lambda x: re.escape(x.group(0)), xref_title)
                        xref_title = re.sub(ur"[\u002D\u007E\u00AD\u058A\u05BE\u1400\u1806\u2010-\u2015\u2053\u207B\u208B\u2212\u2E17\u2E3A\u2E3B\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D-]+", lambda x: r"\p{{Pd}}{{{0}}}".format(len(x.group(0))), xref_title)
                        #print 'crossref: resolved title pattern:', xref_title.encode('utf8')
                        matches = document.search(xref_title, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) > 0:
                            doi = xref_results[0].get("doi")
                            print "crossref: accepting resolved doi"
            except Exception as e:
                add_error("CrossRef", "search", exception=e)
                traceback.print_exc()
            else:
                add_success("CrossRef", "search")

        if doi is not None:
            # What is this DOI's article's title according to crossref?
            try:
                xref_results = common.crossref.resolve(doi)
                xref_results.update({":whence": "crossref", ":weight": 20})
                xref_title = xref_results.get("title", "")
                if len(xref_title) > 0:
                    print "crossref: resolved title:", xref_title.encode("utf8")
                    if re.sub(r"[^\w]+", " ", title).strip() == re.sub(r"[^\w]+", " ", xref_title).strip():  # Fuzzy match
                        print "crossref: titles match precisely"
                        common.utils.store_metadata(document, **xref_results)
                    else:
                        # Accept the crossref title over the scraped title, if present in the document
                        matches = document.findInContext("", xref_title, "")  # Fuzzy match
                        if len(matches) > 0:
                            common.utils.store_metadata(document, **xref_results)
                            title = xref_title
                            print "crossref: overriding scraped title with crossref title"
                        else:
                            print "crossref: ignoring resolved metadata"
                            # FIXME should we discard the DOI at this point?
            except Exception as e:
                add_error("CrossRef", "resolve", exception=e)
                traceback.print_exc()
            else:
                add_success("CrossRef", "resolve")

    ###########################################################################################
    # Fold in the PubMed data

    pii = common.utils.metadata(document, "pii")
    pmid = common.utils.metadata(document, "pmid")
    pmcid = common.utils.metadata(document, "pmcid")
    if pmid is None and doi is not None:  # resolve on DOI
        try:
            pmid = common.pubmed.resolve(doi, "doi")
        except Exception as e:
            add_error("PubMed", "resolve", exception=e)
            traceback.print_exc()
        else:
            add_success("PubMed", "resolve")
    if pmid is None and title is not None:  # resolve on title
        try:
            pubmed_results = common.pubmed.search(title)
            pubmed_title = pubmed_results.get("title", "").strip(" .")
            if len(pubmed_title) > 0:
                print "pubmed: resolved title:", pubmed_title.encode("utf8")
                pubmed_pmid = pubmed_results.get("pmid")
                print "pubmed: resolved pmid:", pubmed_pmid
                if re.sub(r"[^\w]+", " ", title).strip() == re.sub(r"[^\w]+", " ", pubmed_title).strip():  # Fuzzy match
                    print "pubmed: titles match precisely"
                    title = pubmed_title
                    pmid = pubmed_pmid
                else:
                    # Accept the pubmed title over the scraped title, if present in the document
                    matches = document.findInContext("", pubmed_title, "")  # Fuzzy match
                    if len(matches) > 0:
                        title = matches[0].text()
                        pmid = pubmed_pmid
                        print "pubmed: overriding scraped title with pubmed title"
                    else:
                        print "pubmed: ignoring resolved title"
        except Exception as e:
            add_error("PubMed", "search", exception=e)
            traceback.print_exc()
        else:
            add_success("PubMed", "search")
    if pmid is not None:
        try:
            nlm = common.pubmed.fetch(pmid)
            if nlm is not None:
                xml = etree.fromstring(nlm)
                pubmed_authors = []
                for author in xml.findall("PubmedArticle/MedlineCitation/Article/AuthorList/Author"):
                    name = u""
                    lastName = author.findtext("LastName")
                    forename = author.findtext("ForeName")
                    if lastName is not None:
                        name = lastName + u", "
                    if forename is not None:
                        name += forename
                    if len(name) > 0:
                        pubmed_authors.append(name)
                if len(pubmed_authors) == 0:
                    pubmed_authors = None
                pubmed_pmid = xml.findtext("PubmedArticle/MedlineCitation/PMID")
                common.utils.store_metadata(document, **{
                    ":whence": "pubmed",
                    ":weight": 10,
                    "raw_pubmed_nlm": nlm,
                    "authors": pubmed_authors,
                    "pmid": pubmed_pmid,
                    "title": xml.findtext("PubmedArticle/MedlineCitation/Article[1]/ArticleTitle"),
                    "issn": xml.findtext("PubmedArticle/MedlineCitation/Article/Journal/ISSN[1]"),
                    "doi": xml.findtext('PubmedArticle/PubmedData/ArticleIdList/ArticleId[@IdType="doi"]'),
                    "pmcid": xml.findtext('PubmedArticle/PubmedData/ArticleIdList/ArticleId[@IdType="pmc"]'),
                    "pii": xml.findtext('PubmedArticle/PubmedData/ArticleIdList/ArticleId[@IdType="pii"]'),
                    "publication-title": xml.findtext("PubmedArticle/MedlineCitation/Article/Journal/Title"),
                    "volume": xml.findtext("PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/Volume"),
                    "issue": xml.findtext("PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/Issue"),
                    "year": xml.findtext("PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/PubDate/Year"),
                    "pages": xml.findtext("PubmedArticle/MedlineCitation/Article[1]/Pagination/MedlinePgn"),
                    "abstract": xml.findtext("PubmedArticle/MedlineCitation/Article[1]/Abstract/AbstractText"),
                })
                pmid = pubmed_pmid or pmid
                # FIXME I'm sure the above should be in common.pubmed
        except Exception as e:
            add_error("PubMed", "fetch", exception=e)
            traceback.print_exc()
        else:
            add_success("PubMed", "fetch")

    ###########################################################################################
    # Fold in the PubMedCentral data

    if pmcid is None and doi is not None:  # resolve on DOI
        try:
            pmcid = common.pmc.resolve(doi, "doi")
        except Exception as e:
            add_error("PubMed Central", "resolve", exception=e)
            traceback.print_exc()
        else:
            add_success("PubMed Central", "resolve")
    if pmcid is None and pmid is not None:  # resolve on PubMed ID
        try:
            pmcid = common.pmc.resolve(pmid, "pmid")
        except Exception as e:
            add_error("PubMed Central", "resolve", exception=e)
            traceback.print_exc()
        else:
            add_success("PubMed Central", "resolve")
    if pmcid is not None:
        common.utils.store_metadata(document, **{":whence": "pmc", ":weight": 10, "pmcid": pmcid})
        try:
            nlm = common.pmc.fetch(pmcid)
            if nlm is not None:
                common.utils.store_metadata(document, **{":whence": "pmc", ":weight": 10, "raw_pmc_nlm": nlm})
        except Exception as e:
            add_error("PubMed Central", "fetch", exception=e)
            traceback.print_exc()
        else:
            add_success("PubMed Central", "fetch")

    ###########################################################################################

    scraped = metadata["scraped"]
    scraped.update({":whence": "document", ":weight": 5})
    common.utils.store_metadata(document, **scraped)
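# -- Illustrative sketch -------------------------------------------------------
# Each store_metadata() call above tags its record with ':whence' (the source)
# and ':weight' (its priority): document scrapes weigh 5, arXiv/PubMed/PMC 10,
# CrossRef 20. How those weights are consumed lives in common.utils (not shown
# here); a plausible merge, assuming higher weights override lower ones:
def merge_metadata_sketch(records):
    merged = {}
    for record in sorted(records, key=lambda r: r.get(':weight', 0)):
        for key, value in record.iteritems():
            if not key.startswith(':') and value is not None:
                merged[key] = value
    return merged

# merge_metadata_sketch([{':weight': 5, 'title': 'scraped'},
#                        {':weight': 20, 'title': 'CrossRef'}])
# -> {'title': 'CrossRef'}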