#!/usr/bin/env python3 from __future__ import print_function from os import environ from collections import defaultdict from hypothesis import HypothesisUtils, HypothesisAnnotation username = environ.get('RRIDBOT_USERNAME', 'USERNAME') # Hypothesis account password = environ.get('RRIDBOT_PASSWORD', 'PASSWORD') group = environ.get('RRIDBOT_GROUP', '__world__') print(username, group) # sanity check h = HypothesisUtils(username=username, password=password, group=group, max_results=5000) h.login() params = {'group' : h.group } rows = h.search_all(params) annos = [HypothesisAnnotation(row) for row in rows] annotated_urls = defaultdict(list) for anno in annos: annotated_urls[anno.uri].append(anno) html = """<html> <head><style> body { font-family:verdana;margin:.75in } .anno { margin: 20px; border-style: solid; border-width: thin; padding: 20px; } .text { margin:20px } .article { font-size:larger } </style></head> <body>"""
def _preflight_response(request):
    """Build the 204 reply to a CORS preflight (OPTIONS) request.

    Every header name listed in the request's
    ``Access-Control-Request-Headers`` is lower-cased, de-duplicated and
    echoed back in ``Access-Control-Allow-Headers``.
    """
    # The header may be absent on a preflight; treat that as "nothing requested"
    # instead of failing with KeyError.
    requested = request.headers.get('Access-Control-Request-Headers', '').lower()
    allowed = ['access-control-allow-origin']
    for header in re.findall(r'\w(?:[-\w]*\w)', requested):
        if header not in allowed:
            allowed.append(header)

    response = Response()
    response.headers.update({
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Headers': ','.join(allowed),
    })
    response.status_int = 204
    return response


def _existing_rrid_tags(h, target_uri):
    """Return the set of ``RRID...`` tags this account already put on target_uri.

    Rows from other groups or other users are ignored.
    """
    # NOTE(review): target_uri is appended without URL-encoding, so a URI that
    # itself contains '&' or '#' would alter the query -- confirm and quote it.
    api_query = 'https://hypothes.is/api/search?limit=200&uri=' + target_uri
    obj = h.authenticated_api_query(api_query)

    own_account = 'acct:' + h.username + '@hypothes.is'
    tags = set()
    for row in obj['rows']:
        # api query returns unwanted groups (and other users' rows)
        if row['group'] != h.group or row['user'] != own_account:
            continue
        tags.update(tag for tag in row['tags'] if tag.startswith('RRID'))
    return tags


def _format_resolver_data(root, resolver_uri):
    """Render the name/value pairs of the first <data> element as HTML.

    'Proper Citation' entries come first, then 'Name' entries, then all other
    entries with a non-None value in sorted order, followed by a link to the
    resolver page.
    """
    fields = [(e.find('name').text, e.find('value').text)
              for e in root.findall('data')[0]]
    citation = [(n, v) for n, v in fields if n == 'Proper Citation']
    names = [(n, v) for n, v in fields if n == 'Name']
    # Exclude BOTH leading groups here. The previous test
    # (n != 'Proper Citation' or n != 'Name') was always true, so the citation
    # and name rows were emitted a second time.
    others = sorted((n, v) for n, v in fields
                    if n not in ('Proper Citation', 'Name') and v is not None)

    parts = []
    for label, value in citation + names + others:
        if (label in ('Reference', 'Mentioned In Literature')
                and value is not None
                and value.startswith('<a class')
                and len(value) > 500):
            continue  # nif-0000-30467 fix keep those pubmed links short!
        parts.append('<p>%s: %s</p>' % (label, value))
    parts.append('<hr><p><a href="%s">resolver lookup</a></p>' % resolver_uri)
    return ''.join(parts)


def rrid(request):
    """Receive an article, parse RRIDs, resolve them, create annotations, log results.

    An OPTIONS request is answered with a CORS preflight reply. Otherwise the
    request body is parsed as a query string with a ``uri`` field (the page
    being annotated) and a ``data`` field (the text to scan). Each RRID found
    in the text that is not already tagged on that page is looked up at the
    SciCrunch resolver and turned into a Hypothesis annotation; lookups that
    fail are tagged ``RRID:Unresolved`` and repeats within the same text are
    tagged ``RRID:Duplicate``.

    Returns a ``text/plain`` response whose body is the comma-separated list
    of RRIDs that were processed. A ``rrid-<timestamp>.log`` file is written
    to the current directory on a best-effort basis.
    """
    if request.method == 'OPTIONS':
        return _preflight_response(request)

    h = HypothesisUtils(username=username, password=password, group=group)
    h.login()

    form = urlparse.parse_qs(request.text)  # parse the body once, not per field
    target_uri = form['uri'][0]
    tags = _existing_rrid_tags(h, target_uri)
    html = form['data'][0]
    print(target_uri)

    annotate = h.create_annotation_with_target_using_only_text_quote
    found_rrids = {}  # RRID -> resolver HTTP status (None until looked up)
    exact = None  # keeps the error message valid if we fail before any match
    try:
        # Groups: up to 10 chars of prefix, the 'RRID:' marker, the identifier,
        # and up to 11 chars of suffix used to anchor the text quote.
        matches = re.findall(r'(.{0,10})(RRID:\s*)([_\w\-:]+)([^\w].{0,10})',
                             html.replace('–', '-'))
        seen = set()
        for match in matches:
            print(match)
            prefix, _, exact, suffix = match
            if 'RRID:' + exact in tags:
                print('skipping %s, already annotated' % exact)
                continue

            new_tags = []
            if exact in seen:
                new_tags.append('RRID:Duplicate')
            else:
                seen.add(exact)

            found_rrids[exact] = None
            print('\t' + exact)
            resolver_uri = 'https://scicrunch.org/resolver/%s.xml' % exact
            lookup = requests.get(resolver_uri)
            print(lookup.status_code)
            found_rrids[exact] = lookup.status_code

            if lookup.status_code >= 300:
                annotate(url=target_uri, prefix=prefix, exact=exact,
                         suffix=suffix, text='Resolver lookup failed.',
                         tags=new_tags + ['RRID:Unresolved'])
                continue

            root = etree.fromstring(lookup.content)
            if root.findall('error'):
                # The resolver answered, but with an <error> document.
                s = 'Resolver lookup failed.'
                s += '<hr><p><a href="%s">resolver lookup</a></p>' % resolver_uri
                annotate(url=target_uri, prefix=prefix, exact=exact,
                         suffix=suffix, text=s,
                         tags=new_tags + ['RRID:Unresolved'])
                print('ERROR')
            else:
                s = _format_resolver_data(root, resolver_uri)
                annotate(url=target_uri, prefix=prefix, exact=exact,
                         suffix=suffix, text=s, tags=new_tags)
    except Exception:
        # Best effort: report the RRID being processed and still answer the
        # caller with whatever was handled before the failure.
        print('error: %s' % exact)
        traceback.print_exc()

    results = ', '.join(found_rrids.keys())
    response = Response(results)
    response.content_type = 'text/plain'
    response.headers.update({
        'Access-Control-Allow-Origin': '*'
    })

    try:
        now = datetime.now().isoformat()[0:19].replace(':', '').replace('-', '')
        fname = 'rrid-%s.log' % now
        s = 'URL: %s\n\nResults: %s\n\nCount: %s\n\nText:\n\n%s' % (
            target_uri, results, len(found_rrids), html)
        with open(fname, 'wb') as f:
            f.write(s.encode('utf-8'))
    except Exception:
        # Logging must never prevent the response from being returned.
        traceback.print_exc()

    return response