def export_json_impl():
    h = HypothesisUtils(username=username, token=api_token, group=group, max_results=100000)
    params = {'group': h.group}
    rows = h.search_all(params)
    annos = [HypothesisAnnotation(row) for row in rows]
    # clean up bugs from old curation workflow
    for anno in annos:
        if anno.tags:
            new_tags = []
            for tag in anno.tags:
                if tag in bad_tags:
                    # scibot made a mistake early, might be able to correct tags in bulk someday
                    new_tags.append(tag.replace('RRID:', 'RRIDCUR:'))
                else:
                    new_tags.append(tag)
            # horribly inefficient...
            anno.tags = new_tags

        if anno.text.startswith('RRID:'):
            # catch cases where the RRID was put in text instead of in tags
            if 'RRIDCUR:Missing' in anno.tags or 'RRIDCUR:Unrecognized' in anno.tags:
                rtag = anno.text.split(None, 1)[0]  # trap for cases where there is more text after an RRID...
                if rtag not in anno.tags:
                    anno.tags.append(rtag)
                    print('TEXT ISSUE for %s at https://hyp.is/%s' % (anno.user, anno.id))
        elif anno.exact and anno.exact.startswith('RRID:'):
            # this needs to go second in case of RRIDCUR:Incorrect
            if anno.exact.startswith('RRID: '):  # normalize 'RRID: ' to 'RRID:' first
                rtag = anno.exact.replace('RRID: ', 'RRID:')
            else:
                rtag = anno.exact
            rtag = rtag.split(None, 1)[0]  # trap more
            if rtag not in anno.tags:
                if anno.user == 'scibot' and len(anno.tags) == 1 and anno.tags[0].startswith('RRID:RRID:'):  # FIXME HACK
                    anno.tags = [rtag]
                else:
                    pass  # anything else we detect in the data doesn't need to be corrected or used to fix tags

    output_json = [anno.__dict__ for anno in annos]
    DATE = date.today().strftime('%Y-%m-%d')
    return output_json, DATE
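
# Usage sketch (not part of the original module): one way export_json_impl() might be
# driven to write its annotations to a dated file. It assumes the module-level
# username/api_token/group/bad_tags names the function references are already defined;
# the filename prefix is illustrative only.
import json

def write_json_export(path_prefix='rrid-annotations'):
    output_json, DATE = export_json_impl()
    fname = '%s-%s.json' % (path_prefix, DATE)  # e.g. rrid-annotations-2016-01-01.json
    with open(fname, 'wt') as f:
        json.dump(output_json, f, indent=2, sort_keys=True)
    return fname
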
def rrid_wrapper(request, username, api_token, group, logloc):
    """ Receive an article, parse RRIDs, resolve them, create annotations, log results """
    if request.method == 'OPTIONS':
        response = Response()
        request_headers = request.headers['Access-Control-Request-Headers'].lower()
        request_headers = re.findall(r'\w(?:[-\w]*\w)', request_headers)
        response_headers = ['access-control-allow-origin']
        for req_acoa_header in request_headers:
            if req_acoa_header not in response_headers:
                response_headers.append(req_acoa_header)
        response_headers = ','.join(response_headers)
        response.headers.update({
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Headers': '%s' % response_headers
        })
        response.status_int = 204
        return response

    h = HypothesisUtils(username=username, token=api_token, group=group)
    target_uri = urlparse.parse_qs(request.text)['uri'][0]
    params = {'limit': 200, 'uri': target_uri}
    query_url = h.query_url_template.format(query=urlencode(params, True))
    obj = h.authenticated_api_query(query_url)
    rows = obj['rows']
    tags = set()
    for row in rows:
        if row['group'] != h.group:  # api query returns unwanted groups
            continue
        elif row['user'] != 'acct:' + h.username + '@hypothes.is':
            continue
        for tag in row['tags']:
            if tag.startswith('RRID'):
                tags.add(tag)

    html = urlparse.parse_qs(request.text)['data'][0]
    print(target_uri)
    found_rrids = {}
    try:
        matches = re.findall(r'(.{0,10})(RRID(:|\)*,*)[ \t]*)(\w+[_\-:]+[\w\-]+)([^\w].{0,10})',
                             html.replace('–', '-'))
        existing = []
        for match in matches:
            print(match)
            prefix = match[0]
            exact = match[3]
            if 'RRID:' + exact in tags:
                print('skipping %s, already annotated' % exact)
                continue
            new_tags = []
            if exact in existing:
                new_tags.append('RRIDCUR:Duplicate')
            else:
                existing.append(exact)
            found_rrids[exact] = None
            suffix = match[4]
            print('\t' + exact)
            resolver_uri = 'https://scicrunch.org/resolver/%s.xml' % exact
            r = requests.get(resolver_uri)
            print(r.status_code)
            xml = r.content
            found_rrids[exact] = r.status_code
            if r.status_code < 300:
                root = etree.fromstring(xml)
                if root.findall('error'):
                    s = 'Resolver lookup failed.'
                    s += '<hr><p><a href="%s">resolver lookup</a></p>' % resolver_uri
                    r = h.create_annotation_with_target_using_only_text_quote(
                        url=target_uri, prefix=prefix, exact=exact, suffix=suffix,
                        text=s, tags=new_tags + ['RRIDCUR:Unresolved'])
                    print('ERROR')
                else:
                    data_elements = root.findall('data')[0]
                    s = ''
                    data_elements = [(e.find('name').text, e.find('value').text)
                                     for e in data_elements]  # these shouldn't duplicate
                    citation = [(n, v) for n, v in data_elements if n == 'Proper Citation']
                    name = [(n, v) for n, v in data_elements if n == 'Name']
                    data_elements = citation + name + sorted(
                        [(n, v) for n, v in data_elements
                         if n != 'Proper Citation' and n != 'Name' and v is not None])  # exclude the two already placed first
                    for name, value in data_elements:
                        if (name == 'Reference' or name == 'Mentioned In Literature') \
                                and value is not None and value.startswith('<a class'):
                            if len(value) > 500:
                                continue  # nif-0000-30467 fix keep those pubmed links short!
                        s += '<p>%s: %s</p>' % (name, value)
                    s += '<hr><p><a href="%s">resolver lookup</a></p>' % resolver_uri
                    r = h.create_annotation_with_target_using_only_text_quote(
                        url=target_uri, prefix=prefix, exact=exact, suffix=suffix,
                        text=s, tags=new_tags + ['RRID:' + exact])
            else:
                s = 'Resolver lookup failed.'
                r = h.create_annotation_with_target_using_only_text_quote(
                    url=target_uri, prefix=prefix, exact=exact, suffix=suffix,
                    text=s, tags=new_tags + ['RRIDCUR:Unresolved'])
    except Exception:
        traceback.print_exc()

    results = ', '.join(found_rrids.keys())
    r = Response(results)
    r.content_type = 'text/plain'
    r.headers.update({'Access-Control-Allow-Origin': '*'})

    try:
        now = datetime.now().isoformat()[0:19].replace(':', '').replace('-', '')
        fname = logloc + 'rrid-%s.log' % now
        s = 'URL: %s\n\nResults: %s\n\nCount: %s\n\nText:\n\n%s' % (
            target_uri, results, len(found_rrids), html)
        with open(fname, 'wb') as f:
            f.write(s.encode('utf-8'))
    except Exception:
        traceback.print_exc()

    return r
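
# Standalone sketch of the CORS preflight branch above: given the value a browser
# sends in Access-Control-Request-Headers, the same re.findall() splits it into
# individual header names before they are echoed back in Access-Control-Allow-Headers.
# The sample header string is made up for illustration.
import re

def allowed_headers(access_control_request_headers):
    requested = re.findall(r'\w(?:[-\w]*\w)', access_control_request_headers.lower())
    allowed = ['access-control-allow-origin']
    for header in requested:
        if header not in allowed:
            allowed.append(header)
    return ','.join(allowed)

print(allowed_headers('Content-Type, X-Requested-With'))
# prints: access-control-allow-origin,content-type,x-requested-with
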
#!/usr/bin/env python3
from __future__ import print_function
from os import environ
from collections import defaultdict
from hypothesis import HypothesisUtils, HypothesisAnnotation

username = environ.get('RRIDBOT_USERNAME', 'USERNAME')  # Hypothesis account
password = environ.get('RRIDBOT_PASSWORD', 'PASSWORD')
group = environ.get('RRIDBOT_GROUP', '__world__')
print(username, group)  # sanity check

h = HypothesisUtils(username=username, password=password, group=group, max_results=5000)
h.login()
params = {'group': h.group}
rows = h.search_all(params)
annos = [HypothesisAnnotation(row) for row in rows]
annotated_urls = defaultdict(list)
for anno in annos:
    annotated_urls[anno.uri].append(anno)

html = """<html>
<head><style>
    body { font-family:verdana;margin:.75in }
    .anno { margin: 20px; border-style: solid; border-width: thin; padding: 20px; }
    .text { margin:20px }
    .article { font-size:larger }
</style></head>
<body>"""
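
# One possible way to fill in the report body (the original continuation is not shown
# here, so this layout is an assumption): render each annotated URL with the
# .article/.anno/.text classes defined in the style block above, then close the page.
# The output filename is illustrative only.
body = ''
for uri, url_annos in sorted(annotated_urls.items()):
    body += '<div class="article"><a href="%s">%s</a> (%s annotations)</div>\n' % (
        uri, uri, len(url_annos))
    for anno in url_annos:
        body += '<div class="anno">%s<div class="text">%s</div></div>\n' % (
            ', '.join(anno.tags), anno.text)
report = html + body + '</body></html>'
with open('rrid-report.html', 'wt') as f:
    f.write(report)
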
def export_impl():
    h = HypothesisUtils(username=username, token=api_token, group=group, max_results=100000)
    params = {'group': h.group}
    rows = h.search_all(params)
    annos = [HypothesisAnnotation(row) for row in rows]
    annotated_urls = defaultdict(list)
    for anno in annos:
        annotated_urls[anno.uri].append(anno)
    fix_trailing_slash(annotated_urls)

    output_rows = []
    for annotated_url in annotated_urls.keys():
        #print(annotated_url)
        annos = annotated_urls[annotated_url]
        replies = defaultdict(list)
        PMID = []
        for anno in annos:  # gotta build the reply structure and get pmid
            #print('id:', anno.id)
            #print('user:', anno.user)
            #print('exact:', anno.exact)
            #print('text:', anno.text)
            #print('tags:', anno.tags)
            #print('type:', anno.type)
            #print('references:', anno.references)
            if anno.references:
                for reference in anno.references:  # shouldn't there only be one???
                    replies[reference].append(anno)
            PMID.extend([tag for tag in anno.tags
                         if tag.startswith('PMID:') and '_' not in tag])  # bad tags with PMID:SCR_
            # curators didn't put the pmid in as tags :(
            if anno.text.startswith('PMID:'):  # DANGER ZONE
                if '_' in anno.text:
                    print('PMIDS DONT HAVE UNDERSCORES PROBABLY CURATION BUG', anno.text)
                else:
                    PMID.append(anno.text.strip())  # because, yep, when you don't tag sometimes you get \n :/

        if PMID:
            if len(PMID) > 1:
                print(PMID, annotated_url)
                if PMID[0] == PMID[1]:
                    PMID = PMID[0]
                    print('WARNING: more than one pmid tag')
                else:
                    raise BaseException('more than one pmid tag')
            else:
                PMID = PMID[0]
                #print(PMID)
        else:
            all_tags = []
            for a in annos:
                all_tags.extend(a.tags)
            #print('NO PMID FOR', annotated_url)
            #print(set([a.user for a in annos]))
            #print(all_tags)
            PMID = annotated_url

        RRIDs = defaultdict(list)
        EXACTs = {}
        CITEs = {}
        #USERs = {}
        for anno in annos:
            RRID = None
            additional = []
            for tag in anno.tags:
                if re.match('RRID:.+[0-9]+.+', tag):  # ARRRRGGGGHHHHHHH ARRRRGGHHHH
                #if re.match('RRID:.+', tag):  # ARRRRGGGGHHHHHHH ARRRRGGHHHH
                    if RRID is not None:
                        raise BaseException('MORE THAN ONE RRID PER ENTRY!')
                    RRID = tag  # :/ this works for now but ARHGHHGHASFHAS
                else:
                    additional.append(tag)  # eg Unresolved
                    if tag == 'RRIDCUR:Missing':  # fix for bad curation process
                        maybe_rrid = anno.text.strip()
                        if re.match('RRID:.+[0-9]+', maybe_rrid):  # ARRRRGGGGHHHHHHH ARRRRGGHHHH
                            RRID = maybe_rrid  # RRIDCUR:Missing was already added above

            if RRID is not None:
                EXACTs[RRID] = anno.exact.strip() if anno.exact else ''
                RRIDs[RRID].extend(additional)
                #USERs[RRID] = anno.user
                if RRID not in CITEs:
                    if anno.text:
                        if 'Proper Citation:' in anno.text:
                            CITEs[RRID] = anno.text.split('Proper Citation:')[1].strip().split('<', 1)[0]
                if anno.id in replies:
                    for r_anno in replies[anno.id]:
                        RRIDs[RRID].extend(r_anno.tags)  # not worrying about the text here
            elif not anno.references and PMID not in anno.tags:
                # this is an independent annotation which will not be included
                new = 'NONE:' + anno.id
                RRIDs[new].append('')
                EXACTs[new] = anno.exact
                #USERs[RRID] = anno.user

        for rrid, more in RRIDs.items():
            #FIXME TOOOOOO SLOW
            #r = requests.get('https://scicrunch.org/resolver/{RRID}.xml'.format(RRID=rrid))
            #if r.status_code < 300:
            #    proper_citation = get_proper_citation(r.content)
            #else:
            #    proper_citation = ''
            try:
                proper_citation = CITEs[rrid]
            except KeyError:  # FIXME this is a hack to avoid some cases of LWW for citations
                proper_citation = ''

            if not more:
                row = [PMID, rrid, '', annotated_url, EXACTs[rrid], proper_citation]
                output_rows.append(row)
            else:
                for val in set(more):  # cull dupes
                    row = [PMID, rrid, val, annotated_url, EXACTs[rrid], proper_citation]
                    output_rows.append(row)

    DATE = date.today().strftime('%Y-%m-%d')
    return output_rows, DATE
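
# Usage sketch (an assumption, not taken from the original export module): the
# (output_rows, DATE) pair returned above maps naturally onto a dated CSV.
# The column names and filename are illustrative only.
import csv

def write_csv_export():
    output_rows, DATE = export_impl()
    fname = 'rrid-export-%s.csv' % DATE
    with open(fname, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['PMID', 'RRID', 'tag', 'uri', 'exact', 'proper_citation'])
        writer.writerows(output_rows)
    return fname
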
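# Sketch of the get_proper_citation() helper referenced in the commented-out code in
# export_impl() (the real implementation is not shown, so this body is an assumption):
# pull the 'Proper Citation' value out of the scicrunch resolver XML, mirroring the
# name/value parsing done in the rrid handlers. lxml is assumed here;
# xml.etree.ElementTree would work the same way.
from lxml import etree

def get_proper_citation(xml):
    root = etree.fromstring(xml)
    if root.findall('error'):
        return ''
    data_elements = root.findall('data')[0]
    for element in data_elements:
        if element.find('name').text == 'Proper Citation':
            return element.find('value').text or ''
    return ''
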
#!/usr/bin/env python
import sys
from hypothesis import HypothesisUtils, HypothesisAnnotation

source_usernames = open('usernames.txt').read().split('\n')
urls = open('urls.txt').read().split('\n')

target_username = ''
target_token = ''

h = HypothesisUtils(target_username, target_token)

def transfer():
    """ given a set of urls and users, copy (public) annotations to another user's account """
    for url in urls:
        for source_username in source_usernames:
            params = {'uri': url}
            rows = h.search_all(params)
            for row in list(rows):  # capture the original result set, else it'll keep growing as items are posted!
                anno = HypothesisAnnotation(row)
                if anno.user not in source_usernames:
                    continue
                row['user'] = row['user'].replace(source_username, target_username)
                permissions = row['permissions']
                permission_fields = ['admin', 'update', 'delete']
                for field in permission_fields:
                    permissions[field][0] = permissions[field][0].replace(source_username, target_username)
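
# The loop above rewrites ownership in place but stops before re-posting; a minimal
# sketch of that last step, assuming the target account's API token authorizes a plain
# POST of the modified row to the Hypothesis annotations endpoint (the endpoint URL and
# payload handling are assumptions, not taken from the original script).
import requests

def post_copy(row):
    payload = dict(row)
    payload.pop('id', None)       # let the server assign a fresh id
    payload.pop('created', None)  # and fresh timestamps
    payload.pop('updated', None)
    r = requests.post('https://hypothes.is/api/annotations',
                      json=payload,
                      headers={'Authorization': 'Bearer ' + target_token})
    return r.status_code
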
def rrid(request):
    """ Receive an article, parse RRIDs, resolve them, create annotations, log results """
    if request.method == 'OPTIONS':
        response = Response()
        request_headers = request.headers['Access-Control-Request-Headers'].lower()
        request_headers = re.findall(r'\w(?:[-\w]*\w)', request_headers)
        response_headers = ['access-control-allow-origin']
        for req_acoa_header in request_headers:
            if req_acoa_header not in response_headers:
                response_headers.append(req_acoa_header)
        response_headers = ','.join(response_headers)
        response.headers.update({
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Headers': '%s' % response_headers
        })
        response.status_int = 204
        return response

    h = HypothesisUtils(username=username, token=api_token, group=group)
    target_uri = urlparse.parse_qs(request.text)['uri'][0]
    params = {'limit': 200, 'uri': target_uri}
    query_url = h.query_url_template.format(query=urlencode(params, True))
    obj = h.authenticated_api_query(query_url)
    rows = obj['rows']
    tags = set()
    for row in rows:
        if row['group'] != h.group:  # api query returns unwanted groups
            continue
        elif row['user'] != 'acct:' + h.username + '@hypothes.is':
            continue
        for tag in row['tags']:
            if tag.startswith('RRID'):
                tags.add(tag)

    html = urlparse.parse_qs(request.text)['data'][0]
    print(target_uri)
    found_rrids = {}
    try:
        matches = re.findall(r'(.{0,10})(RRID:\s*)([_\w\-:]+)([^\w].{0,10})',
                             html.replace('–', '-'))
        existing = []
        for match in matches:
            print(match)
            prefix = match[0]
            exact = match[2]
            if 'RRID:' + exact in tags:
                print('skipping %s, already annotated' % exact)
                continue
            new_tags = []
            if exact in existing:
                new_tags.append('RRIDCUR:Duplicate')
            else:
                existing.append(exact)
            found_rrids[exact] = None
            suffix = match[3]
            print('\t' + exact)
            resolver_uri = 'https://scicrunch.org/resolver/%s.xml' % exact
            r = requests.get(resolver_uri)
            print(r.status_code)
            xml = r.content
            found_rrids[exact] = r.status_code
            if r.status_code < 300:
                root = etree.fromstring(xml)
                if root.findall('error'):
                    s = 'Resolver lookup failed.'
                    s += '<hr><p><a href="%s">resolver lookup</a></p>' % resolver_uri
                    r = h.create_annotation_with_target_using_only_text_quote(
                        url=target_uri, prefix=prefix, exact=exact, suffix=suffix,
                        text=s, tags=new_tags + ['RRIDCUR:Unresolved'])
                    print('ERROR')
                else:
                    data_elements = root.findall('data')[0]
                    s = ''
                    data_elements = [(e.find('name').text, e.find('value').text)
                                     for e in data_elements]  # these shouldn't duplicate
                    citation = [(n, v) for n, v in data_elements if n == 'Proper Citation']
                    name = [(n, v) for n, v in data_elements if n == 'Name']
                    data_elements = citation + name + sorted(
                        [(n, v) for n, v in data_elements
                         if n != 'Proper Citation' and n != 'Name' and v is not None])  # exclude the two already placed first
                    for name, value in data_elements:
                        if (name == 'Reference' or name == 'Mentioned In Literature') \
                                and value is not None and value.startswith('<a class'):
                            if len(value) > 500:
                                continue  # nif-0000-30467 fix keep those pubmed links short!
                        s += '<p>%s: %s</p>' % (name, value)
                    s += '<hr><p><a href="%s">resolver lookup</a></p>' % resolver_uri
                    r = h.create_annotation_with_target_using_only_text_quote(
                        url=target_uri, prefix=prefix, exact=exact, suffix=suffix,
                        text=s, tags=new_tags + ['RRID:' + exact])
            else:
                s = 'Resolver lookup failed.'
                r = h.create_annotation_with_target_using_only_text_quote(
                    url=target_uri, prefix=prefix, exact=exact, suffix=suffix,
                    text=s, tags=new_tags + ['RRIDCUR:Unresolved'])
    except Exception:
        traceback.print_exc()

    results = ', '.join(found_rrids.keys())
    r = Response(results)
    r.content_type = 'text/plain'
    r.headers.update({'Access-Control-Allow-Origin': '*'})

    try:
        now = datetime.now().isoformat()[0:19].replace(':', '').replace('-', '')
        fname = 'rrid-%s.log' % now
        s = 'URL: %s\n\nResults: %s\n\nCount: %s\n\nText:\n\n%s' % (
            target_uri, results, len(found_rrids), html)
        with open(fname, 'wb') as f:
            f.write(s.encode('utf-8'))
    except Exception:
        traceback.print_exc()

    return r
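
# Standalone sketch of what the findall() above captures (the sample sentence is made
# up): group 0 is up to ten characters of leading context, group 2 is the bare
# identifier used for the resolver lookup and the RRID: tag, and group 3 is the
# trailing context handed to the annotation anchor.
import re

sample = 'antibody (RRID:AB_2313606), diluted 1:1000'
for prefix, _, exact, suffix in re.findall(
        r'(.{0,10})(RRID:\s*)([_\w\-:]+)([^\w].{0,10})', sample):
    print(repr(prefix), repr(exact), repr(suffix))
# prints: 'antibody (' 'AB_2313606' '), diluted '
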