Example 1
def export_json_impl():
    h = HypothesisUtils(username=username, token=api_token, group=group, max_results=100000)
    params = {'group' : h.group }
    rows = h.search_all(params)
    annos = [HypothesisAnnotation(row) for row in rows]

    # clean up bugs from old curation workflow
    for anno in annos:
        if anno.tags:
            new_tags = []
            for tag in anno.tags:
                if tag in bad_tags:
                    new_tags.append(tag.replace('RRID:', 'RRIDCUR:'))  # scibot made a mistake early, might be able to correct tags in bulk someday
                else:
                    new_tags.append(tag)  # horribly inefficient...
            anno.tags = new_tags

        if anno.text.startswith('RRID:'):  # catch cases where the RRID was put in text instead of in tags
            if 'RRIDCUR:Missing' in anno.tags or 'RRIDCUR:Unrecognized' in anno.tags:
                rtag = anno.text.split(None,1)[0]  # trap for cases where there is more text after an RRID...
                if rtag not in anno.tags:
                    anno.tags.append(rtag)
                    print('TEXT ISSUE for %s at https://hyp.is/%s' % (anno.user, anno.id))
        elif anno.exact and anno.exact.startswith('RRID:'):  # this needs to go second in case of RRIDCUR:Incorrect
            if anno.exact.startswith('RRID: '):  # normalize the stray-space variant 'RRID: ' to 'RRID:' first
                rtag = anno.exact.replace('RRID: ', 'RRID:')
            else:
                rtag = anno.exact
            rtag = rtag.split(None,1)[0]  # trap more
            if rtag not in anno.tags:
                if anno.user == 'scibot' and len(anno.tags) == 1 and anno.tags[0].startswith('RRID:RRID:'):  # FIXME HACK
                    anno.tags = [rtag]
                else:
                    pass  # anything else we detect in the data doesn't need to be corrected or used to fix tags

    output_json = [anno.__dict__ for anno in annos]
    DATE = date.today().strftime('%Y-%m-%d')
    return output_json, DATE
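
export_json_impl returns the cleaned-up annotation dicts plus a date stamp, so a caller only needs to serialize them. Here is a minimal driver sketch, not part of the original project: it assumes the same module-level username, api_token, and group used above, and passes default=str to json.dump since anno.__dict__ may hold values that are not JSON-native.

import json

def write_json_export():
    # Hypothetical driver: dump the cleaned annotations to a dated JSON file.
    output_json, DATE = export_json_impl()
    fname = 'rrid-export-%s.json' % DATE
    with open(fname, 'wt', encoding='utf-8') as f:
        json.dump(output_json, f, indent=2, sort_keys=True, default=str)
    return fname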
Example 2
def rrid_wrapper(request, username, api_token, group, logloc):
    """ Receive an article, parse RRIDs, resolve them, create annotations, log results """
    if request.method == 'OPTIONS':
        response = Response()
        request_headers = request.headers['Access-Control-Request-Headers'].lower()
        request_headers = re.findall(r'\w(?:[-\w]*\w)', request_headers)
        response_headers = ['access-control-allow-origin']
        for req_acoa_header in request_headers:
            if req_acoa_header not in response_headers:
                response_headers.append(req_acoa_header)
        response_headers = ','.join(response_headers)
        response.headers.update({
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Headers': '%s' % response_headers
            })
        response.status_int = 204
        return response

    h = HypothesisUtils(username=username, token=api_token, group=group)

    target_uri = urlparse.parse_qs(request.text)['uri'][0]
    params = { 'limit':200, 'uri':target_uri }
    query_url = h.query_url_template.format(query=urlencode(params, True))
    obj = h.authenticated_api_query(query_url)
    rows = obj['rows']
    tags = set()
    for row in rows:
        if row['group'] != h.group:  # api query returns unwanted groups
            continue
        elif row['user'] != 'acct:' + h.username + '@hypothes.is':
            continue
        for tag in row['tags']:
            if tag.startswith('RRID'):
                tags.add(tag)
    html = urlparse.parse_qs(request.text)['data'][0]
    print(target_uri)

    found_rrids = {}
    try:
        matches = re.findall(r'(.{0,10})(RRID(:|\)*,*)[ \t]*)(\w+[_\-:]+[\w\-]+)([^\w].{0,10})', html.replace('–','-'))
        existing = []
        for match in matches:
            print(match)
            prefix = match[0]
            exact = match[3]
            if 'RRID:'+exact in tags:
                print('skipping %s, already annotated' % exact)
                continue

            new_tags = []
            if exact in existing:
                new_tags.append('RRIDCUR:Duplicate')
            else:
                existing.append(exact)

            found_rrids[exact] = None
            suffix = match[4]
            print('\t' + exact)
            resolver_uri = 'https://scicrunch.org/resolver/%s.xml' % exact
            r = requests.get(resolver_uri)
            print(r.status_code)
            xml = r.content
            found_rrids[exact] = r.status_code
            if r.status_code < 300:
                root = etree.fromstring(xml)
                if root.findall('error'):
                    s = 'Resolver lookup failed.'
                    s += '<hr><p><a href="%s">resolver lookup</a></p>' % resolver_uri
                    r = h.create_annotation_with_target_using_only_text_quote(url=target_uri, prefix=prefix, exact=exact, suffix=suffix, text=s, tags=new_tags + ['RRIDCUR:Unresolved'])
                    print('ERROR')
                else:
                    data_elements = root.findall('data')[0]
                    s = ''
                    data_elements = [(e.find('name').text, e.find('value').text) for e in data_elements]  # these shouldn't duplicate
                    citation = [(n, v) for n, v in data_elements if n == 'Proper Citation']
                    name = [(n, v) for n, v in data_elements if n == 'Name']
                    data_elements = citation + name + sorted([(n, v) for n, v in data_elements if n not in ('Proper Citation', 'Name') and v is not None])
                    for name, value in data_elements:
                        if (name == 'Reference' or name == 'Mentioned In Literature') and value is not None and value.startswith('<a class'):
                            if len(value) > 500:
                                continue  # nif-0000-30467 fix keep those pubmed links short!
                        s += '<p>%s: %s</p>' % (name, value)
                    s += '<hr><p><a href="%s">resolver lookup</a></p>' % resolver_uri
                    r = h.create_annotation_with_target_using_only_text_quote(url=target_uri, prefix=prefix, exact=exact, suffix=suffix, text=s, tags=new_tags + ['RRID:'+exact])
            else:
                s = 'Resolver lookup failed.'
                r = h.create_annotation_with_target_using_only_text_quote(url=target_uri, prefix=prefix, exact=exact, suffix=suffix, text=s, tags=new_tags + ['RRIDCUR:Unresolved'])
    except Exception:
        traceback.print_exc()  # log and keep going so a response is still returned

    results = ', '.join(found_rrids.keys())
    r = Response(results)
    r.content_type = 'text/plain'
    r.headers.update({
        'Access-Control-Allow-Origin': '*'
        })

    try:
        now = datetime.now().isoformat()[0:19].replace(':','').replace('-','')
        fname = logloc + 'rrid-%s.log' % now
        s = 'URL: %s\n\nResults: %s\n\nCount: %s\n\nText:\n\n%s' % ( target_uri, results, len(found_rrids), html ) 
        with open(fname, 'wb') as f:
            f.write(s.encode('utf-8'))
    except Exception:
        traceback.print_exc()  # a logging failure shouldn't take down the response

    return r
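
rrid_wrapper takes the deployment's credentials and log directory alongside the request, so the web framework must be handed a one-argument view. A functools.partial sketch of that wiring; the values below are placeholders, and route registration will differ by framework:

from functools import partial

# Hypothetical wiring: bind this deployment's settings once so the result
# can be registered as a plain view(request) callable.
rrid_view = partial(rrid_wrapper,
                    username='scibot',           # placeholder account
                    api_token='YOUR-API-TOKEN',  # placeholder token
                    group='__world__',
                    logloc='/tmp/rrid-logs/')    # placeholder log directory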
Example 3
File: export.py  Project: ctb/rrid
#!/usr/bin/env python3
from __future__ import print_function
from os import environ
from collections import defaultdict
from hypothesis import HypothesisUtils, HypothesisAnnotation

username = environ.get('RRIDBOT_USERNAME', 'USERNAME')  # Hypothesis account
password = environ.get('RRIDBOT_PASSWORD', 'PASSWORD')
group = environ.get('RRIDBOT_GROUP', '__world__')
print(username, group)  # sanity check
    
h = HypothesisUtils(username=username, password=password, group=group, max_results=5000)
h.login()
params = {'group' : h.group }
rows = h.search_all(params)
annos = [HypothesisAnnotation(row) for row in rows]
annotated_urls = defaultdict(list)
for anno in annos:
    annotated_urls[anno.uri].append(anno)

html = """<html>
<head><style>
body { font-family:verdana;margin:.75in }
.anno { margin: 20px;
    border-style: solid;
    border-width: thin;
    padding: 20px; }
.text { margin:20px }
.article { font-size:larger }
</style></head>
<body>"""
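
The excerpt ends just after the opening <body> tag. One plausible continuation, using the .article, .anno, and .text classes declared in the stylesheet above; this loop is a guess, not the project's actual report code:

for url, annos_for_url in annotated_urls.items():
    html += '<div class="article">%s (%s annotations)</div>\n' % (url, len(annos_for_url))
    for anno in annos_for_url:
        # anno.exact is the highlighted quote, anno.text the curator's note
        html += ('<div class="anno">%s'
                 '<div class="text">%s</div></div>\n') % (anno.exact or '', anno.text or '')
html += '</body>\n</html>'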
Example 4
def export_impl():
    h = HypothesisUtils(username=username, token=api_token, group=group, max_results=100000)
    params = {'group' : h.group }
    rows = h.search_all(params)
    annos = [HypothesisAnnotation(row) for row in rows]
    annotated_urls = defaultdict(list)
    for anno in annos:
        annotated_urls[anno.uri].append(anno)

    fix_trailing_slash(annotated_urls)  # helper not shown in this listing; a sketch follows this example

    output_rows = []
    for annotated_url in annotated_urls.keys():
        #print(annotated_url)
        annos = annotated_urls[annotated_url]
        replies = defaultdict(list)
        PMID = []
        for anno in annos:  # gotta build the reply structure and get pmid
            #print('id:', anno.id)
            #print('user:', anno.user)
            #print('exact:', anno.exact)
            #print('text:', anno.text)
            #print('tags:', anno.tags)
            #print('type:', anno.type)
            #print('references:', anno.references)
            if anno.references:
                for reference in anno.references:  # shouldn't there only be one???
                    replies[reference].append(anno)
            PMID.extend([tag for tag in anno.tags if tag.startswith('PMID:') and '_' not in tag])  # bad tags with PMID:SCR_
            #curators didn't put the pmid in as tags :(
            if anno.text.startswith('PMID:'):  # DANGER ZONE
                if '_' in anno.text:
                    print('PMIDS DONT HAVE UNDERSCORES PROBABLY CURATION BUG', anno.text)
                else:
                    PMID.append(anno.text.strip())  # because, yep, when you don't tag sometimes you get \n :/

        if PMID:
            if len(PMID) > 1:
                print(PMID, annotated_url)
                if PMID[0] == PMID[1]:
                    PMID = PMID[0]
                    print('WARNING: more than one pmid tag')
                else:
                    raise Exception('more than one pmid tag')
            else:
                PMID = PMID[0]
                #print(PMID)
        else:
            all_tags = []
            for a in annos:
                all_tags.extend(a.tags)
            #print('NO PMID FOR', annotated_url)
            #print(set([a.user for a in annos]))
            #print(all_tags)
            PMID = annotated_url

        RRIDs = defaultdict(list)
        EXACTs = {}
        CITEs = {}
        #USERs = {}
        for anno in annos:
            RRID = None
            additional = []
            for tag in anno.tags:
                if re.match('RRID:.+[0-9]+.+', tag):  # ARRRRGGGGHHHHHHH ARRRRGGHHHH
                #if re.match('RRID:.+', tag):  # ARRRRGGGGHHHHHHH ARRRRGGHHHH
                    if RRID is not None:
                        raise Exception('MORE THAN ONE RRID PER ENTRY!')
                    RRID = tag  # :/ this works for now but ARHGHHGHASFHAS
                else:
                    additional.append(tag)  # eg Unresolved

                if tag == 'RRIDCUR:Missing':  # fix for bad curation process
                    maybe_rrid = anno.text.strip()
                    if re.match('RRID:.+[0-9]+', maybe_rrid):  # ARRRRGGGGHHHHHHH ARRRRGGHHHH
                        RRID = maybe_rrid  # RRIDCUR:Missing was already added above

            if RRID is not None:
                EXACTs[RRID] = anno.exact.strip() if anno.exact else ''
                RRIDs[RRID].extend(additional)
                #USERs[RRID] = anno.user
                if RRID not in CITEs:
                    if anno.text:
                        if 'Proper Citation:' in anno.text:
                            CITEs[RRID] = anno.text.split('Proper Citation:')[1].strip().split('<',1)[0]

                if anno.id in replies:
                    for r_anno in replies[anno.id]:
                        RRIDs[RRID].extend(r_anno.tags)  # not worrying about the text here
            elif not anno.references and PMID not in anno.tags:  # this is an independent annotation which will not be included
                new = 'NONE:' + anno.id
                RRIDs[new].append('')
                EXACTs[new] = anno.exact
                #USERs[RRID] = anno.user

        for rrid, more in RRIDs.items():
            #FIXME TOOOOOO SLOW
            #r = requests.get('https://scicrunch.org/resolver/{RRID}.xml'.format(RRID=rrid))
            #if r.status_code < 300:
                #proper_citation = get_proper_citation(r.content)
            #else:
                #proper_citation = ''

            try:
                proper_citation = CITEs[rrid]
            except KeyError:  # FIXME this is a hack to avoid some cases of LWW for citations
                proper_citation = ''

            if not more:
                row = [PMID, rrid, '', annotated_url, EXACTs[rrid], proper_citation]
                output_rows.append(row)
            else:
                for val in set(more):  # cull dupes
                    row = [PMID, rrid, val, annotated_url, EXACTs[rrid], proper_citation]
                    output_rows.append(row)

    DATE = date.today().strftime('%Y-%m-%d')
    return output_rows, DATE
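
fix_trailing_slash is called near the top of export_impl but is not shown in this listing. Inferring from the name alone, it likely merges the buckets for 'http://example.org/paper' and 'http://example.org/paper/'; a hedged sketch of that guess:

def fix_trailing_slash(annotated_urls):
    # Assumed behavior, inferred from the name only: fold annotations filed
    # under 'url/' into the 'url' bucket so each article is counted once.
    for url in list(annotated_urls):
        if url.endswith('/') and url[:-1] in annotated_urls:
            annotated_urls[url[:-1]].extend(annotated_urls.pop(url))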
Example 5
def export_impl():
    h = HypothesisUtils(username=username, token=api_token, group=group, max_results=100000)
    params = {'group' : h.group }
    rows = h.search_all(params)
    annos = [HypothesisAnnotation(row) for row in rows]
    annotated_urls = defaultdict(list)
    for anno in annos:
        annotated_urls[anno.uri].append(anno)

    output_rows = []
    for annotated_url in annotated_urls.keys():
        #print(annotated_url)
        annos = annotated_urls[annotated_url]
        replies = defaultdict(list)
        PMID = []
        for anno in annos:  # gotta build the reply structure and get pmid
            #print('id:', anno.id)
            #print('user:', anno.user)
            #print('exact:', anno.exact)
            #print('text:', anno.text)
            #print('tags:', anno.tags)
            #print('type:', anno.type)
            #print('references:', anno.references)
            if anno.references:
                for reference in anno.references:  # shouldn't there only be one???
                    replies[reference].append(anno)
            PMID.extend([tag for tag in anno.tags if tag.startswith('PMID:') and '_' not in tag])  # bad tags with PMID:SCR_
            #curators didn't put the pmid in as tags :(
            if anno.text.startswith('PMID:'):  # DANGER ZONE
                if '_' in anno.text:
                    print('PMIDS DONT HAVE UNDERSCORES PROBABLY CURATION BUG', anno.text)
                else:
                    PMID.append(anno.text.strip())  # because, yep, when you don't tag sometimes you get \n :/

        if PMID:
            if len(PMID) > 1:
                print(PMID, annotated_url)
                if PMID[0] == PMID[1]:
                    PMID = PMID[0]
                    print('WARNING: more than one pmid tag')
                else:
                    raise Exception('more than one pmid tag')
            else:
                PMID = PMID[0]
                #print(PMID)
        else:
            all_tags = []
            for a in annos:
                all_tags.extend(a.tags)
            #print('NO PMID FOR', annotated_url)
            #print(set([a.user for a in annos]))
            #print(all_tags)
            PMID = annotated_url

        RRIDs = defaultdict(list)
        EXACTs = {}
        CITEs = {}
        #USERs = {}
        for anno in annos:
            RRID = None
            additional = []
            for tag in anno.tags:
                if re.match('RRID:.+[0-9]+.+', tag):  # ARRRRGGGGHHHHHHH ARRRRGGHHHH
                #if re.match('RRID:.+', tag):  # ARRRRGGGGHHHHHHH ARRRRGGHHHH
                    if RRID is not None:
                        raise Exception('MORE THAN ONE RRID PER ENTRY!')
                    RRID = tag  # :/ this works for now but ARHGHHGHASFHAS
                else:
                    additional.append(tag)  # eg Unresolved

                if tag == 'RRIDCUR:Missing':  # fix for bad curation process
                    maybe_rrid = anno.text.strip()
                    if re.match('RRID:.+[0-9]+', maybe_rrid):  # ARRRRGGGGHHHHHHH ARRRRGGHHHH
                        RRID = maybe_rrid  # RRIDCUR:Missing was already added above

            if RRID is not None:
                EXACTs[RRID] = anno.exact.strip() if anno.exact else ''
                RRIDs[RRID].extend(additional)
                #USERs[RRID] = anno.user
                if RRID not in CITEs:
                    if anno.text:
                        if 'Proper Citation:' in anno.text:
                            CITEs[RRID] = anno.text.split('Proper Citation:')[1].strip().split('<',1)[0]

                if anno.id in replies:
                    for r_anno in replies[anno.id]:
                        RRIDs[RRID].extend(r_anno.tags)  # not worrying about the text here
            elif not anno.references and PMID not in anno.tags:  # this is an independent annotation which will not be included
                new = 'NONE:' + anno.id
                RRIDs[new].append('')
                EXACTs[new] = anno.exact
                #USERs[RRID] = anno.user

        for rrid, more in RRIDs.items():
            #FIXME TOOOOOO SLOW
            #r = requests.get('https://scicrunch.org/resolver/{RRID}.xml'.format(RRID=rrid))
            #if r.status_code < 300:
                #proper_citation = get_proper_citation(r.content)
            #else:
                #proper_citation = ''

            try:
                proper_citation = CITEs[rrid]
            except KeyError:  # FIXME this is a hack to avoid some cases of LWW for citations
                proper_citation = ''

            if not more:
                row = [PMID, rrid, '', annotated_url, EXACTs[rrid], proper_citation]
                output_rows.append(row)
            else:
                for val in set(more):  # cull dupes
                    row = [PMID, rrid, val, annotated_url, EXACTs[rrid], proper_citation]
                    output_rows.append(row)

    DATE = date.today().strftime('%Y-%m-%d')
    return output_rows, DATE
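
This second export_impl differs from Example 4 only in skipping the fix_trailing_slash pass; the rows come back in the same shape. A hypothetical caller that writes them to a dated CSV, with header names inferred from the row layout:

import csv

def write_csv_export():
    # Hypothetical driver; the column names are inferred from the row layout
    # [PMID, rrid, extra_tag, annotated_url, exact, proper_citation].
    output_rows, DATE = export_impl()
    fname = 'rrid-export-%s.csv' % DATE
    with open(fname, 'wt', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['PMID', 'RRID', 'tag', 'url', 'exact', 'proper_citation'])
        writer.writerows(output_rows)
    return fname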
Example 6
#!/usr/bin/env python
import sys
from hypothesis import HypothesisUtils, HypothesisAnnotation

source_usernames = open('usernames.txt').read().splitlines()  # splitlines avoids a trailing empty entry
urls = open('urls.txt').read().splitlines()
target_username = ''
target_token = ''

h = HypothesisUtils(target_username, target_token)


def transfer():
    """ given a set of urls and users, copy (public) annotations to another users's account """
    for url in urls:
        for source_username in source_usernames:
            params = {'uri': url}
            rows = h.search_all(params)
            for row in list(rows):  # capture the original result set, else it'll keep growing as items are posted!
                anno = HypothesisAnnotation(row)
                if anno.user not in source_usernames:
                    continue
                row['user'] = row['user'].replace(source_username,
                                                  target_username)
                permissions = row['permissions']
                permission_fields = ['admin', 'update', 'delete']
                for field in permission_fields:
                    permissions[field][0] = permissions[field][0].replace(
                        source_username, target_username)
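
The excerpt stops after rewriting the ownership fields; nothing is ever posted back. A sketch of the missing final step, factored as a helper. It assumes HypothesisUtils exposes a post_annotation(payload) method (as in Jon Udell's hypothesis.py helpers); verify against your copy of the library:

def post_transferred_row(h, row):
    # Hypothetical final step for transfer(): push one rewritten row to the
    # target account. Assumes HypothesisUtils.post_annotation(payload) exists.
    payload = dict(row)
    payload.pop('id', None)  # let the server mint a fresh annotation id
    return h.post_annotation(payload)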
Example 7
def rrid(request):
    """ Receive an article, parse RRIDs, resolve them, create annotations, log results """
    if request.method == 'OPTIONS':
        response = Response()
        request_headers = request.headers['Access-Control-Request-Headers'].lower()
        request_headers = re.findall(r'\w(?:[-\w]*\w)', request_headers)
        response_headers = ['access-control-allow-origin']
        for req_acoa_header in request_headers:
            if req_acoa_header not in response_headers:
                response_headers.append(req_acoa_header)
        response_headers = ','.join(response_headers)
        response.headers.update({
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Headers': '%s' % response_headers
        })
        response.status_int = 204
        return response

    h = HypothesisUtils(username=username, token=api_token, group=group)

    target_uri = urlparse.parse_qs(request.text)['uri'][0]
    params = {'limit': 200, 'uri': target_uri}
    query_url = h.query_url_template.format(query=urlencode(params, True))
    obj = h.authenticated_api_query(query_url)
    rows = obj['rows']
    tags = set()
    for row in rows:
        if row['group'] != h.group:  # api query returns unwanted groups
            continue
        elif row['user'] != 'acct:' + h.username + '@hypothes.is':
            continue
        for tag in row['tags']:
            if tag.startswith('RRID'):
                tags.add(tag)
    html = urlparse.parse_qs(request.text)['data'][0]
    print(target_uri)

    found_rrids = {}
    try:
        matches = re.findall(r'(.{0,10})(RRID:\s*)([_\w\-:]+)([^\w].{0,10})',
                             html.replace('–', '-'))
        existing = []
        for match in matches:
            print(match)
            prefix = match[0]
            exact = match[2]
            if 'RRID:' + exact in tags:
                print('skipping %s, already annotated' % exact)
                continue

            new_tags = []
            if exact in existing:
                new_tags.append('RRIDCUR:Duplicate')
            else:
                existing.append(exact)

            found_rrids[exact] = None
            suffix = match[3]
            print('\t' + exact)
            resolver_uri = 'https://scicrunch.org/resolver/%s.xml' % exact
            r = requests.get(resolver_uri)
            print(r.status_code)
            xml = r.content
            found_rrids[exact] = r.status_code
            if r.status_code < 300:
                root = etree.fromstring(xml)
                if root.findall('error'):
                    s = 'Resolver lookup failed.'
                    s += '<hr><p><a href="%s">resolver lookup</a></p>' % resolver_uri
                    r = h.create_annotation_with_target_using_only_text_quote(
                        url=target_uri,
                        prefix=prefix,
                        exact=exact,
                        suffix=suffix,
                        text=s,
                        tags=new_tags + ['RRIDCUR:Unresolved'])
                    print('ERROR')
                else:
                    data_elements = root.findall('data')[0]
                    s = ''
                    data_elements = [(e.find('name').text, e.find('value').text)
                                     for e in data_elements]  # these shouldn't duplicate
                    citation = [(n, v) for n, v in data_elements if n == 'Proper Citation']
                    name = [(n, v) for n, v in data_elements if n == 'Name']
                    data_elements = citation + name + sorted(
                        [(n, v) for n, v in data_elements
                         if n not in ('Proper Citation', 'Name') and v is not None])
                    for name, value in data_elements:
                        if (name in ('Reference', 'Mentioned In Literature')
                                and value is not None
                                and value.startswith('<a class')):
                            if len(value) > 500:
                                continue  # nif-0000-30467 fix keep those pubmed links short!
                        s += '<p>%s: %s</p>' % (name, value)
                    s += '<hr><p><a href="%s">resolver lookup</a></p>' % resolver_uri
                    r = h.create_annotation_with_target_using_only_text_quote(
                        url=target_uri,
                        prefix=prefix,
                        exact=exact,
                        suffix=suffix,
                        text=s,
                        tags=new_tags + ['RRID:' + exact])
            else:
                s = 'Resolver lookup failed.'
                r = h.create_annotation_with_target_using_only_text_quote(
                    url=target_uri,
                    prefix=prefix,
                    exact=exact,
                    suffix=suffix,
                    text=s,
                    tags=new_tags + ['RRIDCUR:Unresolved'])
    except Exception:
        traceback.print_exc()  # log and keep going so a response is still returned

    results = ', '.join(found_rrids.keys())
    r = Response(results)
    r.content_type = 'text/plain'
    r.headers.update({'Access-Control-Allow-Origin': '*'})

    try:
        now = datetime.now().isoformat()[0:19].replace(':', '').replace('-', '')
        fname = 'rrid-%s.log' % now
        s = 'URL: %s\n\nResults: %s\n\nCount: %s\n\nText:\n\n%s' % (
            target_uri, results, len(found_rrids), html)
        with open(fname, 'wb') as f:
            f.write(s.encode('utf-8'))
    except Exception:
        traceback.print_exc()  # a logging failure shouldn't take down the response

    return r
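
Note that this version's match pattern is stricter than Example 2's: it requires a literal 'RRID:' (optionally followed by whitespace), while Example 2 also tolerates ')' or ',' between 'RRID' and the identifier. A self-contained check of what the four groups capture; the sample sentence is made up:

import re

sample = '(RRID: AB_2298772) was used, see also RRID:SCR_002145 online.'
pattern = r'(.{0,10})(RRID:\s*)([_\w\-:]+)([^\w].{0,10})'
for prefix, label, exact, suffix in re.findall(pattern, sample):
    # prefix and suffix give the annotation its anchoring context
    print(repr(prefix), repr(exact), repr(suffix))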