Example #1
0
def create_annotations(article_uri, content):
    """Annotate ``content`` and store the resulting spans as triples.

    Every annotator in the module-level ``annotators`` collection is added to
    an ``AnnoDoc`` built from ``content``. For each of the six keyword tiers
    one SPARQL update is rendered and sent through ``sparql_utils.update``.
    It inserts one annotation resource per span, links disease spans to the
    entities returned by ``resolve_keyword``, and marks the article as
    annotated by ``eha:annie_1``.

    Args:
        article_uri: URI of the source document; used as the
            ``anno:source_doc`` value and as part of every span URI hash.
        content: Text handed to ``AnnoDoc``.

    Raises:
        KeyError: If one of the expected tiers is missing from
            ``annotated_doc.tiers`` (assuming ``tiers`` behaves like a
            dict -- TODO confirm).
    """
    annotated_doc = AnnoDoc(content)
    for annotator in annotators:
        annotated_doc.add_tier(annotator)

    def to_bytes(value):
        # hashlib only accepts bytes. Encoding text as UTF-8 yields the same
        # digest as before for ASCII input, and no longer fails on Python 3
        # or on non-ASCII unicode URIs under Python 2.
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')

    def get_span_uri(span):
        # The span URI is derived from the article URI plus the span offsets.
        h = hashlib.md5()
        h.update(to_bytes(article_uri))
        h.update(to_bytes(str(span.start) + ':' + str(span.end)))
        return "http://www.eha.io/types/annotation/annie/span/" + str(
            h.hexdigest())

    # The template text does not depend on the tier, so it is built once and
    # rendered per tier instead of being rebuilt on every iteration.
    # NOTE(review): the `rdf:` prefix is bound to the rdf-schema namespace;
    # left untouched because changing it would change the stored data.
    update_template = make_template("""
        prefix anno: <http://www.eha.io/types/annotation_prop/>
        prefix eha: <http://www.eha.io/types/>
        prefix rdf: <http://www.w3.org/2000/01/rdf-schema#>
        prefix dc: <http://purl.org/dc/terms/>
        {% for span in spans %}
        INSERT DATA {
            <{{get_span_uri(span)}}> anno:annotator eha:annie
                {% if span.geoname %}
                    ; rdf:type eha:geoname_annotation
                    ; anno:geoname <http://sws.geonames.org/{{span.geoname.geonameid}}>
                {% else %}
                    ; rdf:type eha:keyword_annotation
                    ; anno:category "{{tier_name}}"
                {% endif %}
                ; anno:label "{{span.label | escape}}"
                ; anno:source_doc <{{source_doc}}>
                ; anno:start {{span.start}}
                ; anno:end {{span.end}}
                ; anno:selected-text "{{span.text | escape}}"
        } ;
        {% if tier_name == "diseases" %}
            INSERT DATA {
                {% for entity_uri in resolve_keyword(span.label) %}
                     <{{entity_uri}}> dc:relation <{{get_span_uri(span)}}> .
                {% endfor %}
            } ;
        {% endif %}
        {% endfor %}
        INSERT DATA {
            <{{source_doc}}> anno:annotated_by eha:annie_1
        }
        """)

    for tier_name in [
            'geonames', 'diseases', 'hosts', 'modes', 'pathogens', 'symptoms'
    ]:
        tier = annotated_doc.tiers[tier_name]
        update_query = update_template.render(
            get_span_uri=get_span_uri,
            resolve_keyword=resolve_keyword,
            source_doc=article_uri,
            tier_name=tier_name,
            spans=tier.spans)
        sparql_utils.update(update_query)
Example #2
0
def create_annotations(article_uri, content):
    """Run the annotators over ``content`` and write the spans to the store.

    An ``AnnoDoc`` is built from ``content`` and every entry of the
    module-level ``annotators`` is added to it as a tier. For each keyword
    tier a SPARQL update is rendered and passed to ``sparql_utils.update``:
    one annotation resource per span, ``dc:relation`` links from the entities
    returned by ``resolve_keyword`` for spans of the "diseases" tier, and a
    final triple flagging the article as annotated by ``eha:annie_1``.

    Args:
        article_uri: URI of the source document; written as
            ``anno:source_doc`` and hashed into every span URI.
        content: Text handed to ``AnnoDoc``.

    Raises:
        KeyError: If an expected tier is absent from ``annotated_doc.tiers``
            (assuming ``tiers`` is dict-like -- TODO confirm).
    """
    annotated_doc = AnnoDoc(content)
    for annotator in annotators:
        annotated_doc.add_tier(annotator)

    def as_bytes(value):
        # md5.update() needs bytes. UTF-8 keeps the digests of ASCII input
        # unchanged while making text input work on Python 3 and making
        # non-ASCII unicode work on Python 2.
        return value if isinstance(value, bytes) else value.encode('utf-8')

    def get_span_uri(span):
        # Hash of the article URI and the "start:end" offsets of the span.
        h = hashlib.md5()
        h.update(as_bytes(article_uri))
        h.update(as_bytes(str(span.start) + ':' + str(span.end)))
        return "http://www.eha.io/types/annotation/annie/span/" + str(h.hexdigest())

    # The template source is the same for every tier: build it once here and
    # only render inside the loop.
    # NOTE(review): `rdf:` is bound to the rdf-schema namespace below; kept
    # as-is because altering it would alter the triples that get stored.
    update_template = make_template("""
        prefix anno: <http://www.eha.io/types/annotation_prop/>
        prefix eha: <http://www.eha.io/types/>
        prefix rdf: <http://www.w3.org/2000/01/rdf-schema#>
        prefix dc: <http://purl.org/dc/terms/>
        {% for span in spans %}
        INSERT DATA {
            <{{get_span_uri(span)}}> anno:annotator eha:annie
                {% if span.geoname %}
                    ; rdf:type eha:geoname_annotation
                    ; anno:geoname <http://sws.geonames.org/{{span.geoname.geonameid}}>
                {% else %}
                    ; rdf:type eha:keyword_annotation
                    ; anno:category "{{tier_name}}"
                {% endif %}
                ; anno:label "{{span.label | escape}}"
                ; anno:source_doc <{{source_doc}}>
                ; anno:start {{span.start}}
                ; anno:end {{span.end}}
                ; anno:selected-text "{{span.text | escape}}"
        } ;
        {% if tier_name == "diseases" %}
            INSERT DATA {
                {% for entity_uri in resolve_keyword(span.label) %}
                     <{{entity_uri}}> dc:relation <{{get_span_uri(span)}}> .
                {% endfor %}
            } ;
        {% endif %}
        {% endfor %}
        INSERT DATA {
            <{{source_doc}}> anno:annotated_by eha:annie_1
        }
        """)

    for tier_name in ['geonames', 'diseases', 'hosts', 'modes', 'pathogens', 'symptoms']:
        tier = annotated_doc.tiers[tier_name]
        update_query = update_template.render(
            get_span_uri=get_span_uri,
            resolve_keyword=resolve_keyword,
            source_doc=article_uri,
            tier_name=tier_name,
            spans=tier.spans)
        sparql_utils.update(update_query)
Example #3
0
def resolve_keyword(keyword):
    """Return the URIs of ontology entities whose label matches ``keyword``.

    The query looks at entities that are (transitive) subclasses of
    ``obo:DOID_4`` and compares their narrow/related/exact synonyms and
    ``rdfs:label`` against the regex-escaped keyword, anchored at both ends
    and case-insensitive. A diagnostic line is printed when there is no
    match or more than one match.

    Args:
        keyword: Text to look up.

    Returns:
        A list with the ``entity`` value of every result binding; empty when
        nothing matched.
    """
    lookup_template = make_template("""
    prefix anno: <http://www.eha.io/types/annotation_prop/>
    prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    prefix obo: <http://purl.obolibrary.org/obo/>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?entity
    WHERE {
        BIND (obo:DOID_4 AS ?disease)
        ?entity rdfs:subClassOf* ?disease .
        ?entity oboInOwl:hasNarrowSynonym|oboInOwl:hasRelatedSynonym|oboInOwl:hasExactSynonym|rdfs:label ?label
        FILTER regex(?label, "^({{keyword | escape}})$", "i")
    }
    """)
    sparql_text = lookup_template.render(keyword=re.escape(keyword))
    matches = sparql_utils.query(sparql_text).json()['results']['bindings']

    match_count = len(matches)
    if match_count > 1:
        print("multiple matches for",
              keyword.encode('ascii', 'xmlcharrefreplace'))
        print(matches)
    elif match_count == 0:
        print("no match for", keyword.encode('ascii', 'xmlcharrefreplace'))

    entity_uris = []
    for match in matches:
        entity_uris.append(match['entity']['value'])
    return entity_uris
Example #4
0
def print_result(result):
    """Print every binding of a SPARQL JSON result in a readable form.

    For each variable of each binding the name is printed in brackets,
    followed by its value:

    * values containing ``;;`` are split on that delimiter and printed as a
      list;
    * values that start with the annotation URI prefix are looked up through
      ``sparql_utils.query`` and the annotated slice of the source text is
      printed instead of the URI;
    * any other value is printed unchanged.

    Args:
        result: Response object whose ``json()`` returns a dict with the
            bindings under ``['results']['bindings']``.
    """
    # print() is called with a single argument everywhere so the output is
    # the same whether or not the module enables print_function on Python 2,
    # and the function also parses on Python 3.
    for row in result.json()['results']['bindings']:
        for key, value in row.items():
            raw_val = value['value']
            print("[" + key + "]")
            # Check for the delimiter used to combine results in a
            # "group by" query group.
            if ";;" in raw_val:
                print(raw_val.split(";;"))
                continue
            # If the value references an annotation, query it and display
            # the full text.
            if raw_val.startswith('http://www.eha.io/types/annotation/'):
                query = make_template("""
                prefix anno: <http://www.eha.io/types/annotation_prop/>
                prefix dep: <http://www.eha.io/types/annotation_prop/dep/>
                prefix con: <http://www.eha.io/types/content/>
                SELECT ?phraseStart ?phraseEnd ?prepStart ?sourceText
                WHERE {
                    <{{annotation_uri}}> anno:start ?phraseStart
                        ; anno:end ?phraseEnd
                        ; anno:source_doc/con:text ?sourceText
                }
                """).render(annotation_uri=raw_val)
                # Separate names keep the lookup from rebinding `result` and
                # from shadowing the row being iterated by the outer loops.
                annotation_rows = sparql_utils.query(
                    query).json()['results']['bindings']
                if not annotation_rows:
                    print("Could not resolve source text for:")
                    print("%s %s" % (key, raw_val))
                for annotation_row in annotation_rows:
                    text = annotation_row['sourceText']['value']
                    start = int(annotation_row['phraseStart']['value'])
                    end = int(annotation_row['phraseEnd']['value'])
                    print(text[start:end])
            else:
                print(raw_val)
            print("")
        print("~~--~~--~~")
Example #5
0
def print_result(result):
    """Pretty-print the bindings of a SPARQL JSON result.

    Each variable name is printed in brackets, followed by its value. Values
    containing the ``;;`` delimiter are printed as the list obtained by
    splitting on it. Values that start with the annotation URI prefix are
    resolved through ``sparql_utils.query`` and replaced by the annotated
    slice of the source document text. Every other value is printed as-is.
    A separator line follows each binding.

    Args:
        result: Response object whose ``json()`` returns a dict holding the
            bindings under ``['results']['bindings']``.
    """
    # All print() calls take exactly one argument: that form behaves the same
    # as the former print statements on Python 2 and is valid on Python 3.
    for row in result.json()['results']['bindings']:
        for key, value in row.items():
            raw_val = value['value']
            print("[" + key + "]")
            # Check for the delimiter used to combine results in a
            # "group by" query group.
            if ";;" in raw_val:
                print(raw_val.split(";;"))
                continue
            # If the value references an annotation, query it and display
            # the full text.
            if raw_val.startswith('http://www.eha.io/types/annotation/'):
                query = make_template("""
                prefix anno: <http://www.eha.io/types/annotation_prop/>
                prefix dep: <http://www.eha.io/types/annotation_prop/dep/>
                prefix con: <http://www.eha.io/types/content/>
                SELECT ?phraseStart ?phraseEnd ?prepStart ?sourceText
                WHERE {
                    <{{annotation_uri}}> anno:start ?phraseStart
                        ; anno:end ?phraseEnd
                        ; anno:source_doc/con:text ?sourceText
                }
                """).render(annotation_uri=raw_val)
                # Dedicated names avoid rebinding the `result` argument and
                # shadowing the outer loop's row.
                lookup = sparql_utils.query(query)
                annotation_rows = lookup.json()['results']['bindings']
                if not annotation_rows:
                    print("Could not resolve source text for:")
                    print("%s %s" % (key, raw_val))
                for annotation_row in annotation_rows:
                    text = annotation_row['sourceText']['value']
                    start = int(annotation_row['phraseStart']['value'])
                    end = int(annotation_row['phraseEnd']['value'])
                    print(text[start:end])
            else:
                print(raw_val)
            print("")
        print("~~--~~--~~")
Example #6
0
def resolve_keyword(keyword):
    """Find ontology entities whose label or synonym equals ``keyword``.

    Only entities that are (transitive) subclasses of ``obo:DOID_4`` are
    considered. The keyword is regex-escaped and matched whole and
    case-insensitively against the narrow, related and exact synonyms and
    the ``rdfs:label`` of each entity. A note is printed when the lookup
    yields zero results or more than one.

    Args:
        keyword: Text to look up.

    Returns:
        List of the ``entity`` values of all result bindings (possibly
        empty).
    """
    escaped_keyword = re.escape(keyword)
    rendered_query = make_template("""
    prefix anno: <http://www.eha.io/types/annotation_prop/>
    prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    prefix obo: <http://purl.obolibrary.org/obo/>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?entity
    WHERE {
        BIND (obo:DOID_4 AS ?disease)
        ?entity rdfs:subClassOf* ?disease .
        ?entity oboInOwl:hasNarrowSynonym|oboInOwl:hasRelatedSynonym|oboInOwl:hasExactSynonym|rdfs:label ?label
        FILTER regex(?label, "^({{keyword | escape}})$", "i")
    }
    """).render(keyword=escaped_keyword)
    response = sparql_utils.query(rendered_query)
    rows = response.json()['results']['bindings']

    how_many = len(rows)
    if how_many == 0:
        print("no match for", keyword.encode('ascii', 'xmlcharrefreplace'))
    elif how_many > 1:
        print("multiple matches for", keyword.encode('ascii', 'xmlcharrefreplace'))
        print(rows)

    uris = []
    for row in rows:
        uris.append(row['entity']['value'])
    return uris
import json

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--mongo_url", default='localhost')
    parser.add_argument("--db_name", default='t11')
    args = parser.parse_args()
    db = pymongo.MongoClient(args.mongo_url)[args.db_name]
    for document in db.documents.find({}):
        uri = "http://t11.tater.io/documents/" + document['_id']
        update_query = make_template("""
        prefix xsd: <http://www.w3.org/2001/XMLSchema#>
        prefix tater: <http://www.eha.io/types/tater/>
        prefix con: <http://www.eha.io/types/content/>
        INSERT DATA {
            <{{uri}}> con:text "{{doc.body | escape}}"
            {% for key in ['title', 'createdAt'] %}
                ; tater:{{key}} {{doc[key] | sparqlCast}}
            {% endfor %}
            .
        }
        """).render(uri=uri, doc=document)
        sparql_utils.update(update_query)
        print("Imported " + uri)
    for code in db.keywords.find({}):
        uri = "http://t11.tater.io/codingKeywords/" + code['_id']
        update_query = make_template("""
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix tater: <http://www.eha.io/types/tater/>
        INSERT DATA {
            <{{uri}}> rdfs:label "{{code['label'] | escape}}"
                ; tater:header <{{header_uri}}>
Example #8
0
def create_annotations(article_uri, content):
    """Parse ``content`` with spaCy and store its dependency phrases.

    Each token is mapped to the character range covered by itself and all of
    its dependents. For every token three inserts are rendered: the phrase
    (range, text, root token), the token itself (text, part of speech and,
    when set, entity type) and the dependency edge from the phrase of the
    token's head to the token's phrase. The inserts are sent through
    ``sparql_utils.update`` in batches of 200 tokens, followed by one update
    that marks the article as annotated by ``eha:spacy_0``.

    Args:
        article_uri: URI of the source document; written as
            ``anno:source_doc`` and hashed into every token and phrase URI.
        content: Text handed to ``spacy_parser``.
    """
    doc = spacy_parser(content)
    token_to_range = {}

    def update_range(r1, r2):
        # Smallest range covering both arguments; None means "no range yet".
        if r1 is None:
            return r2
        if r2 is None:
            return r1
        return [min(r1[0], r2[0]), max(r1[1], r2[1])]

    # Walk from every token up to the root of its sentence, widening the
    # range of each ancestor so that it covers the token it was reached from.
    for start_token in doc:
        node = child = start_token
        while True:
            own_span = [node.idx, node.idx + len(node.text)]
            token_to_range[node] = update_range(
                token_to_range.get(node),
                update_range(token_to_range.get(child), own_span))
            if node.dep_ == 'ROOT':
                break
            child, node = node, node.head

    def to_bytes(value):
        # hashlib only accepts bytes. UTF-8 leaves the digests of ASCII input
        # unchanged and makes text input work on Python 3 as well as
        # non-ASCII unicode on Python 2.
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')

    def get_token_uri(token):
        # Token URIs hash the article URI and the token's character offset.
        h = hashlib.md5()
        h.update(to_bytes(article_uri))
        h.update(to_bytes(str(token.idx)))
        return "http://www.eha.io/types/annotation/spacy/" + str(h.hexdigest())

    def get_pharse_uri(token):
        # Phrase URIs hash the article URI and the "start:end" phrase range.
        h = hashlib.md5()
        h.update(to_bytes(article_uri))
        start, end = token_to_range[token]
        assert isinstance(start, int)
        assert isinstance(end, int)
        h.update(to_bytes(str(start) + ':' + str(end)))
        return "http://www.eha.io/types/annotation/spacy/phrase/" + str(h.hexdigest())

    # The template source is identical for every token, so it is built once
    # instead of once per token.
    token_template = make_template("""
        INSERT DATA {
            <{{pharse_ref}}> rdf:type eha:dependent_pharse
                ; anno:annotator eha:spacy
                ; anno:source_doc <{{source_doc}}>
                ; anno:start {{phrase_start}}
                ; anno:end {{phrase_end}}
                ; anno:selected-text "{{phrase_text | escape}}"
                ; anno:root <{{token_ref}}>
        } ;
        INSERT DATA {
            <{{token_ref}}> rdf:label "{{root_word | escape}}"
                ; anno:pos "{{pos}}"
                {% if entity_type %}
                    ; anno:entity_type "{{entity_type}}"
                {% endif %}
        } ;
        INSERT DATA {
            <{{parent_phrase_ref}}> dep:{{dep | replace_invalid_uri_chars}} <{{pharse_ref}}>
        }
        """)

    token_inserts = []
    for token in doc:
        phrase_start, phrase_end = token_to_range[token]
        token_inserts.append(token_template.render(
            source_doc=article_uri,
            phrase_start=phrase_start,
            phrase_end=phrase_end,
            phrase_text=doc.text[phrase_start:phrase_end],
            root_word=token.text,
            pos=token.pos_,
            entity_type=token.ent_type_,
            token_ref=get_token_uri(token),
            pharse_ref=get_pharse_uri(token),
            parent_phrase_ref=get_pharse_uri(token.head),
            dep=token.dep_))

    # The rendered inserts carry no prefixes of their own; they are declared
    # once per batch of 200 tokens.
    for chunk in more_itertools.chunked(token_inserts, 200):
        sparql_utils.update("""
        prefix anno: <http://www.eha.io/types/annotation_prop/>
        prefix dep: <http://www.eha.io/types/annotation_prop/dep/>
        prefix eha: <http://www.eha.io/types/>
        prefix rdf: <http://www.w3.org/2000/01/rdf-schema#>
        """ + ";".join(chunk))

    sparql_utils.update(make_template("""
    prefix anno: <http://www.eha.io/types/annotation_prop/>
    prefix eha: <http://www.eha.io/types/>
    INSERT DATA {
        <{{source_doc}}> anno:annotated_by eha:spacy_0
    }
    """).render(source_doc=article_uri))
Example #9
0
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--max_items", default="-1"
    )
    args = parser.parse_args()
    max_items = int(args.max_items)
    article_query_template = make_template("""
    prefix con: <http://www.eha.io/types/content/>
    prefix anno: <http://www.eha.io/types/annotation_prop/>
    prefix eha: <http://www.eha.io/types/>
    SELECT ?item_uri ?content
    WHERE {
        ?item_uri con:text ?content
        FILTER NOT EXISTS {
            ?item_uri anno:annotated_by eha:spacy_0
        }
    }
    ORDER BY asc(?item_uri)
    LIMIT 100
    """)
    items_processed = 0
    while max_items < 0 or items_processed < max_items:
        print("Items processed: ", str(items_processed))
        result = sparql_utils.query(article_query_template.render())
        bindings = result.json()['results']['bindings']
        if len(bindings) == 0:
            print("No more results")
            break
        else:
 if min_date:
     query["promedDate"] = {"$gte": min_date}
 print("Number of articles to process:")
 print(db.posts.find(query).count())
 for post in db.posts.find(query):
     # Create triples for post
     post_uri = "http://www.promedmail.org/post/" + post['promedId']
     update_query = make_template(prefixes + """
     INSERT DATA {
         <{{post_uri}}> pro:date "{{promedDate | sparqlDate}}"^^xsd:dateTime
             ; pro:subject_raw "{{subject.raw | escape}}"
             ; pro:archiveNumber "{{archiveNumber}}"
         {% for linkedReport in resolvedLinkedReports %}
             ; pro:linkedReport <{{linkedReport}}>
         {% endfor %}
         {% if feedId %}
             ; pro:feed_id "{{feedId}}"
         {% endif %}
     }
     """).render(min_date=min_date,
                 post_uri=post_uri,
                 resolvedLinkedReports=filter(
                     lambda x: x, map(resolve_report,
                                      post['linkedReports'])),
                 **post)
     sparql_utils.update(update_query)
     for idx, art in enumerate(post["articles"]):
         if not 'content' in art: continue
         # Create triples for article within the post
         article_uri = post_uri + "#" + str(idx)
         update_query = make_template(prefixes + """
         INSERT DATA {
Example #11
0
# Script entry point: repeatedly asks the SPARQL endpoint for items that do
# not yet carry the eha:annie_1 annotation marker.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    # The option is read as a string and converted with int() below. With
    # the default of -1 the `max_items < 0` test is always true, so only the
    # empty-result break further down ends the loop.
    parser.add_argument(
        "--max_items", default="-1"
    )
    args = parser.parse_args()
    max_items = int(args.max_items)
    # Selects at most 100 (?item_uri, ?content) pairs, ordered by rand(), for
    # items that have con:text but no `anno:annotated_by eha:annie_1` triple.
    query_template = make_template("""
    prefix con: <http://www.eha.io/types/content/>
    prefix anno: <http://www.eha.io/types/annotation_prop/>
    prefix eha: <http://www.eha.io/types/>
    SELECT ?item_uri ?content
    WHERE {
        ?item_uri con:text ?content
        # FILTER(strstarts(str(?item_uri), "http://t11.tater.io/documents/"))
        FILTER NOT EXISTS {
            ?item_uri anno:annotated_by eha:annie_1
        }
    }
    ORDER BY rand()
    LIMIT 100
    """)
    items_processed = 0
    # NOTE(review): nothing in the visible part of this loop handles the
    # returned bindings or increments items_processed; the excerpt appears
    # to be cut off after the break -- confirm against the full file.
    while max_items < 0 or items_processed < max_items:
        print("Items processed: ", str(items_processed))
        # The template takes no variables; each pass re-runs the same query.
        result = sparql_utils.query(query_template.render())
        bindings = result.json()['results']['bindings']
        # An empty batch means no item matched the query this time.
        if len(bindings) == 0:
            print("No more results")
            break
 )
 parser.add_argument(
     "--db_name", default='t11'
 )
 args = parser.parse_args()
 db = pymongo.MongoClient(args.mongo_url)[args.db_name]
 for document in db.documents.find({}):
     uri = "http://t11.tater.io/documents/" + document['_id']
     update_query = make_template("""
     prefix xsd: <http://www.w3.org/2001/XMLSchema#>
     prefix tater: <http://www.eha.io/types/tater/>
     prefix con: <http://www.eha.io/types/content/>
     INSERT DATA {
         <{{uri}}> con:text "{{doc.body | escape}}"
         {% for key in ['title', 'createdAt'] %}
             ; tater:{{key}} {{doc[key] | sparqlCast}}
         {% endfor %}
         .
     }
     """).render(
         uri=uri,
         doc=document
     )
     sparql_utils.update(update_query)
     print("Imported " + uri)
 for code in db.keywords.find({}):
     uri = "http://t11.tater.io/codingKeywords/" + code['_id']
     update_query = make_template("""
     prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
     prefix tater: <http://www.eha.io/types/tater/>
     INSERT DATA {
prefix xsd: <http://www.w3.org/2001/XMLSchema#>
"""

# Template for a SPARQL update that inserts `?p1 anno:contains ?p2` for every
# pair of distinct annotations from the same source document where the
# start/end range of ?p1 encloses that of ?p2. ?p1 must be the object of a
# triple whose predicate is typed anno:dependency_relation, and ?p2 must have
# the category "diseases". When `min_date` / `max_date` are supplied at render
# time, the pair is further restricted by the pro:date of the source's post
# (min inclusive, max exclusive).
containment_query_template = make_template(prefixes+"""
INSERT { ?p1 anno:contains ?p2 }
WHERE {
    ?p1 anno:start ?p1start
    ; anno:end ?p1end
    ; anno:source_doc ?same_source
    .
    ?dep_rel rdf:type anno:dependency_relation .
    ?parent ?dep_rel ?p1 .
    ?p2 anno:start ?p2start
    ; anno:end ?p2end
    ; anno:source_doc ?same_source
    ; anno:category "diseases"
    .
    ?same_source pro:post/pro:date ?source_date .
    FILTER ( ?p1start <= ?p2start && ?p1end >= ?p2end )
    FILTER (?p1 != ?p2)
    {% if min_date %}
        FILTER (?source_date >= "{{min_date | sparqlDate}}"^^xsd:dateTime)
    {% endif %}
    {% if max_date %}
        FILTER (?source_date < "{{max_date | sparqlDate}}"^^xsd:dateTime)
    {% endif %}
}
""")

if __name__ == '__main__':
    import argparse
         "$gte": min_date
     }
 print("Number of articles to process:")
 print(db.posts.find(query).count())
 for post in db.posts.find(query):
     # Create triples for post
     post_uri = "http://www.promedmail.org/post/" + post['promedId']
     update_query = make_template(prefixes+"""
     INSERT DATA {
         <{{post_uri}}> pro:date "{{promedDate | sparqlDate}}"^^xsd:dateTime
             ; pro:subject_raw "{{subject.raw | escape}}"
             ; pro:archiveNumber "{{archiveNumber}}"
         {% for linkedReport in resolvedLinkedReports %}
             ; pro:linkedReport <{{linkedReport}}>
         {% endfor %}
         {% if feedId %}
             ; pro:feed_id "{{feedId}}"
         {% endif %}
     }
     """).render(
         min_date=min_date,
         post_uri=post_uri,
         resolvedLinkedReports=filter(lambda x:x, map(resolve_report, post['linkedReports'])),
         **post)
     sparql_utils.update(update_query)
     for idx, art in enumerate(post["articles"]):
         if not 'content' in art: continue
         # Create triples for article within the post
         article_uri = post_uri + "#" + str(idx)
         update_query = make_template(prefixes+"""
         INSERT DATA {
Example #15
0

# Script entry point: polls the SPARQL endpoint for items that have not been
# marked as annotated by eha:annie_1.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    # Parsed as a string and converted with int() below; the default of -1
    # satisfies `max_items < 0`, so the loop is then bounded only by the
    # empty-result break.
    parser.add_argument("--max_items", default="-1")
    args = parser.parse_args()
    max_items = int(args.max_items)
    # Up to 100 (?item_uri, ?content) pairs in rand() order, limited to items
    # with con:text and without an `anno:annotated_by eha:annie_1` triple.
    query_template = make_template("""
    prefix con: <http://www.eha.io/types/content/>
    prefix anno: <http://www.eha.io/types/annotation_prop/>
    prefix eha: <http://www.eha.io/types/>
    SELECT ?item_uri ?content
    WHERE {
        ?item_uri con:text ?content
        # FILTER(strstarts(str(?item_uri), "http://t11.tater.io/documents/"))
        FILTER NOT EXISTS {
            ?item_uri anno:annotated_by eha:annie_1
        }
    }
    ORDER BY rand()
    LIMIT 100
    """)
    items_processed = 0
    # NOTE(review): the visible loop body neither consumes the bindings nor
    # advances items_processed; the excerpt seems truncated after the break
    # -- confirm against the full file.
    while max_items < 0 or items_processed < max_items:
        print("Items processed: ", str(items_processed))
        # Rendering takes no variables, so the same query runs on each pass.
        result = sparql_utils.query(query_template.render())
        bindings = result.json()['results']['bindings']
        # Stop as soon as the endpoint returns no matching items.
        if len(bindings) == 0:
            print("No more results")
            break
Example #16
0
prefix xsd: <http://www.w3.org/2001/XMLSchema#>
"""

# SPARQL update template that records containment between annotations: it
# inserts `?p1 anno:contains ?p2` when both belong to the same source
# document, are distinct, and ?p1's start/end range encloses ?p2's. ?p1 has to
# appear as the object of a predicate typed anno:dependency_relation and ?p2
# has to be in the "diseases" category. The optional `min_date` (inclusive)
# and `max_date` (exclusive) render variables filter on the pro:date of the
# post the source document belongs to.
containment_query_template = make_template(prefixes + """
INSERT { ?p1 anno:contains ?p2 }
WHERE {
    ?p1 anno:start ?p1start
    ; anno:end ?p1end
    ; anno:source_doc ?same_source
    .
    ?dep_rel rdf:type anno:dependency_relation .
    ?parent ?dep_rel ?p1 .
    ?p2 anno:start ?p2start
    ; anno:end ?p2end
    ; anno:source_doc ?same_source
    ; anno:category "diseases"
    .
    ?same_source pro:post/pro:date ?source_date .
    FILTER ( ?p1start <= ?p2start && ?p1end >= ?p2end )
    FILTER (?p1 != ?p2)
    {% if min_date %}
        FILTER (?source_date >= "{{min_date | sparqlDate}}"^^xsd:dateTime)
    {% endif %}
    {% if max_date %}
        FILTER (?source_date < "{{max_date | sparqlDate}}"^^xsd:dateTime)
    {% endif %}
}
""")

if __name__ == '__main__':
    import argparse