def create_annotations(article_uri, content):
    """Run the annie annotators over *content* and INSERT one RDF
    annotation resource per extracted span, keyed to *article_uri*.

    Finishes by marking the document as annotated_by eha:annie_1.
    """
    doc = AnnoDoc(content)
    for tagger in annotators:
        doc.add_tier(tagger)

    def span_uri(span):
        # Span URIs are content-addressed: md5 of the document URI plus
        # the span's character offsets, so re-annotation is idempotent.
        digest = hashlib.md5()
        digest.update(article_uri)
        digest.update(str(span.start) + ':' + str(span.end))
        return ("http://www.eha.io/types/annotation/annie/span/" +
                str(digest.hexdigest()))

    tier_names = ['geonames', 'diseases', 'hosts', 'modes',
                  'pathogens', 'symptoms']
    for name in tier_names:
        spans = doc.tiers[name].spans
        update_query = make_template("""
        prefix anno: <http://www.eha.io/types/annotation_prop/>
        prefix eha: <http://www.eha.io/types/>
        prefix rdf: <http://www.w3.org/2000/01/rdf-schema#>
        prefix dc: <http://purl.org/dc/terms/>
        {% for span in spans %}
        INSERT DATA {
            <{{get_span_uri(span)}}> anno:annotator eha:annie
            {% if span.geoname %}
            ; rdf:type eha:geoname_annotation
            ; anno:geoname <http://sws.geonames.org/{{span.geoname.geonameid}}>
            {% else %}
            ; rdf:type eha:keyword_annotation
            ; anno:category "{{tier_name}}"
            {% endif %}
            ; anno:label "{{span.label | escape}}"
            ; anno:source_doc <{{source_doc}}>
            ; anno:start {{span.start}}
            ; anno:end {{span.end}}
            ; anno:selected-text "{{span.text | escape}}"
        } ;
        {% if tier_name == "diseases" %}
        INSERT DATA {
            {% for entity_uri in resolve_keyword(span.label) %}
            <{{entity_uri}}> dc:relation <{{get_span_uri(span)}}> .
            {% endfor %}
        } ;
        {% endif %}
        {% endfor %}
        INSERT DATA {
            <{{source_doc}}> anno:annotated_by eha:annie_1
        }
        """).render(
            get_span_uri=span_uri,
            resolve_keyword=resolve_keyword,
            source_doc=article_uri,
            tier_name=name,
            spans=spans)
        sparql_utils.update(update_query)
def create_annotations(article_uri, content):
    """Annotate *content* with annie and write each tier's span
    annotations to the triple store as eha:annie annotations; finally
    tag the source document as annotated_by eha:annie_1.
    """
    annotated = AnnoDoc(content)
    for annotator in annotators:
        annotated.add_tier(annotator)

    def get_span_uri(span):
        # Hash the document URI together with the span offsets so the
        # same span always maps to the same annotation URI.
        md5 = hashlib.md5()
        md5.update(article_uri)
        md5.update(str(span.start) + ':' + str(span.end))
        return "http://www.eha.io/types/annotation/annie/span/" + str(md5.hexdigest())

    for tier_name in ('geonames', 'diseases', 'hosts', 'modes',
                      'pathogens', 'symptoms'):
        tier = annotated.tiers[tier_name]
        sparql_utils.update(make_template("""
        prefix anno: <http://www.eha.io/types/annotation_prop/>
        prefix eha: <http://www.eha.io/types/>
        prefix rdf: <http://www.w3.org/2000/01/rdf-schema#>
        prefix dc: <http://purl.org/dc/terms/>
        {% for span in spans %}
        INSERT DATA {
            <{{get_span_uri(span)}}> anno:annotator eha:annie
            {% if span.geoname %}
            ; rdf:type eha:geoname_annotation
            ; anno:geoname <http://sws.geonames.org/{{span.geoname.geonameid}}>
            {% else %}
            ; rdf:type eha:keyword_annotation
            ; anno:category "{{tier_name}}"
            {% endif %}
            ; anno:label "{{span.label | escape}}"
            ; anno:source_doc <{{source_doc}}>
            ; anno:start {{span.start}}
            ; anno:end {{span.end}}
            ; anno:selected-text "{{span.text | escape}}"
        } ;
        {% if tier_name == "diseases" %}
        INSERT DATA {
            {% for entity_uri in resolve_keyword(span.label) %}
            <{{entity_uri}}> dc:relation <{{get_span_uri(span)}}> .
            {% endfor %}
        } ;
        {% endif %}
        {% endfor %}
        INSERT DATA {
            <{{source_doc}}> anno:annotated_by eha:annie_1
        }
        """).render(get_span_uri=get_span_uri,
                    resolve_keyword=resolve_keyword,
                    source_doc=article_uri,
                    tier_name=tier_name,
                    spans=tier.spans))
def resolve_keyword(keyword):
    """Return the Disease Ontology entity URIs whose label or synonyms
    match *keyword* exactly (case-insensitive).

    Logs (via print) when the keyword has no match or is ambiguous.
    """
    query = make_template("""
    prefix anno: <http://www.eha.io/types/annotation_prop/>
    prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    prefix obo: <http://purl.obolibrary.org/obo/>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?entity WHERE {
        BIND (obo:DOID_4 AS ?disease)
        ?entity rdfs:subClassOf* ?disease .
        ?entity oboInOwl:hasNarrowSynonym|oboInOwl:hasRelatedSynonym|oboInOwl:hasExactSynonym|rdfs:label ?label
        FILTER regex(?label, "^({{keyword | escape}})$", "i")
    }
    """).render(keyword=re.escape(keyword))
    response = sparql_utils.query(query)
    matches = response.json()['results']['bindings']
    if not matches:
        print("no match for", keyword.encode('ascii', 'xmlcharrefreplace'))
    elif len(matches) > 1:
        print("multiple matches for", keyword.encode('ascii', 'xmlcharrefreplace'))
        print(matches)
    return [match['entity']['value'] for match in matches]
# Pretty-prints the bindings of a SPARQL query result: splits ";;"-delimited
# "group by" values, and when a value is an annotation URI it queries the
# store for the annotation's offsets and source text and prints the selected
# phrase. NOTE(review): Python 2 print statements; this chunk's original
# indentation was lost (flattened to one line), so the exact nesting of the
# trailing `print ""` / `print "~~--~~--~~"` separators cannot be recovered
# here — code kept verbatim.
def print_result(result): for binding in result.json()['results']['bindings']: for key, value in binding.items(): raw_val = value['value'] print "[" + key + "]" # Check for the delimiter used to combine results in a "group by" query group. if ";;" in raw_val: print raw_val.split(";;") continue # If the value references an annotation, query it and display # the full text. if raw_val.startswith('http://www.eha.io/types/annotation/'): query = make_template(""" prefix anno: <http://www.eha.io/types/annotation_prop/> prefix dep: <http://www.eha.io/types/annotation_prop/dep/> prefix con: <http://www.eha.io/types/content/> SELECT ?phraseStart ?phraseEnd ?prepStart ?sourceText WHERE { <{{annotation_uri}}> anno:start ?phraseStart ; anno:end ?phraseEnd ; anno:source_doc/con:text ?sourceText } """).render(annotation_uri=raw_val) result = sparql_utils.query(query) bindings = result.json()['results']['bindings'] if len(bindings) == 0: print "Could not resolve source text for:" print key, raw_val for binding in bindings: text = binding['sourceText']['value'] start = int(binding['phraseStart']['value']) end = int(binding['phraseEnd']['value']) print text[start:end] else: print raw_val print "" print "~~--~~--~~"
def resolve_keyword(keyword):
    """Map a disease keyword to matching Disease Ontology entity URIs.

    Matching is a case-insensitive exact match against rdfs:label and the
    oboInOwl synonym properties, restricted to subclasses of DOID_4.
    """
    rendered = make_template("""
    prefix anno: <http://www.eha.io/types/annotation_prop/>
    prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    prefix obo: <http://purl.obolibrary.org/obo/>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?entity WHERE {
        BIND (obo:DOID_4 AS ?disease)
        ?entity rdfs:subClassOf* ?disease .
        ?entity oboInOwl:hasNarrowSynonym|oboInOwl:hasRelatedSynonym|oboInOwl:hasExactSynonym|rdfs:label ?label
        FILTER regex(?label, "^({{keyword | escape}})$", "i")
    }
    """).render(keyword=re.escape(keyword))
    bindings = sparql_utils.query(rendered).json()['results']['bindings']
    n_matches = len(bindings)
    if n_matches == 0:
        print("no match for", keyword.encode('ascii', 'xmlcharrefreplace'))
    elif n_matches > 1:
        print("multiple matches for", keyword.encode('ascii', 'xmlcharrefreplace'))
        print(bindings)
    return [b['entity']['value'] for b in bindings]
# Script entry point: copies t11 tater documents (and the start of the coding
# keywords) from MongoDB into the SPARQL store as con:text / tater:* triples.
# NOTE(review): chunk is truncated — the keyword INSERT template is
# unterminated (cut off after "tater:header <{{header_uri}}>"); kept verbatim.
import json if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("--mongo_url", default='localhost') parser.add_argument("--db_name", default='t11') args = parser.parse_args() db = pymongo.MongoClient(args.mongo_url)[args.db_name] for document in db.documents.find({}): uri = "http://t11.tater.io/documents/" + document['_id'] update_query = make_template(""" prefix xsd: <http://www.w3.org/2001/XMLSchema#> prefix tater: <http://www.eha.io/types/tater/> prefix con: <http://www.eha.io/types/content/> INSERT DATA { <{{uri}}> con:text "{{doc.body | escape}}" {% for key in ['title', 'createdAt'] %} ; tater:{{key}} {{doc[key] | sparqlCast}} {% endfor %} . } """).render(uri=uri, doc=document) sparql_utils.update(update_query) print("Imported " + uri) for code in db.keywords.find({}): uri = "http://t11.tater.io/codingKeywords/" + code['_id'] update_query = make_template(""" prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> prefix tater: <http://www.eha.io/types/tater/> INSERT DATA { <{{uri}}> rdfs:label "{{code['label'] | escape}}" ; tater:header <{{header_uri}}>
# spaCy-based annotator: parses the article, walks each token up the
# dependency tree to accumulate a character range for the phrase headed by
# each token (update_range merges [start, end] intervals), derives
# content-addressed token/phrase URIs from md5(article_uri + offsets), and
# renders one INSERT-statement group per token into token_inserts.
# NOTE(review): chunk flattened to one line and ends mid-`for` — the chunk
# body (batched SPARQL updates) continues in the next chunk; the helper name
# "pharse" is a typo for "phrase" that also appears in the stored URIs and
# the eha:dependent_pharse type, so renaming would change stored data.
# Kept verbatim.
def create_annotations(article_uri, content): doc = spacy_parser(content) token_to_range = {} def update_range(r1, r2): if r1 is None: return r2 if r2 is None: return r1 return [min(r1[0], r2[0]), max(r1[1], r2[1])] for token in doc: child_token = token # import pdb; pdb.set_trace() while True: token_to_range[token] = update_range( token_to_range.get(token), update_range( token_to_range.get(child_token), [token.idx, token.idx + len(token.text)])) if token.dep_ != 'ROOT': child_token = token token = token.head else: break def get_token_uri(token): h = hashlib.md5() h.update(article_uri) h.update(str(token.idx)) return "http://www.eha.io/types/annotation/spacy/" + str(h.hexdigest()) def get_pharse_uri(token): h = hashlib.md5() h.update(article_uri) start, end = token_to_range[token] assert isinstance(start, int) assert isinstance(end, int) h.update(str(start) + ':' + str(end)) return "http://www.eha.io/types/annotation/spacy/phrase/" + str(h.hexdigest()) token_inserts = [] for token in doc: token_inserts.append(make_template(""" INSERT DATA { <{{pharse_ref}}> rdf:type eha:dependent_pharse ; anno:annotator eha:spacy ; anno:source_doc <{{source_doc}}> ; anno:start {{phrase_start}} ; anno:end {{phrase_end}} ; anno:selected-text "{{phrase_text | escape}}" ; anno:root <{{token_ref}}> } ; INSERT DATA { <{{token_ref}}> rdf:label "{{root_word | escape}}" ; anno:pos "{{pos}}" {% if entity_type %} ; anno:entity_type "{{entity_type}}" {% endif %} } ; INSERT DATA { <{{parent_phrase_ref}}> dep:{{dep | replace_invalid_uri_chars}} <{{pharse_ref}}> } """).render( source_doc=article_uri, phrase_start=token_to_range[token][0], phrase_end=token_to_range[token][1], phrase_text=doc.text[slice(*token_to_range[token])], root_word=token.text, pos=token.pos_, entity_type=token.ent_type_, token_ref=get_token_uri(token), pharse_ref=get_pharse_uri(token), parent_phrase_ref=get_pharse_uri(token.head), dep=token.dep_)) for chunk in more_itertools.chunked(token_inserts, 200): 
sparql_utils.update(""" prefix anno: <http://www.eha.io/types/annotation_prop/> prefix dep: <http://www.eha.io/types/annotation_prop/dep/> prefix eha: <http://www.eha.io/types/> prefix rdf: <http://www.w3.org/2000/01/rdf-schema#> """ + ";".join(chunk)) sparql_utils.update(make_template(""" prefix anno: <http://www.eha.io/types/annotation_prop/> prefix eha: <http://www.eha.io/types/> INSERT DATA { <{{source_doc}}> anno:annotated_by eha:spacy_0 } """).render(source_doc=article_uri))
# Driver loop for the spaCy annotator: repeatedly selects up to 100 documents
# that are not yet annotated_by eha:spacy_0 and stops at --max_items
# (-1 = unlimited) or when no results remain.
# NOTE(review): chunk is truncated right after the `else:` — the per-binding
# processing body is cut off; kept verbatim.
if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument( "--max_items", default="-1" ) args = parser.parse_args() max_items = int(args.max_items) article_query_template = make_template(""" prefix con: <http://www.eha.io/types/content/> prefix anno: <http://www.eha.io/types/annotation_prop/> prefix eha: <http://www.eha.io/types/> SELECT ?item_uri ?content WHERE { ?item_uri con:text ?content FILTER NOT EXISTS { ?item_uri anno:annotated_by eha:spacy_0 } } ORDER BY asc(?item_uri) LIMIT 100 """) items_processed = 0 while max_items < 0 or items_processed < max_items: print("Items processed: ", str(items_processed)) result = sparql_utils.query(article_query_template.render()) bindings = result.json()['results']['bindings'] if len(bindings) == 0: print("No more results") break else:
# Fragment of the promedmail import script: optionally filters posts by
# promedDate >= min_date, inserts post-level triples (date, raw subject,
# archive number, resolved linked reports, optional feed id), then starts
# inserting per-article triples.
# NOTE(review): chunk begins mid-function and is cut off inside the
# article-level INSERT template; kept verbatim.
if min_date: query["promedDate"] = {"$gte": min_date} print("Number of articles to process:") print(db.posts.find(query).count()) for post in db.posts.find(query): # Create triples for post post_uri = "http://www.promedmail.org/post/" + post['promedId'] update_query = make_template(prefixes + """ INSERT DATA { <{{post_uri}}> pro:date "{{promedDate | sparqlDate}}"^^xsd:dateTime ; pro:subject_raw "{{subject.raw | escape}}" ; pro:archiveNumber "{{archiveNumber}}" {% for linkedReport in resolvedLinkedReports %} ; pro:linkedReport <{{linkedReport}}> {% endfor %} {% if feedId %} ; pro:feed_id "{{feedId}}" {% endif %} } """).render(min_date=min_date, post_uri=post_uri, resolvedLinkedReports=filter( lambda x: x, map(resolve_report, post['linkedReports'])), **post) sparql_utils.update(update_query) for idx, art in enumerate(post["articles"]): if not 'content' in art: continue # Create triples for article within the post article_uri = post_uri + "#" + str(idx) update_query = make_template(prefixes + """ INSERT DATA {
# Driver loop for the annie annotator: fetches batches of 100 documents not
# yet annotated_by eha:annie_1, in random order, until --max_items is reached
# (-1 = unlimited) or no results remain.
# NOTE(review): chunk is truncated after the empty-result `break` — the code
# that actually processes each binding is cut off; kept verbatim.
if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument( "--max_items", default="-1" ) args = parser.parse_args() max_items = int(args.max_items) query_template = make_template(""" prefix con: <http://www.eha.io/types/content/> prefix anno: <http://www.eha.io/types/annotation_prop/> prefix eha: <http://www.eha.io/types/> SELECT ?item_uri ?content WHERE { ?item_uri con:text ?content # FILTER(strstarts(str(?item_uri), "http://t11.tater.io/documents/")) FILTER NOT EXISTS { ?item_uri anno:annotated_by eha:annie_1 } } ORDER BY rand() LIMIT 100 """) items_processed = 0 while max_items < 0 or items_processed < max_items: print("Items processed: ", str(items_processed)) result = sparql_utils.query(query_template.render()) bindings = result.json()['results']['bindings'] if len(bindings) == 0: print("No more results") break
# Fragment of the tater import script: begins mid-argparse call (stray
# closing paren continues an add_argument from a preceding chunk), then
# imports MongoDB documents and coding keywords into the triple store.
# NOTE(review): cut off inside the keyword INSERT template; kept verbatim.
) parser.add_argument( "--db_name", default='t11' ) args = parser.parse_args() db = pymongo.MongoClient(args.mongo_url)[args.db_name] for document in db.documents.find({}): uri = "http://t11.tater.io/documents/" + document['_id'] update_query = make_template(""" prefix xsd: <http://www.w3.org/2001/XMLSchema#> prefix tater: <http://www.eha.io/types/tater/> prefix con: <http://www.eha.io/types/content/> INSERT DATA { <{{uri}}> con:text "{{doc.body | escape}}" {% for key in ['title', 'createdAt'] %} ; tater:{{key}} {{doc[key] | sparqlCast}} {% endfor %} . } """).render( uri=uri, doc=document ) sparql_utils.update(update_query) print("Imported " + uri) for code in db.keywords.find({}): uri = "http://t11.tater.io/codingKeywords/" + code['_id'] update_query = make_template(""" prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> prefix tater: <http://www.eha.io/types/tater/> INSERT DATA {
prefix xsd: <http://www.w3.org/2001/XMLSchema#> """ containment_query_template = make_template(prefixes+""" INSERT { ?p1 anno:contains ?p2 } WHERE { ?p1 anno:start ?p1start ; anno:end ?p1end ; anno:source_doc ?same_source . ?dep_rel rdf:type anno:dependency_relation . ?parent ?dep_rel ?p1 . ?p2 anno:start ?p2start ; anno:end ?p2end ; anno:source_doc ?same_source ; anno:category "diseases" . ?same_source pro:post/pro:date ?source_date . FILTER ( ?p1start <= ?p2start && ?p1end >= ?p2end ) FILTER (?p1 != ?p2) {% if min_date %} FILTER (?source_date >= "{{min_date | sparqlDate}}"^^xsd:dateTime) {% endif %} {% if max_date %} FILTER (?source_date < "{{max_date | sparqlDate}}"^^xsd:dateTime) {% endif %} } """) if __name__ == '__main__': import argparse
# NOTE(review): fragment — the line above begins inside a string literal
# (the tail of a shared `prefixes` constant opened in a preceding chunk), so
# the explanatory comment is placed after it. The containment query adds
# anno:contains links from dependency phrases to "diseases" keyword
# annotations whose offsets they enclose, optionally windowed by
# min_date/max_date; the __main__ guard is cut off. Kept verbatim.
"$gte": min_date } print("Number of articles to process:") print(db.posts.find(query).count()) for post in db.posts.find(query): # Create triples for post post_uri = "http://www.promedmail.org/post/" + post['promedId'] update_query = make_template(prefixes+""" INSERT DATA { <{{post_uri}}> pro:date "{{promedDate | sparqlDate}}"^^xsd:dateTime ; pro:subject_raw "{{subject.raw | escape}}" ; pro:archiveNumber "{{archiveNumber}}" {% for linkedReport in resolvedLinkedReports %} ; pro:linkedReport <{{linkedReport}}> {% endfor %} {% if feedId %} ; pro:feed_id "{{feedId}}" {% endif %} } """).render( min_date=min_date, post_uri=post_uri, resolvedLinkedReports=filter(lambda x:x, map(resolve_report, post['linkedReports'])), **post) sparql_utils.update(update_query) for idx, art in enumerate(post["articles"]): if not 'content' in art: continue # Create triples for article within the post article_uri = post_uri + "#" + str(idx) update_query = make_template(prefixes+""" INSERT DATA {
# Second copy of the annie driver entry point: batches of 100 documents not
# yet annotated_by eha:annie_1, random order, bounded by --max_items.
# NOTE(review): truncated after the empty-result `break`; the per-binding
# processing is cut off. Kept verbatim.
if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument("--max_items", default="-1") args = parser.parse_args() max_items = int(args.max_items) query_template = make_template(""" prefix con: <http://www.eha.io/types/content/> prefix anno: <http://www.eha.io/types/annotation_prop/> prefix eha: <http://www.eha.io/types/> SELECT ?item_uri ?content WHERE { ?item_uri con:text ?content # FILTER(strstarts(str(?item_uri), "http://t11.tater.io/documents/")) FILTER NOT EXISTS { ?item_uri anno:annotated_by eha:annie_1 } } ORDER BY rand() LIMIT 100 """) items_processed = 0 while max_items < 0 or items_processed < max_items: print("Items processed: ", str(items_processed)) result = sparql_utils.query(query_template.render()) bindings = result.json()['results']['bindings'] if len(bindings) == 0: print("No more results") break
prefix xsd: <http://www.w3.org/2001/XMLSchema#> """ containment_query_template = make_template(prefixes + """ INSERT { ?p1 anno:contains ?p2 } WHERE { ?p1 anno:start ?p1start ; anno:end ?p1end ; anno:source_doc ?same_source . ?dep_rel rdf:type anno:dependency_relation . ?parent ?dep_rel ?p1 . ?p2 anno:start ?p2start ; anno:end ?p2end ; anno:source_doc ?same_source ; anno:category "diseases" . ?same_source pro:post/pro:date ?source_date . FILTER ( ?p1start <= ?p2start && ?p1end >= ?p2end ) FILTER (?p1 != ?p2) {% if min_date %} FILTER (?source_date >= "{{min_date | sparqlDate}}"^^xsd:dateTime) {% endif %} {% if max_date %} FILTER (?source_date < "{{max_date | sparqlDate}}"^^xsd:dateTime) {% endif %} } """) if __name__ == '__main__': import argparse
# NOTE(review): second copy of the containment-query fragment — begins inside
# a string literal (tail of a `prefixes` constant from a preceding chunk), so
# this comment follows the code. Links dependency phrases to the "diseases"
# annotations their character ranges enclose, with optional date-window
# filters; the __main__ guard is cut off. Kept verbatim.