def test_saveStandoffFile_noArgNames():
    """Round-trip a corpus through the standoff format when the relation has no argument names.

    The relation is built from entity IDs without ``argNames``; after saving
    and reloading, the test expects the relation to carry the argument names
    ``arg1`` and ``arg2`` and to reference the reloaded entities.
    """
    text = "The colorectal cancer was caused by mutations in APC"
    e1 = kindred.Entity(entityType="disease",text="colorectal cancer",position=[(4, 21)],sourceEntityID="T1")
    e2 = kindred.Entity(entityType="gene",text="APC",position=[(49, 52)],sourceEntityID="T2")
    rel = kindred.Relation(relationType="causes",entityIDs=[e1.entityID,e2.entityID])
    doc = kindred.Document(text,[e1,e2],[rel],relationsUseSourceIDs=False)
    corpus = kindred.Corpus()
    corpus.addDocument(doc)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus,'standoff',tempDir)
        loadedCorpus = kindred.loadDir('standoff',tempDir)

        assert isinstance(loadedCorpus,kindred.Corpus)
        assert len(loadedCorpus.documents) == 1
        loadedDoc = loadedCorpus.documents[0]
        assert isinstance(loadedDoc,kindred.Document)

        entities = loadedDoc.getEntities()
        relations = loadedDoc.getRelations()
        sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

        assertEntity(entities[0],expectedType='disease',expectedText='colorectal cancer',expectedPos=[(4,21)],expectedSourceEntityID="T1")
        assertEntity(entities[1],expectedType='gene',expectedText='APC',expectedPos=[(49,52)],expectedSourceEntityID="T2")
        assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['arg1','arg2'])], "(%s) not as expected" % relations
    finally:
        # mkdtemp leaves cleanup to the caller; remove the directory even
        # when saving, loading or one of the assertions above fails.
        shutil.rmtree(tempDir)
def test_entity_equals():
    """Entities are equal only to themselves, never to another entity or to a relation.

    ``unsourcedA`` and ``unsourcedB`` are built from identical arguments and
    are still expected to compare unequal.
    """
    unsourcedA = kindred.Entity(
        entityType="drug",
        text="Erlotinib",
        position=[(0, 9)],
        sourceEntityID=None)
    sourced = kindred.Entity(
        entityType="drug",
        text="Erlotinib",
        position=[(0, 9)],
        sourceEntityID="T16")
    unsourcedB = kindred.Entity(
        entityType="drug",
        text="Erlotinib",
        position=[(0, 9)],
        sourceEntityID=None)
    relation = kindred.Relation(relationType="causes", entityIDs=[1, 2], argNames=None)

    # Entity versus entity
    assert unsourcedA == unsourcedA
    assert unsourcedA != sourced
    assert unsourcedA != unsourcedB
    assert sourced == sourced
    assert sourced != unsourcedB

    # Entity versus relation
    for entity in (unsourcedA, sourced, unsourcedB):
        assert entity != relation
def test_document_init():
    """str() of a document built from text plus two entities shows the text, both entities and an empty relation list."""
    disease = kindred.Entity('disease', 'Cancer', [(0, 6)], 'T1')
    gene = kindred.Entity('gene', 'ABCDE1', [(33, 39)], 'T2')

    document = kindred.Document("Cancer is caused by mutations in ABCDE1.", [disease, gene])

    rendered = str(document)
    assert rendered == "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] []>"
def test_saveStandoffFile():
    """Save a one-document corpus as standoff, check the written .a2 files and reload it.

    The reloaded document is expected to hold the same two entities and a
    single 'causes' relation with argument names 'obj'/'subj' and source
    relation ID 'R1'.
    """
    sentenceText = "The colorectal cancer was caused by mutations in APC"
    disease = kindred.Entity(
        entityType="disease",
        text="colorectal cancer",
        position=[(4, 21)],
        sourceEntityID="T1")
    gene = kindred.Entity(
        entityType="gene",
        text="APC",
        position=[(49, 52)],
        sourceEntityID="T2")
    causes = kindred.Relation(
        relationType="causes",
        entities=[disease, gene],
        argNames=['obj', 'subj'])

    corpus = kindred.Corpus()
    corpus.addDocument(kindred.Document(sentenceText, [disease, gene], [causes]))

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        # Every relation annotation file that was written gets validated
        for filename in os.listdir(tempDir):
            if not filename.endswith('.a2'):
                continue
            checkRelationAnnotations(os.path.join(tempDir, filename))

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1
    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities = loadedDoc.entities
    relations = loadedDoc.relations
    bySourceID = dict((entity.sourceEntityID, entity) for entity in entities)

    assertEntity(
        entities[0],
        expectedType='disease',
        expectedText='colorectal cancer',
        expectedPos=[(4, 21)],
        expectedSourceEntityID="T1")
    assertEntity(
        entities[1],
        expectedType='gene',
        expectedText='APC',
        expectedPos=[(49, 52)],
        expectedSourceEntityID="T2")

    assert relations == [
        kindred.Relation(
            'causes',
            [bySourceID["T1"], bySourceID["T2"]],
            ['obj', 'subj'],
            sourceRelationID='R1')
    ], "(%s) not as expected" % relations
def test_document_init_withRel():
    """str() of a document built with two entities and one relation shows the text, the entities and the relation."""
    disease = kindred.Entity('disease', 'Cancer', [(0, 6)], 'T1')
    gene = kindred.Entity('gene', 'ABCDE1', [(33, 39)], 'T2')
    causes = kindred.Relation('causes', [disease, gene], ['subj', 'obj'])

    document = kindred.Document(
        "Cancer is caused by mutations in ABCDE1.",
        [disease, gene],
        [causes])

    rendered = str(document)
    assert rendered == "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] [<Relation causes [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] ['subj', 'obj']>]>"
def test_relation_repr():
    """repr() of a Relation shows its type, its entities and its argument names (or None)."""
    mutation = kindred.Entity('mutation','BRAF V600E mutation',[])
    resistance = kindred.Entity('event','vemurafenib resistance',[])

    unnamed = kindred.Relation(relationType="causes",entities=[mutation,resistance],argNames=None)
    named = kindred.Relation(relationType="causes",entities=[mutation,resistance],argNames=["drug","disease"])

    assert repr(unnamed) == "<Relation causes [<Entity mutation:'BRAF V600E mutation' sourceid=None []>, <Entity event:'vemurafenib resistance' sourceid=None []>] None>"
    assert repr(named) == "<Relation causes [<Entity mutation:'BRAF V600E mutation' sourceid=None []>, <Entity event:'vemurafenib resistance' sourceid=None []>] ['drug', 'disease']>"
def test_relation_repr():
    """repr() of a CandidateRelation shows its entities and its known types/argument names."""
    mutation = kindred.Entity('mutation','BRAF V600E mutation',[])
    resistance = kindred.Entity('event','vemurafenib resistance',[])

    bare = kindred.CandidateRelation(entities=[mutation,resistance])
    typed = kindred.CandidateRelation(
        entities=[mutation,resistance],
        knownTypesAndArgNames=[("causes",["drug","disease"])])

    assert repr(bare) == "<CandidateRelation [<Entity mutation:'BRAF V600E mutation' sourceid=None []>, <Entity event:'vemurafenib resistance' sourceid=None []>] []>"
    assert repr(typed) == "<CandidateRelation [<Entity mutation:'BRAF V600E mutation' sourceid=None []>, <Entity event:'vemurafenib resistance' sourceid=None []>] [('causes', ['drug', 'disease'])]>"
def test_relation_hash():
    """Candidate relations hash equal when built alike and differently when their known types differ."""
    mutation = kindred.Entity('mutation','BRAF V600E mutation',[])
    resistance = kindred.Entity('event','vemurafenib resistance',[])

    # Two candidates without known types
    untypedA = kindred.CandidateRelation(entities=[mutation,resistance],knownTypesAndArgNames=[])
    untypedB = kindred.CandidateRelation(entities=[mutation,resistance],knownTypesAndArgNames=[])

    # Two candidates with the same known type and argument names
    typedA = kindred.CandidateRelation(
        entities=[mutation,resistance],
        knownTypesAndArgNames=[("causes",["drug","disease"])])
    typedB = kindred.CandidateRelation(
        entities=[mutation,resistance],
        knownTypesAndArgNames=[("causes",["drug","disease"])])

    assert hash(untypedA) == hash(untypedB)
    assert hash(typedA) == hash(typedB)
    assert hash(untypedA) != hash(typedA)
def test_entity_str():
    """str() of an entity shows type, text, source ID and position, with or without a source ID."""
    withoutSource = kindred.Entity(
        entityType="drug",
        text="Erlotinib",
        position=[(0, 9)],
        sourceEntityID=None)
    withSource = kindred.Entity(
        entityType="drug",
        text="Erlotinib",
        position=[(0, 9)],
        sourceEntityID="T16")

    assert str(withoutSource) == "<Entity drug:'Erlotinib' sourceid=None [(0, 9)]>"
    assert str(withSource) == "<Entity drug:'Erlotinib' sourceid=T16 [(0, 9)]>"
def test_entity_repr():
    """repr() of an entity includes its entityID alongside type, text, source ID and position."""
    withoutSource = kindred.Entity(
        entityType="drug",
        text="Erlotinib",
        position=[(0, 9)],
        sourceEntityID=None)
    withSource = kindred.Entity(
        entityType="drug",
        text="Erlotinib",
        position=[(0, 9)],
        sourceEntityID="T16")

    # The expected strings embed whatever entityID each entity holds
    wantedWithoutSource = "<Entity drug:'Erlotinib' id=%d sourceid=None [(0, 9)]>" % withoutSource.entityID
    wantedWithSource = "<Entity drug:'Erlotinib' id=%d sourceid=T16 [(0, 9)]>" % withSource.entityID

    assert repr(withoutSource) == wantedWithoutSource
    assert repr(withSource) == wantedWithSource
def test_sentence_addEntityWithLocations(capfd):
    """Entity annotations added to a sentence are stored, in order, as (entity, token indices) pairs."""
    text = 'lots of mutations cause dangerous cancer'

    tokens = []
    for word in text.split():
        tokens.append(kindred.Token(word, None, None, 0, 0))

    sentence = kindred.Sentence(
        text,
        tokens,
        dependencies=[(2, 3, 'a'), (3, 5, 'b'), (4, 5, 'c')])

    mutations = kindred.Entity('thingA', 'mutations', [(0, 1)])
    cancer = kindred.Entity('thingB', 'cancer', [(0, 1)])

    sentence.addEntityAnnotation(mutations, [2])
    sentence.addEntityAnnotation(cancer, [5])

    assert sentence.entityAnnotations == [(mutations, [2]), (cancer, [5])]
def test_relation_hash():
    """Relation hashes agree for equal construction and match the expected tuple of fields.

    Without argument names the hash is expected to equal that of
    (relationType, entities, probability); with argument names the names are
    part of the hashed tuple as well.
    """
    mutation = kindred.Entity('mutation','BRAF V600E mutation',[])
    resistance = kindred.Entity('event','vemurafenib resistance',[])

    unnamedA = kindred.Relation(relationType="causes",entities=[mutation,resistance],argNames=None)
    unnamedB = kindred.Relation(relationType="causes",entities=[mutation,resistance],argNames=None)
    namedA = kindred.Relation(relationType="causes",entities=[mutation,resistance],argNames=["drug","disease"])
    namedB = kindred.Relation(relationType="causes",entities=[mutation,resistance],argNames=["drug","disease"])

    assert hash(unnamedA) == hash(unnamedB)
    assert hash(namedA) == hash(namedB)
    assert hash(unnamedA) != hash(namedA)

    unnamedKey = (unnamedA.relationType,tuple(unnamedA.entities),unnamedA.probability)
    assert hash(unnamedA) == hash(unnamedKey)

    namedKey = (namedA.relationType,tuple(namedA.entities),tuple(namedA.argNames),namedA.probability)
    assert hash(namedA) == hash(namedKey)
def test_sentence_str(capfd):
    """repr() of a sentence is just its text, even when entities with locations are attached."""
    text = 'lots of mutations cause dangerous cancer'

    tokens = []
    for word in text.split():
        tokens.append(kindred.Token(word, None, None, 0, 0))

    mutations = kindred.Entity('thingA', 'mutations', [(0, 1)])
    cancer = kindred.Entity('thingB', 'cancer', [(0, 1)])

    sentence = kindred.Sentence(
        text,
        tokens,
        dependencies=[(2, 3, 'a'), (3, 5, 'b'), (4, 5, 'c')],
        entitiesWithLocations=[(mutations, [2]), (cancer, [5])])

    assert repr(sentence) == "lots of mutations cause dangerous cancer"
def annotateStarAlleles(corpus):
    """Add a 'Mutation' entity for every star allele written directly after a 'Gene' entity.

    For each entity of type 'Gene', the document text starting at the end of
    the gene's first span is scanned. A star allele is an asterisk, optional
    whitespace, a digit and optionally further word characters or colons
    (ending in a word character). It may be preceded by any run of the
    separators ',', 'and', 'or', '/', '+' or whitespace. Scanning continues
    after each match, so a list of alleles following one gene is annotated
    in full.

    Each new entity gets a source entity ID from ``getNextSourceEntityID``
    and metadata holding its 'conceptid' ('*' plus the allele name) and the
    'associated_gene' (the gene's 'conceptid').

    :param corpus: Corpus whose documents are annotated in place
    """
    # Raw string so the backslash escapes reach the regex engine verbatim
    # (in a normal string literal they are invalid escape sequences).
    # Compiled once because the same pattern is applied repeatedly.
    starPattern = re.compile(r'^(,|and|or|/|\s|\+)*(?P<main>\*\s*[0-9]([\w:]*\w+)?)')

    for doc in corpus.documents:
        # Snapshot the genes first: entities are added to the document below
        genes = [e for e in doc.entities if e.entityType == 'Gene']
        for gene in genes:
            geneEnd = gene.position[0][1]
            geneID = gene.metadata['conceptid']

            offset = geneEnd
            # The pattern is anchored with '^', so it is applied to the
            # remaining text rather than searched with a start position.
            star = starPattern.search(doc.text[offset:])
            while star:
                _, length = star.span()
                startPos, endPos = star.span('main')
                text = star.group('main')

                sourceEntityID = getNextSourceEntityID(doc)
                # Drop the leading '*' and any whitespace around the name
                alleleName = text.strip()[1:].strip()
                conceptid = '*%s' % alleleName

                starAllele = kindred.Entity(
                    'Mutation',
                    text,
                    [(offset + startPos, offset + endPos)],
                    sourceEntityID=sourceEntityID,
                    metadata={
                        'conceptid': conceptid,
                        'associated_gene': geneID
                    })
                doc.addEntity(starAllele)

                # Continue right after this match to pick up further alleles
                offset += length
                star = starPattern.search(doc.text[offset:])
def test_relation_equals():
    """Relations are equal when type, entity IDs and argument names all match, and never equal to an entity."""
    unnamedA = kindred.Relation(relationType="causes", entityIDs=[1, 2], argNames=None)
    unnamedB = kindred.Relation(relationType="causes", entityIDs=[1, 2], argNames=None)
    namedA = kindred.Relation(
        relationType="causes",
        entityIDs=[1, 2],
        argNames=["drug", "disease"])
    namedB = kindred.Relation(
        relationType="causes",
        entityIDs=[1, 2],
        argNames=["drug", "disease"])
    drug = kindred.Entity(
        entityType="drug",
        text="Erlotinib",
        position=[(0, 9)],
        sourceEntityID=None)

    # Relation versus relation
    assert unnamedA == unnamedB
    assert unnamedA != namedA
    assert unnamedA != namedB
    assert unnamedB != namedA
    assert namedA == namedB

    # Relation versus entity
    for relation in (unnamedA, unnamedB, namedA, namedB):
        assert relation != drug
def test_entity_str_withExternalID():
    """str() of an entity with an external ID shows the entityID and the external ID."""
    withoutSource = kindred.Entity(
        entityType="drug",
        text="Erlotinib",
        position=[(0, 9)],
        sourceEntityID=None,
        externalID="id:1234")
    withSource = kindred.Entity(
        entityType="drug",
        text="Erlotinib",
        position=[(0, 9)],
        sourceEntityID="T16",
        externalID="id:9876")

    # The expected strings embed whatever entityID each entity holds
    wantedWithoutSource = "<Entity drug:'Erlotinib' id=%d sourceid=None externalid=id:1234 [(0, 9)]>" % withoutSource.entityID
    wantedWithSource = "<Entity drug:'Erlotinib' id=%d sourceid=T16 externalid=id:9876 [(0, 9)]>" % withSource.entityID

    assert str(withoutSource) == wantedWithoutSource
    assert str(withSource) == wantedWithSource
def convertBiocDocToKindredDocs(document):
    """Convert a BioC document into a list of kindred documents, one per passage.

    Each BioC annotation becomes a kindred.Entity whose positions are made
    relative to the start of the passage, and each BioC relation becomes a
    kindred.Relation over those entities. Relation arguments are sorted by
    (role, refid) before being split into argument names and entities.

    :param document: BioC document to convert
    :type document: bioc.BioCDocument
    :return: One kindred.Document per passage, with the document infons, the
             passage infons and the document ID (under 'id') as metadata
    :rtype: list of kindred.Document
    :raises AssertionError: If an object has an unexpected type or a relation
                            references an entity ID absent from its passage
    """
    assert isinstance(document,bioc.BioCDocument)
    kindredDocs = []
    for passage in document.passages:
        assert isinstance(passage,bioc.BioCPassage)
        text = passage.text
        offset = int(native(passage.offset))

        entities = []
        relations = []

        for a in passage.annotations:
            assert isinstance(a,bioc.BioCAnnotation)

            entityType = a.infons['type']
            sourceEntityID = a.id

            position = []
            segments = []
            for location in a.locations:
                assert isinstance(location,bioc.BioCLocation)
                # BioC offsets are shifted by the passage offset
                startPos = int(native(location.offset)) - offset
                endPos = startPos + int(native(location.length))
                position.append((startPos,endPos))
                segments.append(text[startPos:endPos])

            entityText = " ".join(segments)
            e = kindred.Entity(entityType,entityText,position,sourceEntityID)
            entities.append(e)

        sourceEntityIDToEntity = { entity.sourceEntityID:entity for entity in entities }

        for biocRelation in passage.relations:
            assert isinstance(biocRelation,bioc.BioCRelation)
            relationType = biocRelation.infons['type']

            arguments = []
            for n in biocRelation.nodes:
                assert isinstance(n,bioc.BioCNode)
                arguments.append((n.role,n.refid))
            arguments = sorted(arguments)

            argNames = [ argName for argName,sourceEntityID in arguments]
            sourceEntityIDs = [ sourceEntityID for argName,sourceEntityID in arguments]

            for sourceEntityID in sourceEntityIDs:
                assert sourceEntityID in sourceEntityIDToEntity, "Relation references entity %s which does not exist in BioC document id=%s" % (sourceEntityID,str(document.id))

            # Must not be called `entities`: that name holds every entity of
            # the passage and is handed to the Document below.
            relationEntities = [ sourceEntityIDToEntity[sourceEntityID] for sourceEntityID in sourceEntityIDs ]

            relation = kindred.Relation(relationType,relationEntities,argNames)
            relations.append(relation)

        # Passage infons take precedence over document infons
        metadata = dict(document.infons)
        metadata.update(passage.infons)
        metadata['id'] = document.id

        relData = kindred.Document(text,entities=entities,relations=relations,metadata=metadata)
        kindredDocs.append(relData)

    return kindredDocs
def annotate(self, corpus):
    """
    Add an entity to a parsed corpus for every term match reported by ``self._processWords``

    The words of each sentence are handed to ``self._processWords``. For each
    match, one entity is created per (entity type, external ID) pair, added to
    the document and registered on the sentence with the matched token indices.

    :param corpus: Corpus to annotate
    :type corpus: kindred.Corpus
    """
    assert corpus.parsed == True, "Corpus must already be parsed before entity recognition"

    for doc in corpus.documents:
        for sentence in doc.sentences:
            sentenceWords = []
            for token in sentence.tokens:
                sentenceWords.append(token.word)

            matches = self._processWords(sentenceWords)
            for span, _terms, typesAndIDs in matches:
                firstToken = span[0]
                afterLastToken = span[1]  # exclusive token index

                # Character span taken from the first and last matched tokens
                charStart = sentence.tokens[firstToken].startPos
                charEnd = sentence.tokens[afterLastToken - 1].endPos
                matchedText = doc.text[charStart:charEnd]
                tokenIndices = list(range(firstToken, afterLastToken))

                for entityType, externalID in typesAndIDs:
                    entity = kindred.Entity(
                        entityType,
                        matchedText,
                        [(charStart, charEnd)],
                        externalID=externalID)
                    doc.addEntity(entity)
                    sentence.addEntityWithLocation(entity, tokenIndices)
def annotate(self, corpus):
    """
    Add an entity to a parsed corpus for every term match reported by ``self._processWords``

    Each sentence is handed to ``self._processWords``. For each match, one
    entity is created per (entity type, external ID) pair, appended to the
    document's entity list and registered on the sentence with the matched
    token indices. Source entity IDs are "T<n>", numbered on from the number
    of entities the document already holds.

    :param corpus: Corpus to annotate
    :type corpus: kindred.Corpus
    """
    assert corpus.parsed == True, "Corpus must already be parsed before entity recognition"

    for doc in corpus.documents:
        knownEntities = len(doc.entities)

        for sentence in doc.sentences:
            matches = self._processWords(sentence)
            for span, _terms, typesAndIDs in matches:
                firstToken = span[0]
                afterLastToken = span[1]  # exclusive token index

                # Character span taken from the first and last matched tokens
                charStart = sentence.tokens[firstToken].startPos
                charEnd = sentence.tokens[afterLastToken - 1].endPos
                matchedText = doc.text[charStart:charEnd]
                tokenIndices = list(range(firstToken, afterLastToken))

                for entityType, externalID in typesAndIDs:
                    entity = kindred.Entity(
                        entityType,
                        matchedText,
                        [(charStart, charEnd)],
                        externalID=externalID,
                        sourceEntityID="T%d" % (knownEntities + 1))

                    # Appended straight to the list instead of doc.addEntity
                    doc.entities.append(entity)
                    sentence.addEntityAnnotation(entity, tokenIndices)
                    knownEntities += 1
def annotate(self, corpus):
    """
    Annotate a corpus for numerical values

    Every token whose word is accepted by ``isNumber`` becomes a 'quantity'
    entity covering that token. The entity is added to the document and
    registered on the sentence at the token's index. Source entity IDs are
    "T<n>", numbered on from the number of entities the document already holds.

    :param corpus: Corpus to annotate
    :type corpus: kindred.Corpus
    """
    assert corpus.parsed == True, "Corpus must already be parsed before entity recognition"

    for doc in corpus.documents:
        entityCount = len(doc.entities)
        for sentence in doc.sentences:
            for i, t in enumerate(sentence.tokens):
                if not isNumber(t.word):
                    continue

                sourceEntityID = "T%d" % (entityCount + 1)
                text = doc.text[t.startPos:t.endPos]
                loc = [i]

                e = kindred.Entity('quantity',
                                   text, [(t.startPos, t.endPos)],
                                   sourceEntityID=sourceEntityID)
                doc.addEntity(e)
                sentence.addEntityAnnotation(e, loc)
                entityCount += 1
def parseSimpleTag_helper(node,currentPosition=0,ignoreEntities=None):
    """Recursively turn a SimpleTag DOM node into plain text, entities and relation tuples.

    Text nodes contribute their value to the text. A ``relation`` element
    yields a tuple (relationType, sourceEntityIDs, argNames) built from its
    attributes: ``type`` is the relation type, and every other attribute is
    an (argument name, entity ID) pair, sorted by argument name. Any other
    element is an entity: the tag name is the entity type, the ``id``
    attribute is the source entity ID, and the enclosed text is the entity text.

    :param node: DOM node whose children are parsed
    :param currentPosition: Character offset at which this node's text starts
    :param ignoreEntities: Entity types for which no entity is created (their
                           text is still kept); None means ignore nothing
    :return: Tuple of (text, entities, relationTuples)
    :raises AssertionError: If an entity element encloses no text
    """
    if ignoreEntities is None:
        ignoreEntities = ()

    text,entities,relationTuples = '',[],[]
    for s in node.childNodes:
        if s.nodeType == s.ELEMENT_NODE:
            # Pass ignoreEntities down so nested entities are filtered too
            insideText,insideEntities,insideRelationTuples = parseSimpleTag_helper(s,currentPosition+len(text),ignoreEntities)

            if s.tagName == 'relation':
                relationType = s.getAttribute('type')
                arguments = [ (argName,entityID) for argName,entityID in s.attributes.items() if argName != 'type' ]
                arguments = sorted(arguments)

                sourceEntityIDs = [ sourceEntityID for argName,sourceEntityID in arguments]
                argNames = [ argName for argName,sourceEntityID in arguments]

                relationTuple = (relationType,sourceEntityIDs,argNames)
                relationTuples.append(relationTuple)
            else: # Entity
                entityType = s.tagName
                sourceEntityID = s.getAttribute('id')
                position = [(currentPosition+len(text),currentPosition+len(text)+len(insideText))]

                assert len(insideText) > 0, "Name (text inside tags) is empty for entity of type %s" % entityType

                if entityType not in ignoreEntities:
                    e = kindred.Entity(entityType,insideText,position,sourceEntityID=sourceEntityID)
                    entities.append(e)

            # The enclosed text is kept for every element, ignored or not
            text += insideText
            entities += insideEntities
            relationTuples += insideRelationTuples
        elif s.nodeType == s.TEXT_NODE:
            text += s.nodeValue

    return text,entities,relationTuples
def loadEntity(filename,line,text):
    """Parse one entity line of a standoff annotation file into a kindred.Entity.

    The line must have three tab-separated fields: the entity ID (starting
    with 'T'), the type followed by one or more ';'-separated "start end"
    character spans, and the entity text. The spans are checked against the
    document text: the covered chunks, joined by spaces and with whitespace
    runs collapsed, must equal the whitespace-normalised entity text.

    :param filename: Name of the annotation file (used in error messages only)
    :param line: The annotation line to parse
    :param text: Full text of the document that the spans index into
    :return: Entity with the parsed type, normalised text, spans and ID
    :rtype: kindred.Entity
    :raises AssertionError: If the line is malformed or its text does not
                            match the text found at the given spans
    """
    # startswith instead of line[0], so an empty line fails the assertion
    # rather than raising IndexError
    assert line.startswith('T'), "ERROR in %s. Entity input should start with a T" % filename
    split = line.strip().split('\t')
    assert len(split) == 3, "ERROR in %s" % filename
    entityID, typeInfo, tokens = split

    # typeInfo looks like "<type> <start> <end>[;<start> <end>...]"
    typeSpacePos = typeInfo.index(' ')
    typeName = typeInfo[:typeSpacePos]
    positionText = typeInfo[typeSpacePos:]

    textChunks = []
    positions = []
    for coordinates in positionText.strip().split(';'):
        a,b = coordinates.strip().split(' ')
        a,b = int(a.strip()),int(b.strip())
        textChunk = text[a:b].replace('\n',' ').strip()
        textChunks.append(textChunk)
        positions.append((a,b))

    # Check that the tokens match up to the text
    chunkTest = re.sub(r'\s\s+', ' ', " ".join(textChunks)).strip()
    tokensTest = re.sub(r'\s\s+', ' ', tokens).strip()

    # Built with %-formatting on text only: mixing in tokens.encode(...)
    # (bytes) would raise TypeError on Python 3 instead of this assertion.
    assert chunkTest == tokensTest, u"ERROR in %s. For id=%s, tokens '%s' don't match up with positions: %s" % (filename, entityID, tokens, str(positions))

    entity = kindred.Entity(typeName, tokensTest, positions, entityID)

    return entity
def test_document_addEntity():
    """An entity added to an initially empty document shows up in the document's str()."""
    document = kindred.Document("Cancer is caused by mutations in ABCDE1.", [])

    disease = kindred.Entity('disease', 'Cancer', [(0, 6)], 'T1')
    document.addEntity(disease)

    rendered = str(document)
    assert rendered == "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>] []>"
def test_relation_triple_equals():
    """Three-entity candidate relations are equal when entities and known types match, and never equal to an entity."""
    mutation = kindred.Entity('mutation','BRAF V600E mutation',[])
    resistance = kindred.Entity('event','vemurafenib resistance',[])
    citation = kindred.Entity('citation','28028924',[])

    bareA = kindred.CandidateRelation(entities=[mutation,resistance,citation])
    bareB = kindred.CandidateRelation(entities=[mutation,resistance,citation])
    typedA = kindred.CandidateRelation(
        entities=[mutation,resistance,citation],
        knownTypesAndArgNames=[("causes",["whathappened","whatdiditcause","citation"])])
    typedB = kindred.CandidateRelation(
        entities=[mutation,resistance,citation],
        knownTypesAndArgNames=[("causes",["whathappened","whatdiditcause","citation"])])

    # Candidate versus candidate
    assert bareA == bareB
    assert bareA != typedA
    assert bareA != typedB
    assert bareB != typedA
    assert typedA == typedB

    # Candidate versus entity
    for candidate in (bareA, bareB, typedA, typedB):
        assert candidate != mutation
def test_relation_equals():
    """Two-entity candidate relations are equal when entities and known types match, and never equal to an entity."""
    mutation = kindred.Entity('mutation','BRAF V600E mutation',[])
    resistance = kindred.Entity('event','vemurafenib resistance',[])

    bareA = kindred.CandidateRelation(entities=[mutation,resistance])
    bareB = kindred.CandidateRelation(entities=[mutation,resistance])
    typedA = kindred.CandidateRelation(
        entities=[mutation,resistance],
        knownTypesAndArgNames=[("causes",["drug","disease"])])
    typedB = kindred.CandidateRelation(
        entities=[mutation,resistance],
        knownTypesAndArgNames=[("causes",["drug","disease"])])

    # A separate entity that is not part of any of the relations
    drug = kindred.Entity(entityType="drug",text="Erlotinib",position=[(0,9)],sourceEntityID=None)

    # Candidate versus candidate
    assert bareA == bareB
    assert bareA != typedA
    assert bareA != typedB
    assert bareB != typedA
    assert typedA == typedB

    # Candidate versus entity
    for candidate in (bareA, bareB, typedA, typedB):
        assert candidate != drug
def parseJSON(data, ignoreEntities=[]):
    """Build a kindred.Document from a JSON dictionary with 'text', 'denotations' and 'relations'.

    Each denotation becomes an entity (type from 'obj', span from
    'span.begin'/'span.end', optional 'id' as source entity ID) unless its
    type is listed in ignoreEntities. Each relation becomes a kindred.Relation
    whose entity IDs are the relation's 'obj' and 'subj' values, with argument
    names 'obj' and 'subj'.

    :param data: Dictionary loaded from JSON
    :param ignoreEntities: Entity types to leave out
    :return: Document holding the text, entities and relations
    :rtype: kindred.Document
    :raises AssertionError: If data contains a key outside the expected set
    """
    text = data['text']

    entities = []
    if 'denotations' in data:
        for denotation in data['denotations']:
            sourceEntityID = denotation.get('id')
            entityType = denotation['obj']
            span = denotation['span']
            begin, end = span['begin'], span['end']
            entityText = text[begin:end]

            if entityType in ignoreEntities:
                continue
            entities.append(
                kindred.Entity(entityType,
                               entityText, [(begin, end)],
                               sourceEntityID=sourceEntityID))

    relations = []
    if 'relations' in data:
        for relationData in data['relations']:
            obj = relationData['obj']
            relationType = relationData['pred']
            subj = relationData['subj']
            relations.append(
                kindred.Relation(relationType=relationType,
                                 entityIDs=[obj, subj],
                                 argNames=['obj', 'subj']))

    # Reject any top-level key that is not part of the known format
    expected = [
        'denotations', 'divid', 'modifications', 'namespaces', 'project',
        'relations', 'sourcedb', 'sourceid', 'target', 'text', 'tracks'
    ]
    extraFields = [key for key in data if key not in expected]
    assert not extraFields, "Found additional unexpected fields (%s) in JSON" % (
        ",".join(extraFields))

    return kindred.Document(text, entities=entities, relations=relations)
def parseJSON(data,ignoreEntities=[]):
    """Build a kindred.Document from a JSON dictionary (or a one-element list holding one).

    Each denotation becomes an entity (type from 'obj', span from
    'span.begin'/'span.end', optional 'id' as source entity ID) unless its
    type is listed in ignoreEntities. Each relation becomes a kindred.Relation
    over the entities whose source IDs are the relation's 'obj' and 'subj'
    values, with argument names 'obj' and 'subj'.

    :param data: Dictionary loaded from JSON, or a list containing exactly one
    :param ignoreEntities: Entity types to leave out
    :return: Document holding the text, entities and relations
    :rtype: kindred.Document
    :raises AssertionError: If data has the wrong shape or contains a key
                            outside the expected set
    """
    wrongShape = "JSON loading expects a dictionary or a list with one dictionary in it"
    if isinstance(data,list):
        assert len(data) == 1 and isinstance(data[0],dict), wrongShape
        data = data[0]
    assert isinstance(data,dict), wrongShape

    text = data['text']

    entities = []
    if 'denotations' in data:
        for denotation in data['denotations']:
            sourceEntityID = denotation.get('id')
            entityType = denotation['obj']
            span = denotation['span']
            begin,end = span['begin'],span['end']
            entityText = text[begin:end]

            if entityType in ignoreEntities:
                continue
            entities.append(kindred.Entity(entityType,entityText,[(begin,end)],sourceEntityID=sourceEntityID))

    # Relations refer to entities through their source IDs
    bySourceID = {}
    for entity in entities:
        bySourceID[entity.sourceEntityID] = entity

    relations = []
    if 'relations' in data:
        for relationData in data['relations']:
            obj = relationData['obj']
            relationType = relationData['pred']
            subj = relationData['subj']

            relationEntities = [ bySourceID[sourceID] for sourceID in [obj,subj] ]
            relations.append(kindred.Relation(relationType,relationEntities,['obj','subj']))

    # Reject any top-level key that is not part of the known format
    expected = ['denotations','divid','modifications','namespaces','project','relations','sourcedb','sourceid','target','text','tracks']
    extraFields = [ key for key in data if key not in expected ]
    assert not extraFields, "Found additional unexpected fields (%s) in JSON" % (",".join(extraFields))

    return kindred.Document(text,entities=entities,relations=relations)
def splitIntoSentences(self):
    """
    Create a new corpus with one document for each sentence in this document.

    Entities annotated on a sentence are copied with the first span of their
    position shifted to be relative to the sentence start. Relations whose
    entities all belong to the sentence are copied over the new entities.
    Each new document gets a single sentence holding shifted copies of the
    tokens and the same entity annotations mapped onto the new entities.

    :return: Corpus with one document per sentence
    :rtype: kindred.Corpus
    """
    sentenceCorpus = kindred.Corpus()

    for sentence in self.sentences:
        # All positions are shifted by the start of the first token
        shift = sentence.tokens[0].startPos

        originalEntities = []
        for entity, _tokenIndices in sentence.entityAnnotations:
            originalEntities.append(entity)

        # Original entity -> shifted copy, in annotation order
        oldToNew = OrderedDict()
        for entity in originalEntities:
            start, end = entity.position[0]
            oldToNew[entity] = kindred.Entity(
                entity.entityType,
                entity.text,
                [(start - shift, end - shift)],
                entity.sourceEntityID,
                entity.externalID)

        # Keep only relations that lie entirely within this sentence
        containedRelations = []
        for relation in self.relations:
            if all(e in originalEntities for e in relation.entities):
                containedRelations.append(relation)

        copiedRelations = []
        for relation in containedRelations:
            copiedRelations.append(
                kindred.Relation(
                    relation.relationType,
                    [oldToNew[e] for e in relation.entities],
                    relation.argNames,
                    relation.probability))

        doc = kindred.Document(
            sentence.text,
            list(oldToNew.values()),
            copiedRelations)

        shiftedTokens = []
        for t in sentence.tokens:
            shiftedTokens.append(
                kindred.Token(t.word, t.lemma, t.partofspeech,
                              t.startPos - shift, t.endPos - shift))

        newSentence = kindred.Sentence(
            sentence.text,
            shiftedTokens,
            sentence.dependencies,
            sentence.sourceFilename)
        newSentence.entityAnnotations = [
            (oldToNew[entity], tokenIndices)
            for entity, tokenIndices in sentence.entityAnnotations
        ]

        doc.sentences = [newSentence]
        sentenceCorpus.addDocument(doc)

    return sentenceCorpus
def convertBiocDocToKindredDocs(document):
    """Convert a BioC document into a list of kindred documents, one per passage.

    Each BioC annotation becomes a kindred.Entity whose positions are made
    relative to the start of the passage and whose metadata is the
    annotation's infons without the 'type' key. Each BioC relation becomes a
    kindred.Relation over those entities. Relation arguments are sorted by
    (role, refid) before being split into argument names and entities.

    :param document: BioC document to convert (not modified)
    :type document: bioc.BioCDocument
    :return: One kindred.Document per passage, with the document infons, the
             passage infons and the document ID (under 'id') as metadata
    :rtype: list of kindred.Document
    :raises AssertionError: If an object has an unexpected type, an annotation
                            lies outside the passage text or disagrees with
                            its recorded text, or a relation references an
                            entity ID absent from its passage
    """
    assert isinstance(document, bioc.BioCDocument)
    kindredDocs = []
    for passage in document.passages:
        assert isinstance(passage, bioc.BioCPassage)
        text = passage.text
        offset = int(native(passage.offset))

        entities = []
        relations = []

        for a in passage.annotations:
            assert isinstance(a, bioc.BioCAnnotation)

            entityType = a.infons['type']
            sourceEntityID = a.id

            # Work on a copy so the annotation's own infons keep their 'type'
            entityMetadata = dict(a.infons)
            del entityMetadata['type']

            position = []
            segments = []
            for location in a.locations:
                assert isinstance(location, bioc.BioCLocation)
                # BioC offsets are shifted by the passage offset
                startPos = int(native(location.offset)) - offset
                endPos = startPos + int(native(location.length))
                assert startPos >= 0 and startPos <= len(
                    text
                ) and endPos >= 0 and endPos <= len(
                    text
                ), "Entity offsets (offset=%s,length=%s) are outside the span of the text (%s)" % (
                    str(location.offset), str(location.length), passage.text)
                position.append((startPos, endPos))
                segments.append(text[startPos:endPos])

            entityText = " ".join(segments)
            assert entityText == a.text, "Mismatch in entity annotation between expected text (%s) and extracted text (%s) using offset info for passage with text: %s" % (
                a.text, entityText, text)

            e = kindred.Entity(entityType,
                               entityText,
                               position,
                               sourceEntityID,
                               metadata=entityMetadata)
            entities.append(e)

        sourceEntityIDToEntity = {
            entity.sourceEntityID: entity
            for entity in entities
        }

        for biocRelation in passage.relations:
            assert isinstance(biocRelation, bioc.BioCRelation)
            relationType = biocRelation.infons['type']

            arguments = []
            for n in biocRelation.nodes:
                assert isinstance(n, bioc.BioCNode)
                arguments.append((n.role, n.refid))
            arguments = sorted(arguments)

            argNames = [argName for argName, sourceEntityID in arguments]
            sourceEntityIDs = [
                sourceEntityID for argName, sourceEntityID in arguments
            ]

            for sourceEntityID in sourceEntityIDs:
                assert sourceEntityID in sourceEntityIDToEntity, "Relation references entity %s which does not exist in BioC document id=%s" % (
                    sourceEntityID, str(document.id))

            # Must not be called `entities`: that name holds every entity of
            # the passage and is handed to the Document below.
            relationEntities = [
                sourceEntityIDToEntity[sourceEntityID]
                for sourceEntityID in sourceEntityIDs
            ]

            relation = kindred.Relation(relationType, relationEntities, argNames)
            relations.append(relation)

        # Passage infons take precedence over document infons
        metadata = dict(document.infons)
        metadata.update(passage.infons)
        metadata['id'] = document.id

        relData = kindred.Document(text,
                                   entities=entities,
                                   relations=relations,
                                   metadata=metadata)
        kindredDocs.append(relData)

    return kindredDocs
entityTypes = set([ entity.entityType for entity, tokenIndices in sentence.entityAnnotations ]) entityInfo = [(e.entityType, e.text) for e, tokenIndices in sentence.entityAnnotations ] hasMutation = "Mutation" in entityTypes hasChemical = "Chemical" in entityTypes if hasMutation and hasChemical: sentenceStart = sentence.tokens[0].startPos sentenceEntities = [ kindred.Entity(e.entityType, e.text, [(e.position[0][0] - sentenceStart, e.position[0][1] - sentenceStart)], e.sourceEntityID, e.externalID, metadata=e.metadata) for e, _ in sentence.entityAnnotations ] newDoc = kindred.Document(sentence.text, sentenceEntities, metadata=doc.metadata) sentenceCorpus.addDocument(newDoc) kindred.save(sentenceCorpus, 'biocxml', args.outBioc)