def test_document_str():
    """str() and repr() of a SimpleTag-loaded document, with and without a relation."""
    # Each case pairs the SimpleTag input with the exact string form expected.
    cases = [
        (
            '<disease id="T1">Cancer</disease> is caused by mutations in <gene id="T2">ABCDE1</gene>.',
            "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] []>",
        ),
        (
            '<disease id="T1">Cancer</disease> is caused by mutations in <gene id="T2">ABCDE1</gene>.<relation type="causes" subj="T2" obj="T1" />',
            "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] [<Relation causes [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] ['obj', 'subj']>]>",
        ),
    ]

    for taggedText, expectedForm in cases:
        document = kindred.Document(taggedText, loadFromSimpleTag=True)
        assert str(document) == expectedForm
        assert document.__repr__() == expectedForm
def test_convertedTaggedTextWithRelations():
    """SimpleTag text with a relation tag yields two entities, plain text and one relation."""
    tagged = '<drug id="5">Erlotinib</drug> is a common treatment for <cancer id="6">NSCLC</cancer><relation type="treats" subj="5" obj="6" />'
    document = kindred.Document(tagged, loadFromSimpleTag=True)
    assert isinstance(document, kindred.Document)

    found = document.getEntities()
    assert isinstance(found, list)
    for entity in found:
        assert isinstance(entity, kindred.Entity)

    # Expected entity properties, in document order.
    expectations = [
        dict(expectedType='drug', expectedText='Erlotinib', expectedPos=[(0, 9)], expectedSourceEntityID='5'),
        dict(expectedType='cancer', expectedText='NSCLC', expectedPos=[(36, 41)], expectedSourceEntityID='6'),
    ]
    for index, expectation in enumerate(expectations):
        assertEntity(found[index], **expectation)

    plainText = document.getText()
    assert plainText == u"Erlotinib is a common treatment for NSCLC"

    idLookup = document.getSourceEntityIDsToEntityIDs()
    assert document.getRelations() == [
        kindred.Relation('treats', [idLookup['6'], idLookup['5']], ['obj', 'subj'])
    ]
def test_document_entityIDs():
    """getEntityIDs() lists the entityID of every entity, in entity order."""
    document = kindred.Document(
        '<disease id="T1">Cancer</disease> is caused by mutations in <gene id="T2">ABCDE1</gene>.'
    )

    wantedIDs = []
    for entity in document.entities:
        wantedIDs.append(entity.entityID)

    assert document.getEntityIDs() == wantedIDs
def test_saveStandoffFile_SeparateSentences():
    """
    Save a two-document corpus in standoff format and check it reloads intact.

    Each document is checked for its two entities and its single 'causes'
    relation. The temporary directory is removed even when an assertion
    fails, so a failing run does not leave files behind.
    """
    texts = ['The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />','<disease id="T1">Li-Fraumeni</disease> was caused by mutations in <gene id="T2">P53</gene><relation type="causes" subj="T2" obj="T1" />']

    corpus = kindred.Corpus()
    for t in texts:
        corpus.addDocument(kindred.Document(t))

    # Expected (disease entity, gene entity) properties for each reloaded document.
    expectedPerDocument = [
        (
            dict(expectedType='disease', expectedText='colorectal cancer', expectedPos=[(4, 21)], expectedSourceEntityID="T1"),
            dict(expectedType='gene', expectedText='APC', expectedPos=[(49, 52)], expectedSourceEntityID="T2"),
        ),
        (
            dict(expectedType='disease', expectedText='Li-Fraumeni', expectedPos=[(0, 11)], expectedSourceEntityID="T1"),
            dict(expectedType='gene', expectedText='P53', expectedPos=[(39, 42)], expectedSourceEntityID="T2"),
        ),
    ]

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus, 'standoff', tempDir)
        loadedCorpus = kindred.loadDir('standoff', tempDir)

        assert isinstance(loadedCorpus, kindred.Corpus)
        assert len(loadedCorpus.documents) == 2

        for docIndex, (expectedDisease, expectedGene) in enumerate(expectedPerDocument):
            data = loadedCorpus.documents[docIndex]
            assert isinstance(data, kindred.Document)

            entities = data.getEntities()
            relations = data.getRelations()
            sourceEntityIDsToEntityIDs = data.getSourceEntityIDsToEntityIDs()

            assertEntity(entities[0], **expectedDisease)
            assertEntity(entities[1], **expectedGene)
            assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['obj','subj'])], "(%s) not as expected" % relations
    finally:
        # mkdtemp leaves cleanup to the caller; do it on every exit path.
        shutil.rmtree(tempDir)
def test_document_entityIDToEntity():
    """getEntityIDsToEntities() maps each entityID to its entity object."""
    document = kindred.Document(
        '<disease id="T1">Cancer</disease> is caused by mutations in <gene id="T2">ABCDE1</gene>.',
        loadFromSimpleTag=True)

    wantedMapping = {}
    for entity in document.entities:
        wantedMapping[entity.entityID] = entity

    assert document.getEntityIDsToEntities() == wantedMapping
def test_convertTaggedTextWithSplitEntities():
    """Two tags sharing an id become one entity with two position spans."""
    tagged = '<drug id="1">Erlotinib</drug> is a common treatment for <cancer id="2">lung</cancer> and unknown <cancer id="2">cancers</cancer>'
    document = kindred.Document(tagged, loadFromSimpleTag=True)
    assert isinstance(document, kindred.Document)

    found = document.getEntities()
    assert isinstance(found, list)
    for entity in found:
        assert isinstance(entity, kindred.Entity)

    assertEntity(found[0],
                 expectedType='drug',
                 expectedText='Erlotinib',
                 expectedPos=[(0, 9)],
                 expectedSourceEntityID='1')
    # The two "cancer" tags with id="2" are reported as a single split entity.
    assertEntity(found[1],
                 expectedType='cancer',
                 expectedText='lung cancers',
                 expectedPos=[(36, 40), (53, 60)],
                 expectedSourceEntityID='2')

    plainText = document.getText()
    assert plainText == "Erlotinib is a common treatment for lung and unknown cancers"
def test_convertTaggedText():
    """SimpleTag text without ids yields entities with source ids 1 and 2."""
    tagged = "<drug>Erlotinib</drug> is a common treatment for <cancer>NSCLC</cancer>"
    document = kindred.Document(tagged, loadFromSimpleTag=True)
    assert isinstance(document, kindred.Document)

    found = document.getEntities()
    assert isinstance(found, list)
    for entity in found:
        assert isinstance(entity, kindred.Entity)

    # Expected entity properties, in document order; source ids are integers here.
    expectations = [
        dict(expectedType='drug', expectedText='Erlotinib', expectedPos=[(0, 9)], expectedSourceEntityID=1),
        dict(expectedType='cancer', expectedText='NSCLC', expectedPos=[(36, 41)], expectedSourceEntityID=2),
    ]
    for index, expectation in enumerate(expectations):
        assertEntity(found[index], **expectation)

    plainText = document.getText()
    assert plainText == "Erlotinib is a common treatment for NSCLC"
def convertBiocDocToKindredDocs(document):
    """
    Convert a BioC document into a list of kindred Documents, one per passage.

    Annotation offsets are shifted by the passage offset so that entity
    positions index into the passage text. Relation nodes are sorted by
    (role, refid) before being turned into a kindred.Relation.

    :param document: The BioC document to convert
    :type document: bioc.BioCDocument
    :return: One kindred.Document for each passage of the BioC document
    :rtype: list of kindred.Document
    """
    assert isinstance(document,bioc.BioCDocument)

    kindredDocs = []
    for passage in document.passages:
        assert isinstance(passage,bioc.BioCPassage)

        text = passage.text
        offset = int(native(passage.offset))

        entities = []
        relations = []

        for a in passage.annotations:
            assert isinstance(a,bioc.BioCAnnotation)

            entityType = a.infons['type']
            sourceEntityID = a.id

            position = []
            segments = []
            for location in a.locations:
                assert isinstance(location,bioc.BioCLocation)
                # Make the location relative to the start of this passage
                startPos = int(native(location.offset)) - offset
                endPos = startPos + int(native(location.length))
                position.append((startPos,endPos))
                segments.append(text[startPos:endPos])

            entityText = " ".join(segments)
            entities.append(kindred.Entity(entityType,entityText,position,sourceEntityID))

        sourceEntityIDToEntity = { entity.sourceEntityID:entity for entity in entities }

        for r in passage.relations:
            assert isinstance(r,bioc.BioCRelation)

            relationType = r.infons['type']

            arguments = []
            for n in r.nodes:
                assert isinstance(n,bioc.BioCNode)
                arguments.append((n.role,n.refid))
            arguments = sorted(arguments)

            argNames = [ argName for argName,_ in arguments ]
            sourceEntityIDs = [ refID for _,refID in arguments ]

            for sourceEntityID in sourceEntityIDs:
                assert sourceEntityID in sourceEntityIDToEntity, "Relation references entity %s which does not exist in BioC document id=%s" % (sourceEntityID,str(document.id))

            # Kept in its own variable: reusing "entities" here would replace the
            # passage's full entity list with just this relation's arguments.
            entitiesInRelation = [ sourceEntityIDToEntity[sourceEntityID] for sourceEntityID in sourceEntityIDs ]
            relations.append(kindred.Relation(relationType,entitiesInRelation,argNames))

        # Passage infons override document infons; the document id is always recorded
        metadata = dict(document.infons)
        metadata.update(passage.infons)
        metadata['id'] = document.id

        relData = kindred.Document(text,entities=entities,relations=relations,metadata=metadata)
        kindredDocs.append(relData)

    return kindredDocs
def test_saveStandoffFile_fromSimpleTag():
    """
    Save a one-document corpus built from SimpleTag text and reload it.

    Checks the reloaded entities and the single 'causes' relation. The
    temporary directory is removed even when an assertion fails.
    """
    text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'

    corpus = kindred.Corpus()
    corpus.addDocument(kindred.Document(text))

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus,'standoff',tempDir)
        loadedCorpus = kindred.loadDir('standoff',tempDir)

        assert isinstance(loadedCorpus,kindred.Corpus)
        assert len(loadedCorpus.documents) == 1

        loadedDoc = loadedCorpus.documents[0]
        assert isinstance(loadedDoc,kindred.Document)

        entities = loadedDoc.getEntities()
        relations = loadedDoc.getRelations()
        sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

        assertEntity(entities[0],expectedType='disease',expectedText='colorectal cancer',expectedPos=[(4,21)],expectedSourceEntityID="T1")
        assertEntity(entities[1],expectedType='gene',expectedText='APC',expectedPos=[(49,52)],expectedSourceEntityID="T2")
        assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['obj','subj'])], "(%s) not as expected" % relations
    finally:
        # mkdtemp leaves cleanup to the caller; do it on every exit path.
        shutil.rmtree(tempDir)
def test_corpus_nfold_split():
    """
    Check that nfold_split yields correctly sized, disjoint train/test folds.

    Within a fold no document may appear twice across the train and test
    corpora. Across all folds each document is expected in a test corpus
    exactly once and in a train corpus (folds - 1) times.
    """
    mainCorpus = kindred.Corpus()
    docCount = 100
    for i in range(docCount):
        mainCorpus.addDocument(kindred.Document(text=str(i), entities=[]))

    folds = 5
    trainCounter, testCounter = Counter(), Counter()

    for trainCorpus, testCorpus in mainCorpus.nfold_split(folds):
        assert len(trainCorpus.documents) == (folds - 1) * docCount / folds
        assert len(testCorpus.documents) == docCount / folds

        # Inspect the corpora produced for THIS fold and remember what has been
        # seen, so the uniqueness assertion can actually fire.
        seen = set()
        for foldCorpus, counter in ((trainCorpus, trainCounter), (testCorpus, testCounter)):
            for doc in foldCorpus.documents:
                assert doc in mainCorpus.documents, "This document doesn't match an existing one"
                assert not doc in seen, "This document isn't unique now"
                seen.add(doc)
                counter[doc] += 1

        # Train and test together must cover the whole corpus in every fold
        assert len(seen) == docCount

    assert len(trainCounter) == docCount
    assert len(testCounter) == docCount
    for doc, count in trainCounter.items():
        assert count == folds - 1
    for doc, count in testCounter.items():
        assert count == 1
def parseSimpleTag(text,ignoreEntities=[]):
    """
    Parse SimpleTag-formatted text into a kindred.Document.

    The text is wrapped in a <doc> element and parsed as XML. Either every
    entity carries an ID or none does; when none does, entities are numbered
    from 1 and relations are not allowed.

    :param text: Text in SimpleTag format
    :param ignoreEntities: Entity types passed to the helper to be ignored
    :return: Document with the merged entities and their relations
    """
    wrapped = u"<doc>%s</doc>" % text
    rootNode = minidom.parseString(wrapped.encode('utf8')).childNodes[0]

    text,rawEntities,relationTuples = parseSimpleTag_helper(rootNode,ignoreEntities=ignoreEntities)

    lacksID = [ entity.sourceEntityID == '' for entity in rawEntities ]
    noneHaveIDs = all(lacksID)
    allHaveIDs = not any(lacksID)

    assert noneHaveIDs or allHaveIDs, 'All entities or none (not some) should be given IDs'
    assert allHaveIDs or len(relationTuples) == 0, "Cannot include relations with no-ID entities"

    if noneHaveIDs:
        # No IDs were supplied, so number the entities 1, 2, 3, ...
        for number,entity in enumerate(rawEntities, 1):
            entity.sourceEntityID = number

    entities = mergeEntitiesWithMatchingIDs(rawEntities)

    entityBySourceID = {}
    for entity in entities:
        entityBySourceID[entity.sourceEntityID] = entity

    relations = []
    for relationType,sourceEntityIDs,argNames in relationTuples:
        assert len(sourceEntityIDs) == len(argNames)
        members = [ entityBySourceID[sourceID] for sourceID in sourceEntityIDs ]
        relations.append(kindred.Relation(relationType=relationType,entities=members,argNames=argNames))

    return kindred.Document(text,entities=entities,relations=relations)
def test_document_entitySourceIDToEntityID():
    """getSourceEntityIDsToEntityIDs() maps each source ID to its entityID."""
    document = kindred.Document(
        '<disease id="T1">Cancer</disease> is caused by mutations in <gene id="T2">ABCDE1</gene>.'
    )

    wantedMapping = {}
    for entity in document.entities:
        wantedMapping[entity.sourceEntityID] = entity.entityID

    assert document.getSourceEntityIDsToEntityIDs() == wantedMapping
def test_saveStandoffFile_noArgNames():
    """
    Save a relation created without argument names and reload it.

    The reloaded relation is expected to carry the argument names
    'arg1' and 'arg2'. The temporary directory is removed even when an
    assertion fails.
    """
    text = "The colorectal cancer was caused by mutations in APC"
    e1 = kindred.Entity(entityType="disease",text="colorectal cancer",position=[(4, 21)],sourceEntityID="T1")
    e2 = kindred.Entity(entityType="gene",text="APC",position=[(49, 52)],sourceEntityID="T2")
    rel = kindred.Relation(relationType="causes",entityIDs=[e1.entityID,e2.entityID])
    doc = kindred.Document(text,[e1,e2],[rel],relationsUseSourceIDs=False)

    corpus = kindred.Corpus()
    corpus.addDocument(doc)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus,'standoff',tempDir)
        loadedCorpus = kindred.loadDir('standoff',tempDir)

        assert isinstance(loadedCorpus,kindred.Corpus)
        assert len(loadedCorpus.documents) == 1

        loadedDoc = loadedCorpus.documents[0]
        assert isinstance(loadedDoc,kindred.Document)

        entities = loadedDoc.getEntities()
        relations = loadedDoc.getRelations()
        sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

        assertEntity(entities[0],expectedType='disease',expectedText='colorectal cancer',expectedPos=[(4,21)],expectedSourceEntityID="T1")
        assertEntity(entities[1],expectedType='gene',expectedText='APC',expectedPos=[(49,52)],expectedSourceEntityID="T2")
        assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['arg1','arg2'])], "(%s) not as expected" % relations
    finally:
        # mkdtemp leaves cleanup to the caller; do it on every exit path.
        shutil.rmtree(tempDir)
def test_document_entityTypeMap():
    """getEntityIDsToEntityTypes() maps each entityID to its entity type."""
    document = kindred.Document(
        '<disease id="T1">Cancer</disease> is caused by mutations in <gene id="T2">ABCDE1</gene>.'
    )
    idLookup = document.getSourceEntityIDsToEntityIDs()

    wantedTypes = {}
    wantedTypes[idLookup["T1"]] = 'disease'
    wantedTypes[idLookup["T2"]] = 'gene'

    assert document.getEntityIDsToEntityTypes() == wantedTypes
def test_document_init():
    """A document built from text and two entities has the expected string form."""
    disease = kindred.Entity('disease', 'Cancer', [(0, 6)], 'T1')
    gene = kindred.Entity('gene', 'ABCDE1', [(33, 39)], 'T2')

    document = kindred.Document("Cancer is caused by mutations in ABCDE1.", [disease, gene])

    wantedForm = "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] []>"
    assert str(document) == wantedForm
def test_saveStandoffFile():
    """Save a corpus as standoff, validate the .a2 files, then reload and compare."""
    disease = kindred.Entity(entityType="disease",
                             text="colorectal cancer",
                             position=[(4, 21)],
                             sourceEntityID="T1")
    gene = kindred.Entity(entityType="gene",
                          text="APC",
                          position=[(49, 52)],
                          sourceEntityID="T2")
    causes = kindred.Relation(relationType="causes",
                              entities=[disease, gene],
                              argNames=['obj', 'subj'])

    corpus = kindred.Corpus()
    corpus.addDocument(
        kindred.Document("The colorectal cancer was caused by mutations in APC",
                         [disease, gene], [causes]))

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        # Every relation annotation file written must pass the format check
        a2Names = [name for name in os.listdir(tempDir) if name.endswith('.a2')]
        for name in a2Names:
            checkRelationAnnotations(os.path.join(tempDir, name))

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities = loadedDoc.entities
    relations = loadedDoc.relations

    bySourceID = {}
    for entity in entities:
        bySourceID[entity.sourceEntityID] = entity

    assertEntity(entities[0],
                 expectedType='disease',
                 expectedText='colorectal cancer',
                 expectedPos=[(4, 21)],
                 expectedSourceEntityID="T1")
    assertEntity(entities[1],
                 expectedType='gene',
                 expectedText='APC',
                 expectedPos=[(49, 52)],
                 expectedSourceEntityID="T2")

    assert relations == [
        kindred.Relation('causes', [bySourceID["T1"], bySourceID["T2"]],
                         ['obj', 'subj'],
                         sourceRelationID='R1')
    ], "(%s) not as expected" % relations
def test_document_addEntity():
    """An entity added with addEntity() shows up in the document's string form."""
    document = kindred.Document("Cancer is caused by mutations in ABCDE1.", [])

    disease = kindred.Entity('disease', 'Cancer', [(0, 6)], 'T1')
    document.addEntity(disease)

    wantedForm = "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>] []>"
    assert str(document) == wantedForm
def test_document_init_withRel():
    """A document built with entities and a relation has the expected string form."""
    disease = kindred.Entity('disease', 'Cancer', [(0, 6)], 'T1')
    gene = kindred.Entity('gene', 'ABCDE1', [(33, 39)], 'T2')
    causes = kindred.Relation('causes', [disease, gene], ['subj', 'obj'])

    document = kindred.Document("Cancer is caused by mutations in ABCDE1.",
                                [disease, gene], [causes])

    wantedForm = "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] [<Relation causes [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] ['subj', 'obj']>]>"
    assert str(document) == wantedForm
def test_document_str():
    """str() and repr() of a document include entity ids, with and without a relation."""
    # Document with two entities and no relation
    plainDoc = kindred.Document(
        '<disease id="T1">Cancer</disease> is caused by mutations in <gene id="T2">ABCDE1</gene>.'
    )
    plainIDs = plainDoc.getSourceEntityIDsToEntityIDs()
    plainTemplate = "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' id=%d sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' id=%d sourceid=T2 [(33, 39)]>] []>"
    plainWanted = plainTemplate % (plainIDs["T1"], plainIDs["T2"])
    assert str(plainDoc) == plainWanted
    assert plainDoc.__repr__() == plainWanted

    # Same document plus a 'causes' relation between the two entities
    relDoc = kindred.Document(
        '<disease id="T1">Cancer</disease> is caused by mutations in <gene id="T2">ABCDE1</gene>.<relation type="causes" subj="T2" obj="T1" />'
    )
    relIDs = relDoc.getSourceEntityIDsToEntityIDs()
    relTemplate = "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' id=%d sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' id=%d sourceid=T2 [(33, 39)]>] [<Relation causes [%d, %d] ['obj', 'subj']>]>"
    relWanted = relTemplate % (relIDs["T1"], relIDs["T2"], relIDs["T1"], relIDs["T2"])
    assert str(relDoc) == relWanted
    assert relDoc.__repr__() == relWanted
def parseJSON(data, ignoreEntities=[]):
    """
    Build a kindred.Document from a dictionary with 'text', 'denotations' and 'relations'.

    Denotations become entities (unless their type is in ignoreEntities) and
    relations become kindred.Relation objects that refer to their arguments by
    the 'obj' and 'subj' values. Unknown top-level fields trigger an assertion.

    :param data: Dictionary with at least a 'text' field
    :param ignoreEntities: Entity types to leave out
    :return: Document holding the text, entities and relations
    """
    text = data['text']

    entities = []
    denotations = data['denotations'] if 'denotations' in data else []
    for denotation in denotations:
        sourceEntityID = denotation['id'] if 'id' in denotation else None
        entityType = denotation['obj']
        span = denotation['span']
        startPos, endPos = span['begin'], span['end']
        entityText = text[startPos:endPos]

        if entityType in ignoreEntities:
            continue
        entities.append(
            kindred.Entity(entityType,
                           entityText, [(startPos, endPos)],
                           sourceEntityID=sourceEntityID))

    relations = []
    relationRecords = data['relations'] if 'relations' in data else []
    for record in relationRecords:
        obj = record['obj']
        relationType = record['pred']
        subj = record['subj']
        relations.append(
            kindred.Relation(relationType=relationType,
                             entityIDs=[obj, subj],
                             argNames=['obj', 'subj']))

    # Any top-level field outside this list is rejected
    knownFields = (
        'denotations', 'divid', 'modifications', 'namespaces', 'project',
        'relations', 'sourcedb', 'sourceid', 'target', 'text', 'tracks'
    )
    extraFields = []
    for key in data.keys():
        if key not in knownFields:
            extraFields.append(key)
    assert len(extraFields) == 0, "Found additional unexpected fields (%s) in JSON" % (
        ",".join(extraFields))

    return kindred.Document(text, entities=entities, relations=relations)
def loadDataFromSTFormat(txtFile, a1File, a2File, verbose=False, ignoreEntities=[], ignoreComplexRelations=True):
    """
    Load one document from a txt/a1/a2 file triple.

    Entities come from the a1 file (only 'T' lines are accepted) and relations
    from 'E' and 'R' lines of the a2 file, when that file exists.

    :param txtFile: Path of the text file
    :param a1File: Path of the entity annotation file
    :param a2File: Path of the relation annotation file (may be absent)
    :param verbose: Whether to report skipped lines and a missing a2 file on stderr
    :param ignoreEntities: Entity types to leave out
    :param ignoreComplexRelations: Must be True
    :return: Document with the text, entities and relations
    """
    assert ignoreComplexRelations == True, "ignoreComplexRelations must be True as kindred doesn't currently support complex relations"

    with codecs.open(txtFile, "r", "utf-8") as txtHandle:
        text = txtHandle.read()

    entities = []
    with codecs.open(a1File, "r", "utf-8") as a1Handle:
        for rawLine in a1Handle:
            stripped = rawLine.strip()
            if stripped == '':
                continue

            assert rawLine[0] == 'T', "Only triggers are expected in a1 file: " + a1File
            entity = loadEntity(stripped, text)
            if entity is not None and entity.entityType not in ignoreEntities:
                entities.append(entity)

    relations = []
    if not os.path.exists(a2File):
        if verbose:
            sys.stderr.write("Note: No A2 file found : %s\n" % os.path.basename(a2File))
    else:
        with codecs.open(a2File, "r", "utf-8") as a2Handle:
            for rawLine in a2Handle:
                stripped = rawLine.strip()
                if stripped == '':
                    continue

                if rawLine[0] in ('E', 'R'):
                    relation = loadRelation(stripped, ignoreComplexRelations)
                    if relation is not None:
                        relations.append(relation)
                elif verbose:
                    sys.stderr.write("Unable to process line: %s\n" % stripped)

    # Source filename is the text file's base name without its last four characters
    baseFilename = os.path.basename(txtFile)[0:-4]

    return kindred.Document(text,
                            entities=entities,
                            relations=relations,
                            sourceFilename=baseFilename)
def __init__(self, text=None):
    """
    Constructor

    :param text: Optional SimpleTag text to initialize a single document
    :type text: String (with SimpleTag format XML)
    """
    self.documents = []

    if text is not None:
        self.addDocument(kindred.Document(text))

    self.parsed = False
    self.relationTypes = None
def parseJSON(data,ignoreEntities=[]):
    """
    Build a kindred.Document from a dictionary (or a one-dictionary list) with 'text', 'denotations' and 'relations'.

    Denotations become entities (unless their type is in ignoreEntities).
    Relations are resolved to the entity objects whose source IDs match their
    'obj' and 'subj' values. Unknown top-level fields trigger an assertion.

    :param data: Dictionary, or list containing exactly one dictionary
    :param ignoreEntities: Entity types to leave out
    :return: Document holding the text, entities and relations
    """
    if isinstance(data,list):
        assert len(data) == 1 and isinstance(data[0],dict), "JSON loading expects a dictionary or a list with one dictionary in it"
        data = data[0]
    assert isinstance(data,dict), "JSON loading expects a dictionary or a list with one dictionary in it"

    text = data['text']

    entities = []
    denotations = data['denotations'] if 'denotations' in data else []
    for denotation in denotations:
        sourceEntityID = denotation['id'] if 'id' in denotation else None
        entityType = denotation['obj']
        span = denotation['span']
        startPos,endPos = span['begin'],span['end']
        entityText = text[startPos:endPos]

        if entityType in ignoreEntities:
            continue
        entities.append(kindred.Entity(entityType,entityText,[(startPos,endPos)],sourceEntityID=sourceEntityID))

    entityBySourceID = {}
    for entity in entities:
        entityBySourceID[entity.sourceEntityID] = entity

    relations = []
    relationRecords = data['relations'] if 'relations' in data else []
    for record in relationRecords:
        obj = record['obj']
        relationType = record['pred']
        subj = record['subj']

        members = [ entityBySourceID[sourceID] for sourceID in (obj,subj) ]
        relations.append(kindred.Relation(relationType,members,['obj','subj']))

    # Any top-level field outside this list is rejected
    knownFields = ('denotations','divid','modifications','namespaces','project','relations','sourcedb','sourceid','target','text','tracks')
    extraFields = []
    for key in data.keys():
        if key not in knownFields:
            extraFields.append(key)
    assert len(extraFields) == 0, "Found additional unexpected fields (%s) in JSON" % (",".join(extraFields))

    return kindred.Document(text,entities=entities,relations=relations)
def __init__(self, text=None, loadFromSimpleTag=False):
    """
    Create an empty corpus with no documents, or quickly load one with a single document using optional SimpleTag

    :param text: Optional SimpleTag text to initialize a single document
    :param loadFromSimpleTag: If text is provided, whether the text parameter is in the SimpleTag format and will extract entities and relations accordingly
    :type text: String (with SimpleTag format XML)
    :type loadFromSimpleTag: bool
    """
    self.documents = []

    if text is not None:
        firstDocument = kindred.Document(text, loadFromSimpleTag=loadFromSimpleTag)
        self.addDocument(firstDocument)

    self.parsed = False
def loadDataFromStandoff(txtFile,ignoreEntities=[],ignoreComplexRelations=True):
    """
    Load one document from a .txt file and its sibling .ann/.a1/.a2 annotation files.

    All 'T' lines are read first to build the entities; 'E' and 'R' lines are
    then read and resolved against those entities by source ID.

    :param txtFile: Path of the text file (must end in '.txt')
    :param ignoreEntities: Entity types to leave out
    :param ignoreComplexRelations: Must be True
    :return: Document with the text, entities and relations
    """
    assert ignoreComplexRelations == True, "ignoreComplexRelations must be True as kindred doesn't currently support complex relations"

    with codecs.open(txtFile, "r", "utf-8") as txtHandle:
        text = txtHandle.read()

    assert txtFile.endswith('.txt')
    stem = txtFile[:-4]

    # Only annotation files that actually exist next to the text file are read
    annotationFiles = []
    for extension in ('ann','a1','a2'):
        candidate = "%s.%s" % (stem,extension)
        if os.path.isfile(candidate):
            annotationFiles.append(candidate)

    entities = []
    for annotationFile in annotationFiles:
        with codecs.open(annotationFile, "r", "utf-8") as annHandle:
            for rawLine in annHandle:
                if not rawLine.startswith('T'):
                    continue
                entity = loadEntity(annotationFile,rawLine.strip(), text)
                if entity is None or entity.entityType in ignoreEntities:
                    continue
                entities.append(entity)

    entityBySourceID = {}
    for entity in entities:
        entityBySourceID[entity.sourceEntityID] = entity

    relations = []
    for annotationFile in annotationFiles:
        with codecs.open(annotationFile, "r", "utf-8") as annHandle:
            for rawLine in annHandle:
                if not (rawLine.startswith('E') or rawLine.startswith('R')):
                    continue
                relationTuple = loadRelation(annotationFile,rawLine.strip(),ignoreComplexRelations)
                if relationTuple is None:
                    continue

                relationType,sourceEntityIDs,argNames = relationTuple
                for sourceID in sourceEntityIDs:
                    assert sourceID in entityBySourceID, "Relation exists that references a non-existent entity (%s) associated with %s" % (sourceID,txtFile)

                members = [ entityBySourceID[sourceID] for sourceID in sourceEntityIDs ]
                relations.append(kindred.Relation(relationType,members,argNames))

    # Source filename is the text file's base name without its last four characters
    baseFilename = os.path.basename(txtFile)[0:-4]

    return kindred.Document(text,entities=entities,relations=relations,sourceFilename=baseFilename)
def __init__(self, text=None, loadFromSimpleTag=False):
    """
    Constructor

    :param text: Optional SimpleTag text to initialize a single document
    :param loadFromSimpleTag: If text is provided, whether the text parameter is in the SimpleTag format and will extract entities and relations accordingly
    :type text: String (with SimpleTag format XML)
    :type loadFromSimpleTag: bool
    """
    self.documents = []

    if text is not None:
        firstDocument = kindred.Document(text, loadFromSimpleTag=loadFromSimpleTag)
        self.addDocument(firstDocument)

    self.parsed = False
    self.candidateRelationsEntityCounts = set()
    self.relationTypes = None
def test_corpus_split():
    """split(0.75) on 100 documents gives disjoint parts of 75 and 25 covering the corpus."""
    mainCorpus = kindred.Corpus()
    for number in range(100):
        mainCorpus.addDocument(kindred.Document(text=str(number), entities=[]))

    corpusA, corpusB = mainCorpus.split(0.75)

    assert len(corpusA.documents) == 75
    assert len(corpusB.documents) == 25

    # Walk the first part, then the second, tracking every document encountered
    seen = set()
    for part in (corpusA, corpusB):
        for doc in part.documents:
            assert doc in mainCorpus.documents, "This document doesn't match an existing one"
            assert not doc in seen, "This document isn't unique now"
            seen.add(doc)

    assert len(seen) == len(mainCorpus.documents)
def parseSimpleTag(text, ignoreEntities=[]):
    """
    Parse SimpleTag-formatted text into a kindred.Document.

    The text is wrapped in a <doc> element and parsed as XML. Either every
    entity carries an ID or none does; when none does, entities are numbered
    from 1 and relations are not allowed.

    :param text: Text in SimpleTag format
    :param ignoreEntities: Entity types passed to the helper to be ignored
    :return: Document with the merged entities and the parsed relations
    """
    wrapped = u"<doc>%s</doc>" % text
    rootNode = minidom.parseString(wrapped.encode('utf8')).childNodes[0]

    text, rawEntities, relations = parseSimpleTag_helper(rootNode, ignoreEntities=ignoreEntities)

    lacksID = [entity.sourceEntityID == '' for entity in rawEntities]
    noneHaveIDs = all(lacksID)
    allHaveIDs = not any(lacksID)

    assert noneHaveIDs or allHaveIDs, 'All entities or none (not some) should be given IDs'
    assert allHaveIDs or len(relations) == 0, "Cannot include relations with no-ID entities"

    if noneHaveIDs:
        # No IDs were supplied, so number the entities 1, 2, 3, ...
        for number, entity in enumerate(rawEntities, 1):
            entity.sourceEntityID = number

    mergedEntities = mergeEntitiesWithMatchingIDs(rawEntities)

    return kindred.Document(text, entities=mergedEntities, relations=relations)
def convertBiocDocToKindredDocs(document):
    """
    Convert a BioC document into a list of kindred Documents, one per passage.

    Annotation offsets are shifted by the passage offset so that entity
    positions index into the passage text. Each annotation's infons (minus
    'type') become the entity metadata. Relation nodes are sorted by
    (role, refid) before being turned into a kindred.Relation.

    The input BioC objects are not modified.

    :param document: The BioC document to convert
    :type document: bioc.BioCDocument
    :return: One kindred.Document for each passage of the BioC document
    :rtype: list of kindred.Document
    """
    assert isinstance(document, bioc.BioCDocument)

    kindredDocs = []
    for passage in document.passages:
        assert isinstance(passage, bioc.BioCPassage)

        text = passage.text
        offset = int(native(passage.offset))

        entities = []
        relations = []

        for a in passage.annotations:
            assert isinstance(a, bioc.BioCAnnotation)

            entityType = a.infons['type']
            sourceEntityID = a.id

            # Copy the infons without 'type' rather than deleting from a.infons:
            # deleting would alter the caller's annotation and make a second
            # conversion of the same document fail on the missing key.
            entityMetadata = {
                key: value
                for key, value in a.infons.items() if key != 'type'
            }

            position = []
            segments = []
            for location in a.locations:
                assert isinstance(location, bioc.BioCLocation)
                # Make the location relative to the start of this passage
                startPos = int(native(location.offset)) - offset
                endPos = startPos + int(native(location.length))
                assert startPos >= 0 and startPos <= len(
                    text
                ) and endPos >= 0 and endPos <= len(
                    text
                ), "Entity offsets (offset=%s,length=%s) are outside the span of the text (%s)" % (
                    str(location.offset), str(location.length), passage.text)
                position.append((startPos, endPos))
                segments.append(text[startPos:endPos])

            entityText = " ".join(segments)
            assert entityText == a.text, "Mismatch in entity annotation between expected text (%s) and extracted text (%s) using offset info for passage with text: %s" % (
                a.text, entityText, text)

            entities.append(
                kindred.Entity(entityType,
                               entityText,
                               position,
                               sourceEntityID,
                               metadata=entityMetadata))

        sourceEntityIDToEntity = {
            entity.sourceEntityID: entity
            for entity in entities
        }

        for r in passage.relations:
            assert isinstance(r, bioc.BioCRelation)

            relationType = r.infons['type']

            arguments = []
            for n in r.nodes:
                assert isinstance(n, bioc.BioCNode)
                arguments.append((n.role, n.refid))
            arguments = sorted(arguments)

            argNames = [argName for argName, _ in arguments]
            sourceEntityIDs = [refID for _, refID in arguments]

            for sourceEntityID in sourceEntityIDs:
                assert sourceEntityID in sourceEntityIDToEntity, "Relation references entity %s which does not exist in BioC document id=%s" % (
                    sourceEntityID, str(document.id))

            # Kept in its own variable: reusing "entities" here would replace the
            # passage's full entity list with just this relation's arguments.
            entitiesInRelation = [
                sourceEntityIDToEntity[sourceEntityID]
                for sourceEntityID in sourceEntityIDs
            ]
            relations.append(
                kindred.Relation(relationType, entitiesInRelation, argNames))

        # Passage infons override document infons; the document id is always recorded
        metadata = dict(document.infons)
        metadata.update(passage.infons)
        metadata['id'] = document.id

        relData = kindred.Document(text,
                                   entities=entities,
                                   relations=relations,
                                   metadata=metadata)
        kindredDocs.append(relData)

    return kindredDocs
for doc in documents: title_plus_abstract = doc['title'] + "\n" + doc['abstract'] if title_plus_abstract in existing_mapping: already_parsed.append(existing_mapping[title_plus_abstract]) else: needs_parsing.append(title_plus_abstract) needs_parsing = sorted(set(needs_parsing)) if use_previous_parses: print("Found %d documents with existing parses" % len(already_parsed)) print("Found %d documents to parse" % len(needs_parsing)) sys.stdout.flush() corpus = kindred.Corpus() for title_plus_abstract in needs_parsing: kindred_doc = kindred.Document(title_plus_abstract) corpus.addDocument(kindred_doc) print("Parsing...") sys.stdout.flush() parser = kindred.Parser(model='en_core_sci_sm') parser.parse(corpus) corpus.documents += already_parsed print("Saving %d parses..." % len(corpus.documents)) sys.stdout.flush() with open(args.outPickle,'wb') as outF: pickle.dump(corpus,outF)