def loadDataFromStandoff(txtFile, ignoreEntities=None, ignoreComplexRelations=True):
    """
    Load a document from a standoff-format text file plus any accompanying
    annotation files (.ann, .a1, .a2 sharing the same basename).

    :param txtFile: Path of the .txt file containing the document text
    :param ignoreEntities: Entity types to skip while loading (default: keep all)
    :param ignoreComplexRelations: Must be True; kindred does not currently
        support complex (relation-referencing) relations
    :return: kindred.Document containing the text, entities and relations
    """
    # Avoid the shared-mutable-default-argument pitfall
    ignoreEntities = [] if ignoreEntities is None else ignoreEntities

    annotationExtensions = ['ann', 'a1', 'a2']

    assert ignoreComplexRelations == True, "ignoreComplexRelations must be True as kindred doesn't currently support complex relations"

    with codecs.open(txtFile, "r", "utf-8") as f:
        text = f.read()

    assert txtFile.endswith('.txt')
    base = txtFile[:-4]

    # Only keep annotation files that actually exist alongside the text file
    annotationFiles = ["%s.%s" % (base, ext) for ext in annotationExtensions]
    annotationFiles = [filename for filename in annotationFiles if os.path.isfile(filename)]

    # First pass: entity annotations (lines starting with 'T')
    entities = []
    for annotationFile in annotationFiles:
        with codecs.open(annotationFile, "r", "utf-8") as f:
            for line in f:
                if line.startswith('T'):
                    entity = loadEntity(annotationFile, line.strip(), text)
                    if entity is not None and entity.entityType not in ignoreEntities:
                        entities.append(entity)

    sourceEntityIDToEntity = {entity.sourceEntityID: entity for entity in entities}

    # Second pass: relations ('R') and events ('E'), which reference entities by source ID
    relations = []
    for annotationFile in annotationFiles:
        with codecs.open(annotationFile, "r", "utf-8") as f:
            for line in f:
                if line.startswith('E') or line.startswith('R'):
                    relationTuple = loadRelation(annotationFile, line.strip(), ignoreComplexRelations)
                    if relationTuple is not None:
                        sourceRelationID, relationType, sourceEntityIDs, argNames = relationTuple
                        for sourceEntityID in sourceEntityIDs:
                            assert sourceEntityID in sourceEntityIDToEntity, "Relation exists that references a non-existent entity (%s) associated with %s" % (sourceEntityID, txtFile)
                        entitiesInRelation = [sourceEntityIDToEntity[sourceEntityID] for sourceEntityID in sourceEntityIDs]
                        relation = kindred.Relation(relationType, entitiesInRelation, argNames, sourceRelationID=sourceRelationID)
                        relations.append(relation)

    baseTxtFile = os.path.basename(txtFile)
    baseFilename = baseTxtFile[0:-4]

    combinedData = kindred.Document(text, entities=entities, relations=relations, sourceFilename=baseFilename)
    return combinedData
def convertBiocDocToKindredDocs(document):
    """
    Convert a BioC document into kindred Documents, one per passage.

    Each BioC annotation becomes a kindred.Entity (multi-location annotations
    keep all spans, with their text segments space-joined), and each BioC
    relation becomes a kindred.Relation. Passage and document infons are
    merged into the resulting Document's metadata.

    :param document: BioC document to convert
    :return: list of kindred.Document, one for each passage
    """
    assert isinstance(document, bioc.BioCDocument)
    kindredDocs = []
    for passage in document.passages:
        assert isinstance(passage, bioc.BioCPassage)

        text = passage.text
        # NOTE(review): offset is unused — entity positions below use the raw
        # BioC location offsets without subtracting the passage offset; confirm
        # positions are intended to be document-relative here
        offset = int(native(passage.offset))

        entities = []
        relations = []

        for a in passage.annotations:
            assert isinstance(a, bioc.BioCAnnotation)

            entityType = a.infons['type']
            sourceEntityID = a.id

            # Copy the infons so we don't mutate the BioC annotation in place
            metadata = dict(a.infons)
            del metadata['type']

            position = []
            segments = []

            for l in a.locations:
                assert isinstance(l, bioc.BioCLocation)
                startPos = int(native(l.offset))
                endPos = startPos + int(native(l.length))
                position.append((startPos, endPos))
                segments.append(text[startPos:endPos])

            entityText = " ".join(segments)

            e = kindred.Entity(entityType, entityText, position, sourceEntityID, metadata=metadata)
            entities.append(e)

        sourceEntityIDToEntity = {entity.sourceEntityID: entity for entity in entities}

        for r in passage.relations:
            assert isinstance(r, bioc.BioCRelation)
            relationType = r.infons['type']

            arguments = []
            for n in r.nodes:
                assert isinstance(n, bioc.BioCNode)
                arguments.append((n.role, n.refid))
            arguments = sorted(arguments)

            argNames = [argName for argName, sourceEntityID in arguments]
            sourceEntityIDs = [sourceEntityID for argName, sourceEntityID in arguments]

            for sourceEntityID in sourceEntityIDs:
                assert sourceEntityID in sourceEntityIDToEntity, "Relation references entity %s which does not exist in BioC document id=%s" % (sourceEntityID, str(document.id))

            # Use a distinct name here: the original reassigned 'entities',
            # clobbering the passage-level entity list that the Document
            # constructor below needs
            entitiesInRelation = [sourceEntityIDToEntity[sourceEntityID] for sourceEntityID in sourceEntityIDs]

            relation = kindred.Relation(relationType, entitiesInRelation, argNames)
            relations.append(relation)

        metadata = dict(document.infons)
        metadata.update(passage.infons)
        metadata['id'] = document.id
        relData = kindred.Document(text, entities=entities, relations=relations, metadata=metadata)
        kindredDocs.append(relData)

    return kindredDocs
def predict(self, corpus):
    """
    Use the relation classifier to predict new relations for a corpus. The new relations will be added to the Corpus.

    :param corpus: Corpus to make predictions on
    :type corpus: kindred.Corpus
    """
    assert self.isTrained, "Classifier must be trained using train() before predictions can be made"
    assert isinstance(corpus, kindred.Corpus)

    # Make sure the corpus has been parsed before building candidates
    if not corpus.parsed:
        parser = kindred.Parser(model=self.model)
        parser.parse(corpus)

    candidateRelations = self.candidateBuilder.build(corpus)

    # Nothing to classify in this corpus
    if len(candidateRelations) == 0:
        return

    vectors = self.vectorizer.transform(candidateRelations)
    labelMatrix = self.clf.predict(vectors)
    probMatrix = self.clf.predict_proba(vectors) if self.clf.has_predict_proba() else None

    predictedRelations = []
    # Each nonzero (row, col) is a candidate predicted to have the relation type for that column
    for rowIndex, colIndex in zip(*labelMatrix.nonzero()):
        candidate = candidateRelations[rowIndex]
        predictedProb = probMatrix[rowIndex, colIndex] if probMatrix is not None else None

        relKey = self.colToRelType[colIndex]
        relType, argNames = relKey[0], relKey[1:]

        # Skip predictions whose entity types aren't valid for this relation type
        entityTypes = tuple(e.entityType for e in candidate.entities)
        if entityTypes not in self.relTypeToValidEntityTypes[relKey]:
            continue

        predictedRelations.append(kindred.Relation(relType, candidate.entities, argNames=argNames, probability=predictedProb))

    # Map every entity back to the index of the document that contains it
    entityToDocIndex = {}
    for docIndex, doc in enumerate(corpus.documents):
        for entity in doc.entities:
            entityToDocIndex[entity] = docIndex

    # Attach each predicted relation to its (single) containing document
    for predictedRelation in predictedRelations:
        docIDs = list({entityToDocIndex[e] for e in predictedRelation.entities})
        assert len(docIDs) > 0, "Predicted relation contains entities that don't match any documents in corpus"
        assert len(docIDs) == 1, "Predicted relation contains entities that are spread across documents"
        docID = docIDs[0]
        if predictedRelation not in corpus.documents[docID].relations:
            corpus.documents[docID].addRelation(predictedRelation)
def parseJSON(data, ignoreEntities=None):
    """
    Parse a PubAnnotation-style JSON structure into a kindred Document.

    :param data: Parsed JSON data (a dict, or a single-element list containing one dict)
    :param ignoreEntities: Entity types to skip while loading (default: keep all)
    :return: kindred.Document containing the text, entities and relations
    """
    # Avoid the shared-mutable-default-argument pitfall
    ignoreEntities = [] if ignoreEntities is None else ignoreEntities

    entities = []
    relations = []

    if isinstance(data, list):
        assert len(data) == 1 and isinstance(data[0], dict), "JSON loading expects a dictionary or a list with one dictionary in it"
        data = data[0]
    assert isinstance(data, dict), "JSON loading expects a dictionary or a list with one dictionary in it"

    text = data['text']
    if 'denotations' in data:
        for d in data['denotations']:
            sourceEntityID = None
            if 'id' in d:
                sourceEntityID = d['id']

            entityType = d['obj']
            span = d['span']
            startPos, endPos = span['begin'], span['end']
            position = [(startPos, endPos)]
            entityText = text[startPos:endPos]

            if entityType not in ignoreEntities:
                entity = kindred.Entity(entityType, entityText, position, sourceEntityID=sourceEntityID)
                entities.append(entity)

    sourceEntityIDToEntity = {entity.sourceEntityID: entity for entity in entities}

    if 'relations' in data:
        for r in data['relations']:
            obj = r['obj']
            relationType = r['pred']
            subj = r['subj']

            sourceEntityIDs = [obj, subj]
            argNames = ['obj', 'subj']

            # Give a clear error (instead of a bare KeyError) when a relation
            # references an entity ID that wasn't loaded
            for sourceEntityID in sourceEntityIDs:
                assert sourceEntityID in sourceEntityIDToEntity, "Relation references a non-existent entity (%s) in JSON" % sourceEntityID

            entitiesInRelation = [sourceEntityIDToEntity[sourceEntityID] for sourceEntityID in sourceEntityIDs]
            relation = kindred.Relation(relationType, entitiesInRelation, argNames)
            relations.append(relation)

    expected = ['denotations', 'divid', 'modifications', 'namespaces', 'project', 'relations', 'sourcedb', 'sourceid', 'target', 'text', 'tracks']
    extraFields = [k for k in data.keys() if k not in expected]
    assert len(extraFields) == 0, "Found additional unexpected fields (%s) in JSON" % (",".join(extraFields))

    combinedData = kindred.Document(text, entities=entities, relations=relations)
    return combinedData
# Renamed from test_saveStandoffFile_SeparateSentences: a second function with
# that exact name is defined later in this file, which shadowed this one so it
# never ran under pytest. This variant exercises loadDir and the getEntities /
# getRelations / getSourceEntityIDsToEntityIDs accessors.
def test_saveStandoffFile_SeparateSentences_loadDir():
    """Round-trip two SimpleTag documents through standoff save/loadDir and check entities and relations."""
    texts = [
        'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />',
        '<disease id="T1">Li-Fraumeni</disease> was caused by mutations in <gene id="T2">P53</gene><relation type="causes" subj="T2" obj="T1" />'
    ]

    corpus = kindred.Corpus()
    for t in texts:
        doc = kindred.Document(t, loadFromSimpleTag=True)
        corpus.addDocument(doc)

    tempDir = tempfile.mkdtemp()

    kindred.save(corpus, 'standoff', tempDir)

    loadedCorpus = kindred.loadDir('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 2

    data = loadedCorpus.documents[0]
    assert isinstance(data, kindred.Document)
    entities = data.getEntities()
    relations = data.getRelations()

    sourceEntityIDsToEntityIDs = data.getSourceEntityIDsToEntityIDs()

    assertEntity(entities[0], expectedType='disease', expectedText='colorectal cancer', expectedPos=[(4, 21)], expectedSourceEntityID="T1")
    assertEntity(entities[1], expectedType='gene', expectedText='APC', expectedPos=[(49, 52)], expectedSourceEntityID="T2")
    assert relations == [
        kindred.Relation('causes', [
            sourceEntityIDsToEntityIDs["T1"], sourceEntityIDsToEntityIDs["T2"]
        ], ['obj', 'subj'])
    ], "(%s) not as expected" % relations

    data = loadedCorpus.documents[1]
    assert isinstance(data, kindred.Document)
    entities = data.getEntities()
    relations = data.getRelations()

    sourceEntityIDsToEntityIDs = data.getSourceEntityIDsToEntityIDs()

    assertEntity(entities[0], expectedType='disease', expectedText='Li-Fraumeni', expectedPos=[(0, 11)], expectedSourceEntityID="T1")
    assertEntity(entities[1], expectedType='gene', expectedText='P53', expectedPos=[(39, 42)], expectedSourceEntityID="T2")
    assert relations == [
        kindred.Relation('causes', [
            sourceEntityIDsToEntityIDs["T1"], sourceEntityIDsToEntityIDs["T2"]
        ], ['obj', 'subj'])
    ], "(%s) not as expected" % relations

    # Clean up the temporary directory created by tempfile.mkdtemp
    shutil.rmtree(tempDir)
def test_saveStandoffFile_SeparateSentences():
    """Round-trip two SimpleTag documents through standoff save/load, validating the .a2 files on disk."""
    texts = [
        'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />',
        '<disease id="T1">Li-Fraumeni</disease> was caused by mutations in <gene id="T2">P53</gene><relation type="causes" subj="T2" obj="T1" />'
    ]

    corpus = kindred.Corpus()
    for t in texts:
        corpus.addDocument(kindred.Document(t, loadFromSimpleTag=True))

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        # Validate each generated relation-annotation (.a2) file
        for filename in os.listdir(tempDir):
            if filename.endswith('.a2'):
                checkRelationAnnotations(os.path.join(tempDir, filename))

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 2

    # Per-document expectations: (disease text, disease span, gene text, gene span)
    expectedPerDoc = [
        ('colorectal cancer', [(4, 21)], 'APC', [(49, 52)]),
        ('Li-Fraumeni', [(0, 11)], 'P53', [(39, 42)]),
    ]

    for data, (diseaseText, diseasePos, geneText, genePos) in zip(loadedCorpus.documents, expectedPerDoc):
        assert isinstance(data, kindred.Document)
        entities = data.entities
        relations = data.relations

        sourceEntityIDToEntity = {entity.sourceEntityID: entity for entity in entities}

        assertEntity(entities[0], expectedType='disease', expectedText=diseaseText, expectedPos=diseasePos, expectedSourceEntityID="T1")
        assertEntity(entities[1], expectedType='gene', expectedText=geneText, expectedPos=genePos, expectedSourceEntityID="T2")
        assert relations == [
            kindred.Relation(
                'causes',
                [sourceEntityIDToEntity["T1"], sourceEntityIDToEntity["T2"]],
                ['obj', 'subj'])
        ], "(%s) not as expected" % relations
def splitIntoSentences(self):
    """
    Create a new corpus with one document for each sentence in this document.

    :return: Corpus with one document per sentence
    :rtype: kindred.Corpus
    """
    sentenceCorpus = kindred.Corpus()

    for sentence in self.sentences:
        base = sentence.tokens[0].startPos

        sentenceEntities = [entity for entity, tokenIndices in sentence.entityAnnotations]

        # Clone each entity with character offsets shifted to be relative
        # to the start of this sentence (preserving annotation order)
        cloneByEntity = OrderedDict()
        for original in sentenceEntities:
            begin, end = original.position[0]
            shiftedPosition = [(begin - base, end - base)]
            cloneByEntity[original] = kindred.Entity(original.entityType, original.text, shiftedPosition, original.sourceEntityID, original.externalID)

        # Keep only relations whose entities all lie in this sentence,
        # remapped onto the cloned entities
        clonedRelations = []
        for rel in self.relations:
            if all(e in sentenceEntities for e in rel.entities):
                mappedEntities = [cloneByEntity[e] for e in rel.entities]
                clonedRelations.append(kindred.Relation(rel.relationType, mappedEntities, rel.argNames, rel.probability))

        doc = kindred.Document(sentence.text, list(cloneByEntity.values()), clonedRelations)

        # Tokens are shifted the same way so the new document is self-contained
        shiftedTokens = [
            kindred.Token(t.word, t.lemma, t.partofspeech, t.startPos - base, t.endPos - base)
            for t in sentence.tokens
        ]
        newSentence = kindred.Sentence(sentence.text, shiftedTokens, sentence.dependencies, sentence.sourceFilename)
        newSentence.entityAnnotations = [
            (cloneByEntity[e], tokenIndices) for e, tokenIndices in sentence.entityAnnotations
        ]

        doc.sentences = [newSentence]
        sentenceCorpus.addDocument(doc)

    return sentenceCorpus
def __init__(self,
             text,
             entities=None,
             relations=None,
             relationsUseSourceIDs=True,
             sourceFilename=None,
             metadata=None,
             loadFromSimpleTag=False):
    """
    Constructor for a Document that can take text using the SimpleTag XML format, or a set of Entities and Relations with associated text.

    :param text: Text in document (plain-text, or SimpleTag)
    :param entities: Entities in document
    :param relations: Relations in document
    :param relationsUseSourceIDs: Whether relations reference entities by their source entity IDs (and so need translating to internal IDs)
    :param sourceFilename: Name of the file this document was loaded from
    :param metadata: IDs and other information associated with the source (e.g. PMID)
    :param loadFromSimpleTag: Assumes the text parameter is in the SimpleTag format and will extract entities and relations accordingly
    :type text: str
    :type entities: list of kindred.Entity
    :type relations: list of kindred.Relation
    :type relationsUseSourceIDs: bool
    :type sourceFilename: str
    :type metadata: dict
    :type loadFromSimpleTag: bool
    """
    self.sourceFilename = sourceFilename
    # Default to a fresh dict per instance; a mutable default argument
    # ({}) would be shared across every Document created without metadata
    self.metadata = {} if metadata is None else metadata

    if loadFromSimpleTag:
        assert entities is None and relations is None, 'Entities and relations will be extracted from SimpleTag. They cannot also be passed in as parameters'
        dataToCopy = kindred.loadFunctions.parseSimpleTag(text)
        self.text = dataToCopy.getText()
        self.entities = dataToCopy.getEntities()
        self.relations = dataToCopy.getRelations()
    else:
        self.text = text

        if entities is None:
            self.entities = []
        else:
            assert isinstance(entities, list)
            for e in entities:
                assert isinstance(e, kindred.Entity)
            self.entities = entities

        if relations is None:
            self.relations = []
        else:
            assert isinstance(relations, list)
            for r in relations:
                assert isinstance(r, kindred.Relation)
            self.relations = relations

    # We'll need to translate source IDs to internal IDs
    if relationsUseSourceIDs and not loadFromSimpleTag:
        sourceEntityIDsToEntityIDs = self.getSourceEntityIDsToEntityIDs()
        sourceEntityIDs = sourceEntityIDsToEntityIDs.keys()
        correctedRelations = []
        for r in self.relations:
            for e in r.entityIDs:
                assert e in sourceEntityIDs, "Entities in relation must occur in the associated text. %s does not" % e
            relationEntityIDs = [sourceEntityIDsToEntityIDs[e] for e in r.entityIDs]
            correctedR = kindred.Relation(r.relationType, relationEntityIDs, r.argNames)
            correctedRelations.append(correctedR)
        self.relations = correctedRelations

    self.sentences = []