def test_saveStandoffFile_fromSimpleTag():
    """Round-trip one tagged document through the standoff format.

    Builds a corpus from a single tagged text, saves it as 'standoff' into a
    scratch directory, loads it back with ``kindred.loadDir`` and checks the
    two entities and the single 'causes' relation of the loaded document.
    """
    text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'

    corpus = kindred.Corpus()
    doc = kindred.Document(text)
    corpus.addDocument(doc)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus, 'standoff', tempDir)
        loadedCorpus = kindred.loadDir('standoff', tempDir)
    finally:
        # mkdtemp() leaves cleanup to the caller; remove the directory even
        # when saving or loading raises so a failing run does not leak it.
        shutil.rmtree(tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities = loadedDoc.getEntities()
    relations = loadedDoc.getRelations()
    sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

    assertEntity(entities[0], expectedType='disease', expectedText='colorectal cancer',
                 expectedPos=[(4, 21)], expectedSourceEntityID="T1")
    assertEntity(entities[1], expectedType='gene', expectedText='APC',
                 expectedPos=[(49, 52)], expectedSourceEntityID="T2")
    assert relations == [
        kindred.Relation(
            'causes',
            [sourceEntityIDsToEntityIDs["T1"], sourceEntityIDsToEntityIDs["T2"]],
            ['obj', 'subj'])
    ], "(%s) not as expected" % relations
def test_saveStandoffFile_SeparateSentences():
    """Round-trip a two-document corpus through the standoff format.

    Each tagged text becomes its own document. After saving as 'standoff' and
    loading back with ``kindred.loadDir``, both documents are checked for
    their disease/gene entities and their single 'causes' relation.
    """
    texts = [
        'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />',
        '<disease id="T1">Li-Fraumeni</disease> was caused by mutations in <gene id="T2">P53</gene><relation type="causes" subj="T2" obj="T1" />',
    ]

    corpus = kindred.Corpus()
    for t in texts:
        doc = kindred.Document(t)
        corpus.addDocument(doc)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus, 'standoff', tempDir)
        loadedCorpus = kindred.loadDir('standoff', tempDir)
    finally:
        # mkdtemp() leaves cleanup to the caller; remove the directory even
        # when saving or loading raises so a failing run does not leak it.
        shutil.rmtree(tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 2

    # Expected (disease text, disease span, gene text, gene span) for each
    # loaded document, in the order the documents are checked.
    expected = [
        ('colorectal cancer', [(4, 21)], 'APC', [(49, 52)]),
        ('Li-Fraumeni', [(0, 11)], 'P53', [(39, 42)]),
    ]
    for data, (diseaseText, diseasePos, geneText, genePos) in zip(loadedCorpus.documents, expected):
        assert isinstance(data, kindred.Document)

        entities = data.getEntities()
        relations = data.getRelations()
        sourceEntityIDsToEntityIDs = data.getSourceEntityIDsToEntityIDs()

        assertEntity(entities[0], expectedType='disease', expectedText=diseaseText,
                     expectedPos=diseasePos, expectedSourceEntityID="T1")
        assertEntity(entities[1], expectedType='gene', expectedText=geneText,
                     expectedPos=genePos, expectedSourceEntityID="T2")
        assert relations == [
            kindred.Relation(
                'causes',
                [sourceEntityIDsToEntityIDs["T1"], sourceEntityIDsToEntityIDs["T2"]],
                ['obj', 'subj'])
        ], "(%s) not as expected" % relations
def test_saveStandoffFile_noArgNames():
    """Round-trip a relation that was created without argument names.

    The relation is built from entity IDs only. After a standoff save and
    load, the test expects the loaded relation to carry the argument names
    'arg1' and 'arg2'.
    """
    text = "The colorectal cancer was caused by mutations in APC"
    e1 = kindred.Entity(entityType="disease", text="colorectal cancer",
                        position=[(4, 21)], sourceEntityID="T1")
    e2 = kindred.Entity(entityType="gene", text="APC",
                        position=[(49, 52)], sourceEntityID="T2")
    rel = kindred.Relation(relationType="causes", entityIDs=[e1.entityID, e2.entityID])
    doc = kindred.Document(text, [e1, e2], [rel], relationsUseSourceIDs=False)

    corpus = kindred.Corpus()
    corpus.addDocument(doc)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus, 'standoff', tempDir)
        loadedCorpus = kindred.loadDir('standoff', tempDir)
    finally:
        # mkdtemp() leaves cleanup to the caller; remove the directory even
        # when saving or loading raises so a failing run does not leak it.
        shutil.rmtree(tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities = loadedDoc.getEntities()
    relations = loadedDoc.getRelations()
    sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

    assertEntity(entities[0], expectedType='disease', expectedText='colorectal cancer',
                 expectedPos=[(4, 21)], expectedSourceEntityID="T1")
    assertEntity(entities[1], expectedType='gene', expectedText='APC',
                 expectedPos=[(49, 52)], expectedSourceEntityID="T2")
    assert relations == [
        kindred.Relation(
            'causes',
            [sourceEntityIDsToEntityIDs["T1"], sourceEntityIDsToEntityIDs["T2"]],
            ['arg1', 'arg2'])
    ], "(%s) not as expected" % relations
def test_iterLoadBiocFile():
    """Save 100 copies of one document as BioC and iterate over it in chunks.

    Every corpus yielded by ``kindred.iterLoadDataFromBioc`` is checked for
    its size bound and for the entities/relation of each document, and the
    total number of documents seen must equal the number written.
    """
    text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    corpus = kindred.Corpus(text, loadFromSimpleTag=True)
    docsToCreate = 100

    tempDir = tempfile.mkdtemp()
    try:
        singleDoc = corpus.documents[0]
        corpus.documents = [singleDoc for _ in range(docsToCreate)]

        kindred.save(corpus, 'bioc', tempDir)
        biocPath = os.path.join(tempDir, 'collection.bioc.xml')

        totalDocCount = 0
        # The loop consumes biocPath, so it stays inside the try block and the
        # directory is only removed once iteration has finished (or failed).
        # A distinct loop name avoids rebinding the corpus that was saved.
        for loadedCorpus in kindred.iterLoadDataFromBioc(biocPath, corpusSizeCutoff=3):
            assert isinstance(loadedCorpus, kindred.Corpus)
            # NOTE(review): the bound of 25 is much looser than
            # corpusSizeCutoff=3 - confirm this is the intended limit.
            assert len(loadedCorpus.documents) <= 25
            totalDocCount += len(loadedCorpus.documents)

            for doc in loadedCorpus.documents:
                assert isinstance(doc, kindred.Document)

                entities = doc.getEntities()
                relations = doc.getRelations()
                sourceEntityIDsToEntityIDs = doc.getSourceEntityIDsToEntityIDs()

                assertEntity(entities[0], expectedType='disease', expectedText='colorectal cancer',
                             expectedPos=[(4, 21)], expectedSourceEntityID="T1")
                assertEntity(entities[1], expectedType='gene', expectedText='APC',
                             expectedPos=[(49, 52)], expectedSourceEntityID="T2")
                assert relations == [
                    kindred.Relation(
                        'causes',
                        [sourceEntityIDsToEntityIDs["T1"], sourceEntityIDsToEntityIDs["T2"]],
                        ['obj', 'subj'])
                ], "(%s) not as expected" % relations

        assert totalDocCount == docsToCreate
    finally:
        # mkdtemp() leaves cleanup to the caller; remove the directory even
        # when an assertion above fails.
        shutil.rmtree(tempDir)
def test_saveStandoffFile_noSourceEntityID():
    """Saving an entity without a sourceEntityID as standoff must fail.

    The tagged text has no ``id`` attribute on its entity; the test expects
    ``kindred.save`` to raise an AssertionError with a specific message.
    """
    expectedMessage = 'Entities must have a sourceEntityID (e.g. T1) to be saved in the standoff format'
    corpus = kindred.Corpus('The <disease>colorectal cancer</disease> is bad.',
                            loadFromSimpleTag=True)

    with TempDir() as tempDir, pytest.raises(AssertionError) as excinfo:
        kindred.save(corpus, 'standoff', tempDir)

    # Checked after the raises-context has closed, otherwise it would never run.
    assert excinfo.value.args[0] == expectedMessage
def test_saveStandoffFile():
    """Round-trip a hand-built document through the standoff format.

    Two entities and one 'causes' relation are created directly, saved as
    'standoff', every written .a2 file is passed to checkRelationAnnotations,
    and the reloaded document is compared against the expected annotations.
    """
    text = "The colorectal cancer was caused by mutations in APC"
    e1 = kindred.Entity(entityType="disease",
                        text="colorectal cancer",
                        position=[(4, 21)],
                        sourceEntityID="T1")
    e2 = kindred.Entity(entityType="gene",
                        text="APC",
                        position=[(49, 52)],
                        sourceEntityID="T2")
    rel = kindred.Relation(relationType="causes",
                           entities=[e1, e2],
                           argNames=['obj', 'subj'])

    corpus = kindred.Corpus()
    corpus.addDocument(kindred.Document(text, [e1, e2], [rel]))

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        a2Paths = [os.path.join(tempDir, name)
                   for name in os.listdir(tempDir)
                   if name.endswith('.a2')]
        for a2Path in a2Paths:
            checkRelationAnnotations(a2Path)

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities = loadedDoc.entities
    relations = loadedDoc.relations
    # Look entities up by the ID they had in the source annotation.
    bySourceID = dict((entity.sourceEntityID, entity) for entity in entities)

    assertEntity(entities[0],
                 expectedType='disease',
                 expectedText='colorectal cancer',
                 expectedPos=[(4, 21)],
                 expectedSourceEntityID="T1")
    assertEntity(entities[1],
                 expectedType='gene',
                 expectedText='APC',
                 expectedPos=[(49, 52)],
                 expectedSourceEntityID="T2")

    expectedRelation = kindred.Relation('causes',
                                        [bySourceID["T1"], bySourceID["T2"]],
                                        ['obj', 'subj'],
                                        sourceRelationID='R1')
    assert relations == [expectedRelation], "(%s) not as expected" % relations
def test_saveBB3Data():
    """Save the BB3 event training corpus as standoff and reload it.

    Only the document count is compared between the original and the
    reloaded corpus.
    """
    corpus = kindred.bionlpst.load('2016-BB3-event-train')
    assert isinstance(corpus, kindred.Corpus)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus, 'standoff', tempDir)
        loadedCorpus = kindred.loadDir('standoff', tempDir)
    finally:
        # mkdtemp() leaves cleanup to the caller; remove the directory even
        # when saving or loading raises so a failing run does not leak it.
        shutil.rmtree(tempDir)

    assert len(corpus.documents) == len(loadedCorpus.documents)
def test_saveBB3Data():
    """Save the BB3 event training corpus as standoff and reload it.

    Every written .a2 file is passed to checkRelationAnnotations, and the
    reloaded corpus must contain as many documents as the original.
    """
    corpus = kindred.bionlpst.load('2016-BB3-event-train')
    assert isinstance(corpus, kindred.Corpus)

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        a2Paths = [os.path.join(tempDir, name)
                   for name in os.listdir(tempDir)
                   if name.endswith('.a2')]
        for a2Path in a2Paths:
            checkRelationAnnotations(a2Path)

        loadedCorpus = kindred.load('standoff', tempDir)

    originalCount = len(corpus.documents)
    assert originalCount == len(loadedCorpus.documents)
def test_iterLoadBiocFile():
    """Save 100 copies of one document as BioC XML and iterate over it.

    Each corpus yielded by ``kindred.iterLoad`` is checked for its size bound
    and for the annotations of every document; the number of documents seen
    overall must equal the number written.
    """
    text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    corpus = kindred.Corpus(text, loadFromSimpleTag=True)
    docsToCreate = 100

    with TempDir() as tempDir:
        # The same document object is referenced docsToCreate times.
        corpus.documents = [corpus.documents[0]] * docsToCreate

        tempFile = os.path.join(tempDir, 'corpus.bioc.xml')
        kindred.save(corpus, 'biocxml', tempFile)

        totalDocCount = 0
        for chunk in kindred.iterLoad('biocxml', tempFile, corpusSizeCutoff=3):
            assert isinstance(chunk, kindred.Corpus)
            chunkSize = len(chunk.documents)
            assert chunkSize <= 25
            totalDocCount += chunkSize

            for doc in chunk.documents:
                assert isinstance(doc, kindred.Document)

                entities = doc.entities
                relations = doc.relations
                # Look entities up by the ID they had in the source annotation.
                bySourceID = dict((entity.sourceEntityID, entity) for entity in entities)

                assertEntity(entities[0],
                             expectedType='disease',
                             expectedText='colorectal cancer',
                             expectedPos=[(4, 21)],
                             expectedSourceEntityID="T1")
                assertEntity(entities[1],
                             expectedType='gene',
                             expectedText='APC',
                             expectedPos=[(49, 52)],
                             expectedSourceEntityID="T2")

                expectedRelation = kindred.Relation('causes',
                                                    [bySourceID["T1"], bySourceID["T2"]],
                                                    ['obj', 'subj'])
                assert relations == [expectedRelation], "(%s) not as expected" % relations

        assert totalDocCount == docsToCreate
def test_saveStandoffFile_fromSimpleTag_triple():
    """Round-trip a three-argument relation through the standoff format.

    The tagged text holds a drug, a gene and a disease joined by one
    'druginfo' relation. Every written .a2 file is passed to
    checkRelationAnnotations before the corpus is loaded back and compared.
    """
    text = '<drug id="T1">Erlotinib</drug>, a <gene id="T2">EGFR</gene> inhibitor is commonly used for <disease id="T3">NSCLC</disease> patients. <relation type="druginfo" drug="T1" gene="T2" disease="T3" />'
    corpus = kindred.Corpus(text, loadFromSimpleTag=True)

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        a2Paths = [os.path.join(tempDir, name)
                   for name in os.listdir(tempDir)
                   if name.endswith('.a2')]
        for a2Path in a2Paths:
            checkRelationAnnotations(a2Path)

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities = loadedDoc.entities
    relations = loadedDoc.relations
    # Look entities up by the ID they had in the source annotation.
    bySourceID = dict((entity.sourceEntityID, entity) for entity in entities)

    assertEntity(entities[0],
                 expectedType='drug',
                 expectedText='Erlotinib',
                 expectedPos=[(0, 9)],
                 expectedSourceEntityID="T1")
    assertEntity(entities[1],
                 expectedType='gene',
                 expectedText='EGFR',
                 expectedPos=[(13, 17)],
                 expectedSourceEntityID="T2")
    assertEntity(entities[2],
                 expectedType='disease',
                 expectedText='NSCLC',
                 expectedPos=[(49, 54)],
                 expectedSourceEntityID="T3")

    # Entities are listed in the order of the argument names below.
    expectedRelation = kindred.Relation(
        'druginfo',
        [bySourceID["T3"], bySourceID["T1"], bySourceID["T2"]],
        ['disease', 'drug', 'gene'],
        sourceRelationID='R1')
    assert relations == [expectedRelation], "(%s) not as expected" % relations
def test_saveStandoffFile_fromSimpleTag_binary():
    """Round-trip a two-argument relation through the standoff format.

    The tagged text holds a disease and a gene joined by one 'causes'
    relation. Every written .a2 file is passed to checkRelationAnnotations
    before the corpus is loaded back and compared.
    """
    text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    corpus = kindred.Corpus(text, loadFromSimpleTag=True)

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        a2Paths = [os.path.join(tempDir, name)
                   for name in os.listdir(tempDir)
                   if name.endswith('.a2')]
        for a2Path in a2Paths:
            checkRelationAnnotations(a2Path)

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities = loadedDoc.entities
    relations = loadedDoc.relations
    # Look entities up by the ID they had in the source annotation.
    bySourceID = dict((entity.sourceEntityID, entity) for entity in entities)

    assertEntity(entities[0],
                 expectedType='disease',
                 expectedText='colorectal cancer',
                 expectedPos=[(4, 21)],
                 expectedSourceEntityID="T1")
    assertEntity(entities[1],
                 expectedType='gene',
                 expectedText='APC',
                 expectedPos=[(49, 52)],
                 expectedSourceEntityID="T2")

    # Show the relation IDs that were loaded (visible in pytest output on failure).
    print([r.sourceRelationID for r in relations])

    expectedRelation = kindred.Relation('causes',
                                        [bySourceID["T1"], bySourceID["T2"]],
                                        ['obj', 'subj'],
                                        sourceRelationID='R1')
    assert relations == [expectedRelation], "(%s) not as expected" % relations
def test_saveStandoffFile_fromSimpleTag_triple():
    """Round-trip a three-argument relation through the standoff format.

    The tagged text holds a drug, a gene and a disease joined by one
    'druginfo' relation. The corpus is saved as 'standoff', loaded back with
    ``kindred.loadDir`` and its entities and relation are checked.
    """
    text = '<drug id="T1">Erlotinib</drug>, a <gene id="T2">EGFR</gene> inhibitor is commonly used for <disease id="T3">NSCLC</disease> patients. <relation type="druginfo" drug="T1" gene="T2" disease="T3" />'
    corpus = kindred.Corpus(text, loadFromSimpleTag=True)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus, 'standoff', tempDir)
        loadedCorpus = kindred.loadDir('standoff', tempDir)
    finally:
        # Previously the directory was only removed when both calls above
        # succeeded; clean it up on failure as well.
        shutil.rmtree(tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities = loadedDoc.getEntities()
    relations = loadedDoc.getRelations()
    sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

    assertEntity(entities[0],
                 expectedType='drug',
                 expectedText='Erlotinib',
                 expectedPos=[(0, 9)],
                 expectedSourceEntityID="T1")
    assertEntity(entities[1],
                 expectedType='gene',
                 expectedText='EGFR',
                 expectedPos=[(13, 17)],
                 expectedSourceEntityID="T2")
    assertEntity(entities[2],
                 expectedType='disease',
                 expectedText='NSCLC',
                 expectedPos=[(49, 54)],
                 expectedSourceEntityID="T3")

    # Entity IDs are listed in the order of the argument names below.
    assert relations == [
        kindred.Relation('druginfo', [
            sourceEntityIDsToEntityIDs["T3"],
            sourceEntityIDsToEntityIDs["T1"],
            sourceEntityIDsToEntityIDs["T2"]
        ], ['disease', 'drug', 'gene'])
    ], "(%s) not as expected" % relations
def test_savePubAnnotationFile_fromSimpleTag():
    """Round-trip a tagged document through the 'pubannotation' format.

    The corpus is written to a single JSON file, loaded back, and its two
    entities and single 'causes' relation are compared with the expectation.
    """
    text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    corpus = kindred.Corpus(text, loadFromSimpleTag=True)

    with TempDir() as tempDir:
        tempFile = os.path.join(tempDir, 'corpus.json')
        kindred.save(corpus, 'pubannotation', tempFile)
        loadedCorpus = kindred.load('pubannotation', tempFile)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities = loadedDoc.entities
    relations = loadedDoc.relations
    # Look entities up by the ID they had in the source annotation.
    bySourceID = dict((entity.sourceEntityID, entity) for entity in entities)

    assertEntity(entities[0],
                 expectedType='disease',
                 expectedText='colorectal cancer',
                 expectedPos=[(4, 21)],
                 expectedSourceEntityID="T1")
    assertEntity(entities[1],
                 expectedType='gene',
                 expectedText='APC',
                 expectedPos=[(49, 52)],
                 expectedSourceEntityID="T2")

    # Show the relation IDs that were loaded (visible in pytest output on failure).
    print([r.sourceRelationID for r in relations])

    expectedRelation = kindred.Relation('causes',
                                        [bySourceID["T1"], bySourceID["T2"]],
                                        ['subj', 'obj'],
                                        sourceRelationID='R1')
    assert relations == [expectedRelation], "(%s) not as expected" % relations
def _bionlpst_seedev_testSet():
    """Train on SeeDev train+dev, predict on the test set and save the result.

    The test corpus and the predicted relations are written in standoff
    format to the 'out.SeeDev' directory.
    """
    # Loaded in train, dev, test order.
    trainData, devData, testData = [
        kindred.bionlpst.load(name)
        for name in ('2016-SeeDev-binary-train',
                     '2016-SeeDev-binary-dev',
                     '2016-SeeDev-binary-test')
    ]

    print("Starting training...")
    classifier = kindred.RelationClassifier()
    classifier.train(trainData + devData)

    print("Predicting training...")
    predictedRelations = classifier.predict(testData)

    print("Saving...")
    kindred.save(testData,
                 'standoff',
                 'out.SeeDev',
                 predictedRelations=predictedRelations)
def _bionlpst_bb3_testSet():
    """Train on BB3 event train+dev, predict on the test set and save the result.

    The classifier is created with ``useBuilder=True``. The test corpus and
    the predicted relations are written in standoff format to 'out.BB3'.
    """
    # Loaded in train, dev, test order.
    trainData, devData, testData = [
        kindred.bionlpst.load(name)
        for name in ('2016-BB3-event-train',
                     '2016-BB3-event-dev',
                     '2016-BB3-event-test')
    ]

    print("Starting training...")
    classifier = kindred.RelationClassifier(useBuilder=True)
    classifier.train(trainData + devData)

    print("Predicting training...")
    predictedRelations = classifier.predict(testData)

    print("Saving...")
    kindred.save(testData,
                 'standoff',
                 'out.BB3',
                 predictedRelations=predictedRelations)
# Excerpt from a larger script; the code that binds `doc`, `sentence`, `sentenceCorpus` and `args` is not visible here.
# For a sentence whose entity annotations include both a "Mutation" and a "Chemical" entity, each entity is rebuilt with
# its first position span shifted by the start offset of the sentence's first token, the rebuilt entities are wrapped in a
# new kindred.Document (sentence text, metadata taken from `doc`) and added to `sentenceCorpus`; `sentenceCorpus` is then
# saved in 'biocxml' format to args.outBioc.
# NOTE(review): only position[0] of each entity is carried over, and `entityInfo` is computed but not used in this excerpt.
entityTypes = set([ entity.entityType for entity, tokenIndices in sentence.entityAnnotations ]) entityInfo = [(e.entityType, e.text) for e, tokenIndices in sentence.entityAnnotations ] hasMutation = "Mutation" in entityTypes hasChemical = "Chemical" in entityTypes if hasMutation and hasChemical: sentenceStart = sentence.tokens[0].startPos sentenceEntities = [ kindred.Entity(e.entityType, e.text, [(e.position[0][0] - sentenceStart, e.position[0][1] - sentenceStart)], e.sourceEntityID, e.externalID, metadata=e.metadata) for e, _ in sentence.entityAnnotations ] newDoc = kindred.Document(sentence.text, sentenceEntities, metadata=doc.metadata) sentenceCorpus.addDocument(newDoc) kindred.save(sentenceCorpus, 'biocxml', args.outBioc)
import os

if __name__ == '__main__':
    # Command-line entry point: train a relation classifier on one standoff
    # corpus, apply it to another, save the result and print the predictions.
    parser = argparse.ArgumentParser(description='Use annotated sentences to build a Kindred classifer and apply to unannotated sentences')
    for flag, helpText in (
            ('--dataToBuildModel', 'Sentences with relations'),
            ('--dataToApplyModel', 'Sentences without annotated relations to make predictions on'),
            ('--outDir', 'Directory to store output'),
    ):
        parser.add_argument(flag, required=True, type=str, help=helpText)
    args = parser.parse_args()

    print("Loading corpora...")
    trainCorpus = kindred.load('standoff', args.dataToBuildModel)
    predictionCorpus = kindred.load('standoff', args.dataToApplyModel)

    print("Building classifier...")
    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)

    print("Applying classifier...")
    classifier.predict(predictionCorpus)

    # Create the output directory only when it does not exist yet.
    outDirMissing = not os.path.isdir(args.outDir)
    if outDirMissing:
        os.makedirs(args.outDir)

    print("Saving results to directory...")
    kindred.save(predictionCorpus, 'standoff', args.outDir)

    print("\nPredicted relations:")
    for relation in predictionCorpus.getRelations():
        # Only the first two entities of each relation are printed.
        firstEntity = relation.entities[0]
        secondEntity = relation.entities[1]
        print("%s\t%s" % (firstEntity.text, secondEntity.text))
# Excerpt from a larger script; `corpus` and `QuantityRecognizer` are defined outside this view.
# Steps: split `corpus` into sentences, annotate the words 'voltage' and 'current' as 'measurement' entities via a
# wordlist-based kindred.EntityRecognizer, run QuantityRecognizer over the same sentences, build candidate relations
# restricted to the ('measurement', 'quantity') entity-type pair, collect manual annotations with kindred.manuallyAnnotate,
# create the 'numericalAnnotations' directory if it does not exist, and save the `withRelations` corpus there as 'standoff'.
# NOTE(review): `noRelations` is not used in this excerpt.
sentenceCorpus = corpus.splitIntoSentences() print("Looking for measurement words, e.g. voltage") wordlist = { ('voltage', ): {('measurement', 'voltage')}, ('current', ): {('measurement', 'current')} } entityRecognizer = kindred.EntityRecognizer(wordlist) entityRecognizer.annotate(sentenceCorpus) print("Looking for numeric values") quantityRecognizer = QuantityRecognizer() quantityRecognizer.annotate(sentenceCorpus) print("Find every pair of a measurement word and a value") candidateBuilder = kindred.CandidateBuilder( acceptedEntityTypes=[('measurement', 'quantity')]) candidateRelations = candidateBuilder.build(sentenceCorpus) print("Let's annotate a few") withRelations, noRelations = kindred.manuallyAnnotate( sentenceCorpus, candidateRelations) outDir = 'numericalAnnotations' if not os.path.isdir(outDir): os.makedirs(outDir) print("Saving results to directory...") kindred.save(withRelations, 'standoff', outDir)
# Excerpt from a larger script; `wordlistDict`, `sentenceCorpus`, `annotatedDir` and `unannotatedDir` are defined outside this view.
# Steps: load the wordlists named in `wordlistDict` (column 0 is used for both IDs and terms), annotate `sentenceCorpus`
# with them, build candidate relations with one entity per wordlist (entityCount=len(wordlistDict)) restricted to the single
# tuple of sorted wordlist keys, collect manual annotations with kindred.manuallyAnnotate, then save the annotated corpus to
# `annotatedDir` and the unannotated corpus to `unannotatedDir`, both as 'standoff'.
# NOTE(review): `acceptedEntityTypes = wordlistDict` is assigned but not used in this excerpt.
# NOTE(review): the prompt "Time to through some of the candidate relations..." looks like it is missing a word (e.g. "go") - confirm before changing the string.
wordlistLookup = kindred.EntityRecognizer.loadWordlists(wordlistDict, idColumn=0, termsColumn=0) print("Annotating entities in corpus with wordlists") entityRecognizer = kindred.EntityRecognizer(wordlistLookup) entityRecognizer.annotate(sentenceCorpus) print("Finding all candidate relations") acceptedEntityTypes = wordlistDict candidateBuilder = kindred.CandidateBuilder( entityCount=len(wordlistDict), acceptedEntityTypes=[tuple(sorted(wordlistDict.keys()))]) candidateRelations = candidateBuilder.build(sentenceCorpus) print( "Time to through some of the candidate relations and annotate some...") annotatedCorpus, unannotatedCorpus = kindred.manuallyAnnotate( sentenceCorpus, candidateRelations) print( "\nSaving annotated corpus of %d sentences (with relations that you have just annotated)" % len(annotatedCorpus.documents)) kindred.save(annotatedCorpus, 'standoff', annotatedDir) print( "Saving unannotated corpus of %d sentences (which you did not review)" % len(unannotatedCorpus.documents)) kindred.save(unannotatedCorpus, 'standoff', unannotatedDir)
def test_saveStandoffFile_SeparateSentences():
    """Round-trip a two-document corpus through the standoff format.

    Each tagged text becomes its own document. Every written .a2 file is
    passed to checkRelationAnnotations, then both reloaded documents are
    checked for their disease/gene entities and single 'causes' relation.
    """
    texts = [
        'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />',
        '<disease id="T1">Li-Fraumeni</disease> was caused by mutations in <gene id="T2">P53</gene><relation type="causes" subj="T2" obj="T1" />'
    ]

    corpus = kindred.Corpus()
    for t in texts:
        corpus.addDocument(kindred.Document(t, loadFromSimpleTag=True))

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        a2Paths = [os.path.join(tempDir, name)
                   for name in os.listdir(tempDir)
                   if name.endswith('.a2')]
        for a2Path in a2Paths:
            checkRelationAnnotations(a2Path)

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 2

    # Expected (disease text, disease span, gene text, gene span) for the
    # document at the same index of the loaded corpus.
    expectations = [
        ('colorectal cancer', [(4, 21)], 'APC', [(49, 52)]),
        ('Li-Fraumeni', [(0, 11)], 'P53', [(39, 42)]),
    ]
    for index, (diseaseText, diseasePos, geneText, genePos) in enumerate(expectations):
        data = loadedCorpus.documents[index]
        assert isinstance(data, kindred.Document)

        entities = data.entities
        relations = data.relations
        # Look entities up by the ID they had in the source annotation.
        bySourceID = dict((entity.sourceEntityID, entity) for entity in entities)

        assertEntity(entities[0],
                     expectedType='disease',
                     expectedText=diseaseText,
                     expectedPos=diseasePos,
                     expectedSourceEntityID="T1")
        assertEntity(entities[1],
                     expectedType='gene',
                     expectedText=geneText,
                     expectedPos=genePos,
                     expectedSourceEntityID="T2")

        expectedRelation = kindred.Relation('causes',
                                            [bySourceID["T1"], bySourceID["T2"]],
                                            ['obj', 'subj'])
        assert relations == [expectedRelation], "(%s) not as expected" % relations
if not (e.entityType == 'Chemical' and not e.metadata['conceptid'] in drugMeshIDs) ] doc.entities = [ e for e in doc.entities if not (e.entityType == 'Chemical' and len(e.text) <= 4) ] doc.entities = [ e for e in doc.entities if not (e.entityType == 'Mutation' and pgmine.normalizeMutation(e.text) is None) ] doc.entities = [ e for e in doc.entities if not (e.entityType == 'Mutation' and e.text.lower() in variantStopwords) ] entityTypes = set(e.entityType for e in doc.entities) if 'Chemical' in entityTypes and 'Mutation' in entityTypes: filtered.append(doc) corpus.documents += filtered print("Found: ", len(corpus.documents)) corpus.documents = random.sample(corpus.documents, 500) kindred.save(corpus, 'standoff', args.outDir) kindred.save(corpus, 'biocxml', os.path.join(args.outDir, 'corpus.bioc.xml'))