Exemple #1
0
def test_saveStandoffFile_noArgNames():
    """
    Round-trip a hand-built corpus through the standoff format.

    The relation is created without argument names; after saving and
    reloading it is expected to carry the names 'arg1' and 'arg2'.
    """
    text = "The colorectal cancer was caused by mutations in APC"
    e1 = kindred.Entity(entityType="disease",text="colorectal cancer",position=[(4, 21)],sourceEntityID="T1")
    e2 = kindred.Entity(entityType="gene",text="APC",position=[(49, 52)],sourceEntityID="T2")
    rel = kindred.Relation(relationType="causes",entityIDs=[e1.entityID,e2.entityID])
    doc = kindred.Document(text,[e1,e2],[rel],relationsUseSourceIDs=False)
    corpus = kindred.Corpus()
    corpus.addDocument(doc)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus,'standoff',tempDir)
        loadedCorpus = kindred.loadDir('standoff',tempDir)
    finally:
        # Always remove the scratch directory, even when saving or loading
        # fails, so failed runs do not leave temp dirs behind.
        shutil.rmtree(tempDir)

    assert isinstance(loadedCorpus,kindred.Corpus)
    assert len(loadedCorpus.documents) == 1
    loadedDoc = loadedCorpus.documents[0]

    assert isinstance(loadedDoc,kindred.Document)
    entities = loadedDoc.getEntities()
    relations = loadedDoc.getRelations()

    sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

    assertEntity(entities[0],expectedType='disease',expectedText='colorectal cancer',expectedPos=[(4,21)],expectedSourceEntityID="T1")
    assertEntity(entities[1],expectedType='gene',expectedText='APC',expectedPos=[(49,52)],expectedSourceEntityID="T2")
    assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['arg1','arg2'])], "(%s) not as expected" % relations
Exemple #2
0
def load(taskName,ignoreEntities=[]):
    """
    Download and load the corresponding corpus from the BioNLP Shared Task

    :param taskName: The name of the shared task to download (e.g. 'BioNLP-ST-2016_BB-event_train'). Use kindred.bionlpst.listTasks() to get a list of valid options
    :param ignoreEntities: A list of any entities that should be ignored during loading
    :type taskName: str
    :type ignoreEntities: list of str
    :return: The loaded corpus
    :rtype: kindred.Corpus
    :raises AssertionError: If taskName is not one of the keys of taskOptions
    """
    # Validate the task name before touching the filesystem so that an
    # invalid name does not leave an orphaned temporary directory behind.
    assert taskName in taskOptions.keys(), "%s not a valid option in %s" % (taskName, taskOptions.keys())
    url,expectedFile,expectedSHA256 = taskOptions[taskName]
    filesToDownload = [(url,expectedFile,expectedSHA256)]
    # The archive is expected to unpack into a directory named after the zip
    expectedDir = expectedFile.replace('.zip','')

    tempDir = tempfile.mkdtemp()
    try:
        kindred.utils._downloadFiles(filesToDownload,tempDir)
        mainDir = kindred.utils._findDir(expectedDir,tempDir)
        corpus = kindred.loadDir(dataFormat='standoff',directory=mainDir,ignoreEntities=ignoreEntities)
    finally:
        # Clean up on every path (download, directory lookup or parsing
        # failure as well as success); the original exception propagates.
        shutil.rmtree(tempDir)

    return corpus
Exemple #3
0
def test_saveStandoffFile_fromSimpleTag():
    """
    Round-trip a document built from SimpleTag-style annotated text
    through the standoff format and check entities and the relation.
    """
    text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    corpus = kindred.Corpus()
    doc = kindred.Document(text)
    corpus.addDocument(doc)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus,'standoff',tempDir)
        loadedCorpus = kindred.loadDir('standoff',tempDir)
    finally:
        # Always remove the scratch directory, even when saving or loading
        # fails, so failed runs do not leave temp dirs behind.
        shutil.rmtree(tempDir)

    assert isinstance(loadedCorpus,kindred.Corpus)
    assert len(loadedCorpus.documents) == 1
    loadedDoc = loadedCorpus.documents[0]

    assert isinstance(loadedDoc,kindred.Document)
    entities = loadedDoc.getEntities()
    relations = loadedDoc.getRelations()

    sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

    assertEntity(entities[0],expectedType='disease',expectedText='colorectal cancer',expectedPos=[(4,21)],expectedSourceEntityID="T1")
    assertEntity(entities[1],expectedType='gene',expectedText='APC',expectedPos=[(49,52)],expectedSourceEntityID="T2")
    assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['obj','subj'])], "(%s) not as expected" % relations
Exemple #4
0
def test_saveStandoffFile_SeparateSentences():
    """
    Save two independently annotated documents in standoff format and
    check that both come back with their own entities and relation.
    """
    texts = ['The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />','<disease id="T1">Li-Fraumeni</disease> was caused by mutations in <gene id="T2">P53</gene><relation type="causes" subj="T2" obj="T1" />']
    corpus = kindred.Corpus()
    for t in texts:
        doc = kindred.Document(t)
        corpus.addDocument(doc)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus,'standoff',tempDir)
        loadedCorpus = kindred.loadDir('standoff',tempDir)
    finally:
        # Always remove the scratch directory, even when saving or loading
        # fails, so failed runs do not leave temp dirs behind.
        shutil.rmtree(tempDir)

    assert isinstance(loadedCorpus,kindred.Corpus)
    assert len(loadedCorpus.documents) == 2

    # Per document: (disease text, disease position, gene text, gene position)
    expectedPerDocument = [
        ('colorectal cancer',[(4,21)],'APC',[(49,52)]),
        ('Li-Fraumeni',[(0,11)],'P53',[(39,42)]),
    ]

    for docIndex,(diseaseText,diseasePos,geneText,genePos) in enumerate(expectedPerDocument):
        data = loadedCorpus.documents[docIndex]
        assert isinstance(data,kindred.Document)
        entities = data.getEntities()
        relations = data.getRelations()
        sourceEntityIDsToEntityIDs = data.getSourceEntityIDsToEntityIDs()
        assertEntity(entities[0],expectedType='disease',expectedText=diseaseText,expectedPos=diseasePos,expectedSourceEntityID="T1")
        assertEntity(entities[1],expectedType='gene',expectedText=geneText,expectedPos=genePos,expectedSourceEntityID="T2")
        assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['obj','subj'])], "(%s) not as expected" % relations
Exemple #5
0
def test_loadBiocFile_dir():
    """
    Load the 'data' directory next to this file as a BioC corpus and
    check the single document's entities and relation.
    """
    dataPath = os.path.join(os.path.dirname(__file__), 'data')
    corpus = kindred.loadDir(dataFormat='bioc', directory=dataPath)

    assert isinstance(corpus, kindred.Corpus)
    assert len(corpus.documents) == 1

    loadedDoc = corpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities = loadedDoc.getEntities()
    relations = loadedDoc.getRelations()
    idLookup = loadedDoc.getSourceEntityIDsToEntityIDs()

    # (type, text, position, source ID) for each entity, in document order
    expectedEntities = (
        ('disease', 'colorectal cancer', [(4, 21)], "T1"),
        ('gene', 'APC', [(49, 52)], "T2"),
    )
    for index, (entityType, entityText, entityPos, sourceID) in enumerate(expectedEntities):
        assertEntity(entities[index],
                     expectedType=entityType,
                     expectedText=entityText,
                     expectedPos=entityPos,
                     expectedSourceEntityID=sourceID)

    relationEntityIDs = [idLookup["T1"], idLookup["T2"]]
    assert relations == [
        kindred.Relation('causes', relationEntityIDs, ['obj', 'subj'])
    ], "(%s) not as expected" % relations
Exemple #6
0
def test_loadEmptyDirectory():
    """
    Loading an empty directory must raise a RuntimeError with the
    expected message for every supported data format.
    """
    tempDir = tempfile.mkdtemp()
    try:
        for dataformat in ['standoff','simpletag','json','bioc']:
            with pytest.raises(RuntimeError) as excinfo:
                kindred.loadDir(dataformat,tempDir)
            expectedError = 'No documents loaded from directory (%s/). Are you sure this directory contains the corpus (format: %s)' % (tempDir.rstrip('/'),dataformat)
            assert excinfo.value.args == (expectedError ,)
    finally:
        # Remove the scratch directory even when a check above fails
        shutil.rmtree(tempDir)
Exemple #7
0
def test_saveBB3Data():
    """
    Save the '2016-BB3-event-train' BioNLP corpus in standoff format and
    check that reloading it yields the same number of documents.
    """
    corpus = kindred.bionlpst.load('2016-BB3-event-train')
    assert isinstance(corpus,kindred.Corpus)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus,'standoff',tempDir)
        loadedCorpus = kindred.loadDir('standoff',tempDir)
    finally:
        # Always remove the scratch directory, even when saving or loading
        # fails, so failed runs do not leave temp dirs behind.
        shutil.rmtree(tempDir)

    assert len(corpus.documents) == len(loadedCorpus.documents)
Exemple #8
0
def test_saveStandoffFile_fromSimpleTag_triple():
    """
    Round-trip a corpus containing a three-argument relation through
    the standoff format and check the entities and the relation.
    """
    text = '<drug id="T1">Erlotinib</drug>, a <gene id="T2">EGFR</gene> inhibitor is commonly used for <disease id="T3">NSCLC</disease> patients. <relation type="druginfo" drug="T1" gene="T2" disease="T3" />'
    corpus = kindred.Corpus(text, loadFromSimpleTag=True)

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus, 'standoff', tempDir)
        loadedCorpus = kindred.loadDir('standoff', tempDir)
    finally:
        # Remove the scratch directory even if saving or loading raises
        shutil.rmtree(tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1
    loadedDoc = loadedCorpus.documents[0]

    assert isinstance(loadedDoc, kindred.Document)
    entities = loadedDoc.getEntities()
    relations = loadedDoc.getRelations()

    sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

    # (type, text, position, source ID) for each entity, in document order
    expectedEntities = [
        ('drug', 'Erlotinib', [(0, 9)], "T1"),
        ('gene', 'EGFR', [(13, 17)], "T2"),
        ('disease', 'NSCLC', [(49, 54)], "T3"),
    ]
    for index, (entityType, entityText, entityPos, sourceID) in enumerate(expectedEntities):
        assertEntity(entities[index],
                     expectedType=entityType,
                     expectedText=entityText,
                     expectedPos=entityPos,
                     expectedSourceEntityID=sourceID)

    # The expected relation lists its arguments as disease, drug, gene
    assert relations == [
        kindred.Relation('druginfo', [
            sourceEntityIDsToEntityIDs["T3"], sourceEntityIDsToEntityIDs["T1"],
            sourceEntityIDsToEntityIDs["T2"]
        ], ['disease', 'drug', 'gene'])
    ], "(%s) not as expected" % relations
Exemple #9
0
    )
    parser.add_argument('--reltype',
                        type=str,
                        required=True,
                        help='Relation type to analyze. Must be one of %s' %
                        reltypes)
    parser.add_argument('--outCurve',
                        type=str,
                        required=True,
                        help='File to output curve data to')
    args = parser.parse_args()

    with open(args.outCurve, 'w') as outF:
        outF.write("%s\t%s\t%s\n" % ('threshold', 'precision', 'recall'))
        for threshold in [-0.1] + list(np.arange(0, 1, 0.01)) + [1.0]:
            train = kindred.loadDir('standoff', args.trainDir)
            gold = kindred.loadDir('standoff', args.testDir)

            # Trim back to relation type of choice
            for doc in train.documents:
                doc.relations = [
                    r for r in doc.relations if r.relationType == args.reltype
                ]
            for doc in gold.documents:
                doc.relations = [
                    r for r in doc.relations if r.relationType == args.reltype
                ]

            entityType = entityTypes[args.reltype]
            entityCount = len(entityType)
            classifier = kindred.RelationClassifier(