Exemple #1
0
def test_saveStandoffFile_noArgNames():
	"""
	Save a one-document corpus in standoff format and load it back, using a
	relation that was created without explicit argument names.

	The reloaded relation is expected to carry the argument names 'arg1' and 'arg2'.
	"""
	text = "The colorectal cancer was caused by mutations in APC"
	e1 = kindred.Entity(entityType="disease",text="colorectal cancer",position=[(4, 21)],sourceEntityID="T1")
	e2 = kindred.Entity(entityType="gene",text="APC",position=[(49, 52)],sourceEntityID="T2")
	rel = kindred.Relation(relationType="causes",entityIDs=[e1.entityID,e2.entityID])
	doc = kindred.Document(text,[e1,e2],[rel],relationsUseSourceIDs=False)
	corpus = kindred.Corpus()
	corpus.addDocument(doc)

	tempDir = tempfile.mkdtemp()
	try:
		kindred.save(corpus,'standoff',tempDir)

		loadedCorpus = kindred.loadDir('standoff',tempDir)

		assert isinstance(loadedCorpus,kindred.Corpus)
		assert len(loadedCorpus.documents) == 1
		loadedDoc = loadedCorpus.documents[0]

		assert isinstance(loadedDoc,kindred.Document)
		entities = loadedDoc.getEntities()
		relations = loadedDoc.getRelations()

		sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

		assertEntity(entities[0],expectedType='disease',expectedText='colorectal cancer',expectedPos=[(4,21)],expectedSourceEntityID="T1")
		assertEntity(entities[1],expectedType='gene',expectedText='APC',expectedPos=[(49,52)],expectedSourceEntityID="T2")
		assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['arg1','arg2'])], "(%s) not as expected" % relations
	finally:
		# Remove the scratch directory even when save/load or an assertion fails,
		# so failing runs do not leave temporary directories behind
		shutil.rmtree(tempDir)
Exemple #2
0
def test_entity_equals():
    """Check that an entity equals itself but differs from separately created entities and from a relation."""

    def makeErlotinib(sourceID):
        # Every call builds a brand new entity with its own position list
        return kindred.Entity(entityType="drug",
                              text="Erlotinib",
                              position=[(0, 9)],
                              sourceEntityID=sourceID)

    unlabelled = makeErlotinib(None)
    labelled = makeErlotinib("T16")
    unlabelledTwin = makeErlotinib(None)

    causes = kindred.Relation(relationType="causes",
                              entityIDs=[1, 2],
                              argNames=None)

    # Comparisons between entities
    assert unlabelled == unlabelled
    assert unlabelled != labelled
    assert unlabelled != unlabelledTwin
    assert labelled == labelled
    assert labelled != unlabelledTwin

    # An entity is never equal to a relation
    for entity in (unlabelled, labelled, unlabelledTwin):
        assert entity != causes
Exemple #3
0
def test_document_init():
    """Check the string form of a document that was built with two entities and no relations."""
    disease = kindred.Entity('disease', 'Cancer', [(0, 6)], 'T1')
    gene = kindred.Entity('gene', 'ABCDE1', [(33, 39)], 'T2')

    document = kindred.Document("Cancer is caused by mutations in ABCDE1.",
                                [disease, gene])

    assert str(document) == "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] []>"
Exemple #4
0
def test_saveStandoffFile():
    """
    Save a one-document corpus in standoff format, validate every written
    .a2 file and check that loading the directory gives back the same
    entities and relation.
    """
    disease = kindred.Entity(entityType="disease",
                             text="colorectal cancer",
                             position=[(4, 21)],
                             sourceEntityID="T1")
    gene = kindred.Entity(entityType="gene",
                          text="APC",
                          position=[(49, 52)],
                          sourceEntityID="T2")
    causes = kindred.Relation(relationType="causes",
                              entities=[disease, gene],
                              argNames=['obj', 'subj'])
    original = kindred.Document(
        "The colorectal cancer was caused by mutations in APC",
        [disease, gene], [causes])
    corpus = kindred.Corpus()
    corpus.addDocument(original)

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        # Only the relation annotation files (.a2) are validated
        a2Names = [name for name in os.listdir(tempDir) if name.endswith('.a2')]
        for name in a2Names:
            checkRelationAnnotations(os.path.join(tempDir, name))

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1

    loadedDoc = loadedCorpus.documents[0]
    assert isinstance(loadedDoc, kindred.Document)

    entities = loadedDoc.entities
    relations = loadedDoc.relations

    # Index the loaded entities by their source ID to build the expected relation
    bySourceID = {}
    for loadedEntity in entities:
        bySourceID[loadedEntity.sourceEntityID] = loadedEntity

    assertEntity(entities[0],
                 expectedType='disease',
                 expectedText='colorectal cancer',
                 expectedPos=[(4, 21)],
                 expectedSourceEntityID="T1")
    assertEntity(entities[1],
                 expectedType='gene',
                 expectedText='APC',
                 expectedPos=[(49, 52)],
                 expectedSourceEntityID="T2")

    expectedRelation = kindred.Relation('causes',
                                        [bySourceID["T1"], bySourceID["T2"]],
                                        ['obj', 'subj'],
                                        sourceRelationID='R1')
    assert relations == [expectedRelation], "(%s) not as expected" % relations
Exemple #5
0
def test_document_init_withRel():
    """Check the string form of a document that was built with two entities and one relation."""
    disease = kindred.Entity('disease', 'Cancer', [(0, 6)], 'T1')
    gene = kindred.Entity('gene', 'ABCDE1', [(33, 39)], 'T2')
    causes = kindred.Relation('causes', [disease, gene], ['subj', 'obj'])

    document = kindred.Document("Cancer is caused by mutations in ABCDE1.",
                                [disease, gene], [causes])

    assert str(document) == "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] [<Relation causes [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] ['subj', 'obj']>]>"
Exemple #6
0
def test_relation_repr():
	"""Check repr() of a Relation, both without and with argument names."""
	mutation = kindred.Entity('mutation','BRAF V600E mutation',[])
	resistance = kindred.Entity('event','vemurafenib resistance',[])

	unnamed = kindred.Relation(relationType="causes",entities=[mutation,resistance],argNames=None)
	named = kindred.Relation(relationType="causes",entities=[mutation,resistance],argNames=["drug","disease"])

	assert repr(unnamed) == "<Relation causes [<Entity mutation:'BRAF V600E mutation' sourceid=None []>, <Entity event:'vemurafenib resistance' sourceid=None []>] None>"
	assert repr(named) == "<Relation causes [<Entity mutation:'BRAF V600E mutation' sourceid=None []>, <Entity event:'vemurafenib resistance' sourceid=None []>] ['drug', 'disease']>"
Exemple #7
0
def test_relation_repr():
	"""Check repr() of a CandidateRelation, both without and with known types and argument names."""
	mutation = kindred.Entity('mutation','BRAF V600E mutation',[])
	resistance = kindred.Entity('event','vemurafenib resistance',[])

	bare = kindred.CandidateRelation(entities=[mutation,resistance])
	typed = kindred.CandidateRelation(entities=[mutation,resistance],knownTypesAndArgNames=[("causes",["drug","disease"])])

	assert repr(bare) == "<CandidateRelation [<Entity mutation:'BRAF V600E mutation' sourceid=None []>, <Entity event:'vemurafenib resistance' sourceid=None []>] []>"
	assert repr(typed) == "<CandidateRelation [<Entity mutation:'BRAF V600E mutation' sourceid=None []>, <Entity event:'vemurafenib resistance' sourceid=None []>] [('causes', ['drug', 'disease'])]>"
Exemple #8
0
def test_relation_hash():
	"""Check that equally built CandidateRelations hash alike and that the known types affect the hash."""
	mutation = kindred.Entity('mutation','BRAF V600E mutation',[])
	resistance = kindred.Entity('event','vemurafenib resistance',[])

	emptyA = kindred.CandidateRelation(entities=[mutation,resistance],knownTypesAndArgNames=[])
	emptyB = kindred.CandidateRelation(entities=[mutation,resistance],knownTypesAndArgNames=[])
	typedA = kindred.CandidateRelation(entities=[mutation,resistance],knownTypesAndArgNames=[("causes",["drug","disease"])])
	typedB = kindred.CandidateRelation(entities=[mutation,resistance],knownTypesAndArgNames=[("causes",["drug","disease"])])

	# Same construction arguments give the same hash
	assert hash(emptyA) == hash(emptyB)
	assert hash(typedA) == hash(typedB)
	# Different known types give a different hash
	assert hash(emptyA) != hash(typedA)
Exemple #9
0
def test_entity_str():
    """Check str() of an entity without and with a source entity ID."""
    withoutSourceID = kindred.Entity(entityType="drug",
                                     text="Erlotinib",
                                     position=[(0, 9)],
                                     sourceEntityID=None)
    withSourceID = kindred.Entity(entityType="drug",
                                  text="Erlotinib",
                                  position=[(0, 9)],
                                  sourceEntityID="T16")

    assert str(withoutSourceID) == "<Entity drug:'Erlotinib' sourceid=None [(0, 9)]>"
    assert str(withSourceID) == "<Entity drug:'Erlotinib' sourceid=T16 [(0, 9)]>"
Exemple #10
0
def test_entity_repr():
    """Check that repr() of an entity includes its entityID and its source entity ID."""
    withoutSourceID = kindred.Entity(entityType="drug",
                                     text="Erlotinib",
                                     position=[(0, 9)],
                                     sourceEntityID=None)
    withSourceID = kindred.Entity(entityType="drug",
                                  text="Erlotinib",
                                  position=[(0, 9)],
                                  sourceEntityID="T16")

    # The expected strings are built from each entity's own entityID
    expectedWithout = "<Entity drug:'Erlotinib' id=%d sourceid=None [(0, 9)]>" % withoutSourceID.entityID
    expectedWith = "<Entity drug:'Erlotinib' id=%d sourceid=T16 [(0, 9)]>" % withSourceID.entityID

    assert repr(withoutSourceID) == expectedWithout
    assert repr(withSourceID) == expectedWith
Exemple #11
0
def test_sentence_addEntityWithLocations(capfd):
    """Check that addEntityAnnotation records (entity, token indices) pairs in insertion order."""
    text = 'lots of mutations cause dangerous cancer'
    tokens = []
    for word in text.split():
        tokens.append(kindred.Token(word, None, None, 0, 0))

    dependencies = [(2, 3, 'a'), (3, 5, 'b'), (4, 5, 'c')]
    sentence = kindred.Sentence(text, tokens, dependencies=dependencies)

    mutations = kindred.Entity('thingA', 'mutations', [(0, 1)])
    cancer = kindred.Entity('thingB', 'cancer', [(0, 1)])

    sentence.addEntityAnnotation(mutations, [2])
    sentence.addEntityAnnotation(cancer, [5])

    assert sentence.entityAnnotations == [(mutations, [2]), (cancer, [5])]
Exemple #12
0
def test_relation_hash():
	"""Check that Relation hashes agree for equal relations and match the expected tuple hashes."""
	mutation = kindred.Entity('mutation','BRAF V600E mutation',[])
	resistance = kindred.Entity('event','vemurafenib resistance',[])

	unnamedA = kindred.Relation(relationType="causes",entities=[mutation,resistance],argNames=None)
	unnamedB = kindred.Relation(relationType="causes",entities=[mutation,resistance],argNames=None)
	namedA = kindred.Relation(relationType="causes",entities=[mutation,resistance],argNames=["drug","disease"])
	namedB = kindred.Relation(relationType="causes",entities=[mutation,resistance],argNames=["drug","disease"])

	# Same construction arguments give the same hash; argument names change it
	assert hash(unnamedA) == hash(unnamedB)
	assert hash(namedA) == hash(namedB)
	assert hash(unnamedA) != hash(namedA)

	# The hash is expected to equal the hash of these tuples (argNames only when present)
	expectedUnnamed = (unnamedA.relationType,tuple(unnamedA.entities),unnamedA.probability)
	expectedNamed = (namedA.relationType,tuple(namedA.entities),tuple(namedA.argNames),namedA.probability)
	assert hash(unnamedA) == hash(expectedUnnamed)
	assert hash(namedA) == hash(expectedNamed)
Exemple #13
0
def test_sentence_str(capfd):
    """Check that repr() of a sentence is its plain text."""
    text = 'lots of mutations cause dangerous cancer'
    tokens = []
    for word in text.split():
        tokens.append(kindred.Token(word, None, None, 0, 0))

    mutations = kindred.Entity('thingA', 'mutations', [(0, 1)])
    cancer = kindred.Entity('thingB', 'cancer', [(0, 1)])

    sentence = kindred.Sentence(
        text,
        tokens,
        dependencies=[(2, 3, 'a'), (3, 5, 'b'), (4, 5, 'c')],
        entitiesWithLocations=[(mutations, [2]), (cancer, [5])])

    assert repr(sentence) == "lots of mutations cause dangerous cancer"
Exemple #14
0
def annotateStarAlleles(corpus):
    """
    Add 'Mutation' entities for star alleles (a '*' followed by a digit,
    e.g. "*2") found in the text directly after a 'Gene' entity.

    Several star alleles may follow one gene, separated by commas, "and",
    "or", "/", "+" or whitespace. Each new entity gets the metadata keys
    'conceptid' ('*' plus the allele name) and 'associated_gene' (the
    'conceptid' of the preceding gene). Documents are modified in place.

    :param corpus: Corpus whose documents are scanned and extended
    """
    # Raw string so that \s, \+, \* and \w reach the regex engine untouched
    # rather than being parsed as (invalid) string escape sequences.
    # Compiled once because the same pattern is applied for every gene.
    starPattern = re.compile(
        r'^(,|and|or|/|\s|\+)*(?P<main>\*\s*[0-9]([\w:]*\w+)?)')

    for doc in corpus.documents:

        # Snapshot of the genes, so entities added below are not revisited
        genes = [e for e in doc.entities if e.entityType == 'Gene']

        for gene in genes:
            geneEnd = gene.position[0][1]
            geneID = gene.metadata['conceptid']

            # Scanning starts right after the (first segment of the) gene mention
            offset = geneEnd
            star = starPattern.search(doc.text[offset:])
            while star:
                # End of the whole match (separators included), relative to offset
                length = star.end()

                startPos, endPos = star.span('main')
                text = star.group('main')

                sourceEntityID = getNextSourceEntityID(doc)
                # Drop the leading '*' and any whitespace around the allele name
                alleleName = text.strip()[1:].strip()

                conceptid = '*%s' % alleleName

                starAllele = kindred.Entity(
                    'Mutation',
                    text, [(offset + startPos, offset + endPos)],
                    sourceEntityID=sourceEntityID,
                    metadata={
                        'conceptid': conceptid,
                        'associated_gene': geneID
                    })
                doc.addEntity(starAllele)

                # Continue right after this match to pick up further alleles
                offset += length
                star = starPattern.search(doc.text[offset:])
Exemple #15
0
def test_relation_equals():
    """Check Relation equality: same type, IDs and argument names are equal; anything else is not."""

    def makeCauses(argNames):
        # Every call builds a new relation with its own entity ID list
        return kindred.Relation(relationType="causes",
                                entityIDs=[1, 2],
                                argNames=argNames)

    unnamedA = makeCauses(None)
    unnamedB = makeCauses(None)
    namedA = makeCauses(["drug", "disease"])
    namedB = makeCauses(["drug", "disease"])

    drug = kindred.Entity(entityType="drug",
                          text="Erlotinib",
                          position=[(0, 9)],
                          sourceEntityID=None)

    # Comparisons between relations
    assert unnamedA == unnamedB
    assert unnamedA != namedA
    assert unnamedA != namedB
    assert unnamedB != namedA
    assert namedA == namedB

    # A relation is never equal to an entity
    for relation in (unnamedA, unnamedB, namedA, namedB):
        assert relation != drug
Exemple #16
0
def test_entity_str_withExternalID():
    """Check that str() of an entity with an external ID shows the entityID, source ID and external ID."""
    withoutSourceID = kindred.Entity(entityType="drug",
                                     text="Erlotinib",
                                     position=[(0, 9)],
                                     sourceEntityID=None,
                                     externalID="id:1234")
    withSourceID = kindred.Entity(entityType="drug",
                                  text="Erlotinib",
                                  position=[(0, 9)],
                                  sourceEntityID="T16",
                                  externalID="id:9876")

    # The expected strings are built from each entity's own entityID
    expectedWithout = "<Entity drug:'Erlotinib' id=%d sourceid=None externalid=id:1234 [(0, 9)]>" % withoutSourceID.entityID
    expectedWith = "<Entity drug:'Erlotinib' id=%d sourceid=T16 externalid=id:9876 [(0, 9)]>" % withSourceID.entityID

    assert str(withoutSourceID) == expectedWithout
    assert str(withSourceID) == expectedWith
Exemple #17
0
def convertBiocDocToKindredDocs(document):
	"""
	Convert a BioC document into kindred documents, one per passage.

	Annotation offsets are made relative to the passage by subtracting the
	passage offset. Relation arguments are sorted by (role, refid) before the
	relation is built. Each kindred document's metadata combines the document
	infons, the passage infons and the BioC document id (key 'id').

	:param document: BioC document to convert
	:type document: bioc.BioCDocument
	:return: One kindred document per passage
	:rtype: list of kindred.Document
	"""
	assert isinstance(document,bioc.BioCDocument)
	kindredDocs = []
	for passage in document.passages:
		assert isinstance(passage,bioc.BioCPassage)
		
		text = passage.text
		offset = int(native(passage.offset))
		entities = []
		relations = []
		
		for a in passage.annotations:
			assert isinstance(a,bioc.BioCAnnotation)
			
			entityType = a.infons['type']
			sourceEntityID = a.id
			
			position = []
			segments = []
			
			for l in a.locations:
				assert isinstance(l,bioc.BioCLocation)
				# BioC offsets are absolute; kindred positions are relative to the passage text
				startPos = int(native(l.offset)) - offset
				endPos = startPos + int(native(l.length))
				position.append((startPos,endPos))
				segments.append(text[startPos:endPos])
			
			# An annotation with several locations gets its segments joined by spaces
			entityText = " ".join(segments)
			e = kindred.Entity(entityType,entityText,position,sourceEntityID)
			entities.append(e)

		sourceEntityIDToEntity = { entity.sourceEntityID:entity for entity in entities }
			
		for r in passage.relations:
			assert isinstance(r,bioc.BioCRelation)
			relationType = r.infons['type']
			
			arguments = []
			for n in r.nodes:
				assert isinstance(n,bioc.BioCNode)
				arguments.append((n.role,n.refid))
			arguments = sorted(arguments)
				
			argNames = [ argName for argName,sourceEntityID in arguments]
			sourceEntityIDs = [ sourceEntityID for argName,sourceEntityID in arguments]
			for sourceEntityID in sourceEntityIDs:
				assert sourceEntityID in sourceEntityIDToEntity, "Relation references entity %s which does not exist in BioC document id=%s" % (sourceEntityID,str(document.id))

			# Kept in its own variable: reusing `entities` here would replace the
			# passage's entity list with only the arguments of the last relation
			relationEntities = [ sourceEntityIDToEntity[sourceEntityID] for sourceEntityID in sourceEntityIDs ]
			
			relation = kindred.Relation(relationType,relationEntities,argNames)
			relations.append(relation)
		
		# Passage infons override document infons; 'id' is always the BioC document id
		metadata = dict(document.infons)
		metadata.update(passage.infons)
		metadata['id'] = document.id
		relData = kindred.Document(text,entities=entities,relations=relations,metadata=metadata)
		kindredDocs.append(relData)

	return kindredDocs
Exemple #18
0
    def annotate(self, corpus):
        """
        Annotate a parsed corpus with entities found by the wordlist lookup.

        For every term match reported by self._processWords, one entity per
        (entity type, external ID) pair is added to the document and to the
        sentence, covering the matched token range.

        :param corpus: Corpus to annotate
        :type corpus: kindred.Corpus
        """

        assert corpus.parsed == True, "Corpus must already be parsed before entity recognition"

        for doc in corpus.documents:
            for sentence in doc.sentences:
                sentenceTokens = sentence.tokens
                matches = self._processWords(
                    [token.word for token in sentenceTokens])

                for locs, _terms, termtypesAndids in matches:
                    # locs holds (first token index, index after the last token)
                    firstToken, afterLastToken = locs[0], locs[1]
                    charStart = sentence.tokens[firstToken].startPos
                    charEnd = sentence.tokens[afterLastToken - 1].endPos

                    # The entity text is taken from the document text, not the tokens
                    matchedText = doc.text[charStart:charEnd]
                    tokenIndices = list(range(firstToken, afterLastToken))

                    for entityType, externalID in termtypesAndids:
                        entity = kindred.Entity(entityType,
                                                matchedText,
                                                [(charStart, charEnd)],
                                                externalID=externalID)
                        doc.addEntity(entity)
                        sentence.addEntityWithLocation(entity, tokenIndices)
Exemple #19
0
    def annotate(self, corpus):
        """
        Annotate a parsed corpus with entities found by the wordlist lookup.

        For every term match reported by self._processWords, one entity per
        (entity type, external ID) pair is appended to the document's entity
        list and annotated on the sentence. New entities get source IDs of
        the form "T<n>", where n continues from the number of entities the
        document already has.

        :param corpus: Corpus to annotate
        :type corpus: kindred.Corpus
        """

        assert corpus.parsed == True, "Corpus must already be parsed before entity recognition"

        for doc in corpus.documents:
            # Running number of entities in this document; drives the "T<n>" IDs
            numbered = len(doc.entities)

            for sentence in doc.sentences:
                for locs, _terms, termtypesAndids in self._processWords(sentence):
                    # locs holds (first token index, index after the last token)
                    firstToken, afterLastToken = locs[0], locs[1]
                    charStart = sentence.tokens[firstToken].startPos
                    charEnd = sentence.tokens[afterLastToken - 1].endPos

                    matchedText = doc.text[charStart:charEnd]
                    tokenIndices = list(range(firstToken, afterLastToken))

                    for entityType, externalID in termtypesAndids:
                        numbered += 1
                        entity = kindred.Entity(entityType,
                                                matchedText,
                                                [(charStart, charEnd)],
                                                externalID=externalID,
                                                sourceEntityID="T%d" % numbered)
                        # Appended directly to the list instead of calling doc.addEntity
                        doc.entities.append(entity)
                        sentence.addEntityAnnotation(entity, tokenIndices)
Exemple #20
0
    def annotate(self, corpus):
        """
        Annotate a corpus for numerical values

        Every token whose word satisfies isNumber() becomes a 'quantity'
        entity covering that single token. New entities get source IDs of
        the form "T<n>", where n continues from the number of entities the
        document already has.

        :param corpus: Corpus to annotate
        :type corpus: kindred.Corpus
        """

        assert corpus.parsed == True, "Corpus must already be parsed before entity recognition"

        for doc in corpus.documents:
            # Running number of entities in this document; drives the "T<n>" IDs
            entityCount = len(doc.entities)
            for sentence in doc.sentences:
                for i, t in enumerate(sentence.tokens):
                    if not isNumber(t.word):
                        continue

                    sourceEntityID = "T%d" % (entityCount + 1)
                    # The entity text is taken from the document text, not the token word
                    text = doc.text[t.startPos:t.endPos]
                    loc = [i]

                    e = kindred.Entity('quantity',
                                       text, [(t.startPos, t.endPos)],
                                       sourceEntityID=sourceEntityID)
                    doc.addEntity(e)
                    sentence.addEntityAnnotation(e, loc)
                    entityCount += 1
Exemple #21
0
def parseSimpleTag_helper(node,currentPosition=0,ignoreEntities=[]):
	"""
	Recursively flatten a DOM node into plain text, entities and relation tuples.

	A child element named 'relation' becomes a relation tuple; any other
	child element becomes an entity whose type is the tag name and whose text
	is the text found inside the tag. Nested elements are processed recursively.

	:param node: DOM node whose child nodes are processed
	:param currentPosition: Offset of this node's text within the overall text
	:param ignoreEntities: Entity types (tag names) for which no entity is created. The list is only read, never modified.
	:return: Tuple (text, entities, relationTuples); each relation tuple is (relationType, sourceEntityIDs, argNames)
	:raises AssertionError: If an entity tag contains no text
	"""
	text,entities,relationTuples = '',[],[]
	for s in node.childNodes:
		if s.nodeType == s.ELEMENT_NODE:
			# ignoreEntities is passed down so that ignored types are also skipped
			# when they are nested inside another tag
			insideText,insideEntities,insideRelationTuples = parseSimpleTag_helper(s,currentPosition+len(text),ignoreEntities)

			if s.tagName == 'relation':
				relationType = s.getAttribute('type')
				# Every attribute except 'type' maps an argument name to a source entity ID
				arguments = [ (argName,entityID) for argName,entityID in s.attributes.items() if argName != 'type' ]
				arguments = sorted(arguments)
				
				sourceEntityIDs = [ sourceEntityID for argName,sourceEntityID in arguments]
				argNames = [ argName for argName,sourceEntityID in arguments]

				relationTuple = (relationType,sourceEntityIDs,argNames)
				relationTuples.append(relationTuple)
			else: # Entity
				entityType = s.tagName
				sourceEntityID = s.getAttribute('id')
				# The entity spans the text inside the tag, placed after the text gathered so far
				position = [(currentPosition+len(text),currentPosition+len(text)+len(insideText))]

				assert len(insideText) > 0, "Name (text inside tags) is empty for entity of type %s" % entityType

				if not entityType in ignoreEntities:
					e = kindred.Entity(entityType,insideText,position,sourceEntityID=sourceEntityID)
					entities.append(e)
				
			text += insideText
			entities += insideEntities
			relationTuples += insideRelationTuples
		elif s.nodeType == s.TEXT_NODE:
			text += s.nodeValue
			
	return text,entities,relationTuples
Exemple #22
0
def loadEntity(filename,line,text):
	"""
	Parse one entity ('T') line of a standoff annotation file into a kindred Entity.

	The line has three tab-separated fields: the entity ID, the type followed
	by one or more ';'-separated "start end" coordinate pairs, and the entity
	text. The entity text must match the text found at the coordinates
	(compared after collapsing runs of whitespace).

	:param filename: Name of the annotation file (only used in error messages)
	:param line: The annotation line to parse
	:param text: Text of the document that the coordinates refer to
	:return: The parsed entity
	:rtype: kindred.Entity
	:raises AssertionError: If the line is malformed or its text does not match the coordinates
	"""
	assert line[0] == 'T', "ERROR in %s. Entity input should start with a T" % filename
	split = line.strip().split('\t')
	assert len(split) == 3, "ERROR in %s" % filename
	entityID = split[0]
	typeInfo = split[1]
	tokens = split[2]
		
	textChunks = []
	# The type name is everything before the first space; the coordinates follow
	typeSpacePos = typeInfo.index(' ')
	typeName = typeInfo[:typeSpacePos]
	positionText = typeInfo[typeSpacePos:]
	positions = []
	for coordinates in positionText.strip().split(';'):
		a,b = coordinates.strip().split(' ')
		a,b = int(a.strip()),int(b.strip())
		textChunk = text[a:b].replace('\n',' ').strip()
		textChunks.append(textChunk)
		positions.append((a,b))
		
	# Check that the tokens match up to the text
	chunkTest = " ".join(textChunks)
	tokensTest = tokens
	chunkTest = re.sub(r'\s\s+', ' ', chunkTest)
	tokensTest = re.sub(r'\s\s+', ' ', tokensTest)
	chunkTest = chunkTest.strip()
	tokensTest = tokensTest.strip()

	if chunkTest != tokensTest:
		# Built with str formatting only: concatenating the result of .encode()
		# (bytes) with str raises TypeError on Python 3 and would hide this error
		asciiTokens = tokens.encode('ascii', 'ignore').decode('ascii')
		raise AssertionError("ERROR in %s. For id=%s, tokens '%s' don't match up with positions: %s" % (filename, entityID, asciiTokens, str(positions)))
	
	entity = kindred.Entity(typeName, tokensTest, positions, entityID)

	return entity
Exemple #23
0
def test_document_addEntity():
    """Check that an entity added with addEntity shows up in the document's string form."""
    document = kindred.Document("Cancer is caused by mutations in ABCDE1.", [])

    disease = kindred.Entity('disease', 'Cancer', [(0, 6)], 'T1')
    document.addEntity(disease)

    assert str(document) == "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>] []>"
Exemple #24
0
def test_relation_triple_equals():
	"""Check CandidateRelation equality for relations over three entities."""
	mutation = kindred.Entity('mutation','BRAF V600E mutation',[])
	resistance = kindred.Entity('event','vemurafenib resistance',[])
	citation = kindred.Entity('citation','28028924',[])

	bareA = kindred.CandidateRelation(entities=[mutation,resistance,citation])
	bareB = kindred.CandidateRelation(entities=[mutation,resistance,citation])
	typedA = kindred.CandidateRelation(entities=[mutation,resistance,citation],knownTypesAndArgNames=[("causes",["whathappened","whatdiditcause","citation"])])
	typedB = kindred.CandidateRelation(entities=[mutation,resistance,citation],knownTypesAndArgNames=[("causes",["whathappened","whatdiditcause","citation"])])

	# Comparisons between candidate relations
	assert bareA == bareB
	assert bareA != typedA
	assert bareA != typedB
	assert bareB != typedA
	assert typedA == typedB

	# A candidate relation is never equal to an entity
	for candidate in (bareA, bareB, typedA, typedB):
		assert candidate != mutation
Exemple #25
0
def test_relation_equals():
	"""Check CandidateRelation equality for relations over two entities."""
	mutation = kindred.Entity('mutation','BRAF V600E mutation',[])
	resistance = kindred.Entity('event','vemurafenib resistance',[])

	bareA = kindred.CandidateRelation(entities=[mutation,resistance])
	bareB = kindred.CandidateRelation(entities=[mutation,resistance])
	typedA = kindred.CandidateRelation(entities=[mutation,resistance],knownTypesAndArgNames=[("causes",["drug","disease"])])
	typedB = kindred.CandidateRelation(entities=[mutation,resistance],knownTypesAndArgNames=[("causes",["drug","disease"])])

	# A separate entity that is not part of any of the candidate relations
	unrelatedDrug = kindred.Entity(entityType="drug",text="Erlotinib",position=[(0,9)],sourceEntityID=None)

	# Comparisons between candidate relations
	assert bareA == bareB
	assert bareA != typedA
	assert bareA != typedB
	assert bareB != typedA
	assert typedA == typedB

	# A candidate relation is never equal to an entity
	for candidate in (bareA, bareB, typedA, typedB):
		assert candidate != unrelatedDrug
Exemple #26
0
def parseJSON(data, ignoreEntities=[]):
    """
    Build a kindred Document from a dictionary with 'text' and optional
    'denotations' (entities) and 'relations' entries.

    :param data: Dictionary to convert
    :param ignoreEntities: Entity types for which no entity is created
    :return: Document with the text, entities and relations found in data
    :rtype: kindred.Document
    :raises AssertionError: If data has a key that is not one of the expected fields
    """
    text = data['text']

    entities = []
    for denotation in data.get('denotations', []) if 'denotations' in data else []:
        # The 'id' key is optional on a denotation
        sourceEntityID = denotation['id'] if 'id' in denotation else None

        entityType = denotation['obj']
        span = denotation['span']
        begin, end = span['begin'], span['end']

        if entityType in ignoreEntities:
            continue
        entities.append(
            kindred.Entity(entityType,
                           text[begin:end],
                           [(begin, end)],
                           sourceEntityID=sourceEntityID))

    relations = []
    if 'relations' in data:
        for relationData in data['relations']:
            objID = relationData['obj']
            relationType = relationData['pred']
            subjID = relationData['subj']

            relations.append(
                kindred.Relation(relationType=relationType,
                                 entityIDs=[objID, subjID],
                                 argNames=['obj', 'subj']))

    # Any key outside this list is rejected
    knownFields = [
        'denotations', 'divid', 'modifications', 'namespaces', 'project',
        'relations', 'sourcedb', 'sourceid', 'target', 'text', 'tracks'
    ]
    unknownFields = [key for key in data.keys() if key not in knownFields]
    assert len(unknownFields) == 0, "Found additional unexpected fields (%s) in JSON" % (",".join(unknownFields))

    return kindred.Document(text, entities=entities, relations=relations)
Exemple #27
0
def parseJSON(data,ignoreEntities=[]):
	"""
	Build a kindred Document from a dictionary (or a list holding exactly one
	dictionary) with 'text' and optional 'denotations' (entities) and
	'relations' entries.

	:param data: Dictionary, or one-element list containing a dictionary, to convert
	:param ignoreEntities: Entity types for which no entity is created
	:return: Document with the text, entities and relations found in data
	:rtype: kindred.Document
	:raises AssertionError: If data has the wrong shape or a key that is not one of the expected fields
	"""
	wrongShape = "JSON loading expects a dictionary or a list with one dictionary in it"
	if isinstance(data,list):
		# A list is accepted only as a wrapper around a single dictionary
		assert len(data) == 1 and isinstance(data[0],dict), wrongShape
		data = data[0]
	assert isinstance(data,dict), wrongShape

	text = data['text']

	entities = []
	if 'denotations' in data:
		for denotation in data['denotations']:
			# The 'id' key is optional on a denotation
			sourceEntityID = denotation['id'] if 'id' in denotation else None

			entityType = denotation['obj']
			span = denotation['span']
			begin,end = span['begin'],span['end']

			if entityType in ignoreEntities:
				continue
			entities.append(kindred.Entity(entityType,text[begin:end],[(begin,end)],sourceEntityID=sourceEntityID))

	# Relations refer to entities by source ID
	bySourceID = {}
	for entity in entities:
		bySourceID[entity.sourceEntityID] = entity

	relations = []
	if 'relations' in data:
		for relationData in data['relations']:
			objID = relationData['obj']
			relationType = relationData['pred']
			subjID = relationData['subj']

			members = [ bySourceID[sourceID] for sourceID in [objID,subjID] ]
			relations.append(kindred.Relation(relationType,members,['obj','subj']))

	# Any key outside this list is rejected
	knownFields = ['denotations','divid','modifications','namespaces','project','relations','sourcedb','sourceid','target','text','tracks']
	unknownFields = [ key for key in data.keys() if key not in knownFields ]
	assert len(unknownFields) == 0, "Found additional unexpected fields (%s) in JSON" % (",".join(unknownFields))

	return kindred.Document(text,entities=entities,relations=relations)
Exemple #28
0
    def splitIntoSentences(self):
        """
        Create a new corpus with one document for each sentence in this document.

        Entity and token positions in each new document are shifted by the
        start position of the sentence's first token. Only the first position
        segment of each entity is carried over. A relation is copied into a
        sentence document when all of its entities are annotated in that sentence.

        :return: Corpus with one document per sentence
        :rtype: kindred.Corpus
        """

        splitCorpus = kindred.Corpus()

        for sentence in self.sentences:
            shift = sentence.tokens[0].startPos

            originals = [entity for entity, _ in sentence.entityAnnotations]

            # Maps every original entity to its copy with sentence-relative position
            shifted = OrderedDict()
            for original in originals:
                begin, end = original.position[0]
                shifted[original] = kindred.Entity(original.entityType,
                                                   original.text,
                                                   [(begin - shift, end - shift)],
                                                   original.sourceEntityID,
                                                   original.externalID)

            # Keep the relations that lie entirely within this sentence
            containedRelations = []
            for relation in self.relations:
                if all(member in originals for member in relation.entities):
                    containedRelations.append(relation)

            copiedRelations = []
            for relation in containedRelations:
                members = [shifted[member] for member in relation.entities]
                copiedRelations.append(
                    kindred.Relation(relation.relationType, members,
                                     relation.argNames, relation.probability))

            sentenceDoc = kindred.Document(sentence.text,
                                           list(shifted.values()),
                                           copiedRelations)

            copiedTokens = []
            for token in sentence.tokens:
                copiedTokens.append(
                    kindred.Token(token.word, token.lemma, token.partofspeech,
                                  token.startPos - shift,
                                  token.endPos - shift))

            copiedSentence = kindred.Sentence(sentence.text, copiedTokens,
                                              sentence.dependencies,
                                              sentence.sourceFilename)
            # Token indices are unchanged; only the entity objects are swapped for their copies
            copiedSentence.entityAnnotations = [
                (shifted[entity], tokenIndices)
                for entity, tokenIndices in sentence.entityAnnotations
            ]
            sentenceDoc.sentences = [copiedSentence]

            splitCorpus.addDocument(sentenceDoc)

        return splitCorpus
Exemple #29
0
def convertBiocDocToKindredDocs(document):
    """
    Convert a BioC document into kindred documents, one per passage.

    Annotation offsets are made relative to the passage by subtracting the
    passage offset. Each entity's metadata is the annotation's infons without
    the 'type' key. Relation arguments are sorted by (role, refid) before the
    relation is built. Each kindred document's metadata combines the document
    infons, the passage infons and the BioC document id (key 'id').

    The BioC document passed in is not modified.

    :param document: BioC document to convert
    :type document: bioc.BioCDocument
    :return: One kindred document per passage
    :rtype: list of kindred.Document
    :raises AssertionError: If an annotation lies outside the passage text, its text does not match its offsets, or a relation references an unknown entity
    """
    assert isinstance(document, bioc.BioCDocument)
    kindredDocs = []
    for passage in document.passages:
        assert isinstance(passage, bioc.BioCPassage)

        text = passage.text
        offset = int(native(passage.offset))
        entities = []
        relations = []

        for a in passage.annotations:
            assert isinstance(a, bioc.BioCAnnotation)

            entityType = a.infons['type']
            sourceEntityID = a.id

            # Copy without 'type' instead of deleting from a.infons, which would
            # alter the caller's document and break a second conversion of it
            entityMetadata = {
                key: value
                for key, value in a.infons.items() if key != 'type'
            }

            position = []
            segments = []

            for l in a.locations:
                assert isinstance(l, bioc.BioCLocation)
                # BioC offsets are absolute; kindred positions are relative to the passage text
                startPos = int(native(l.offset)) - offset
                endPos = startPos + int(native(l.length))

                assert 0 <= startPos <= len(text) and 0 <= endPos <= len(
                    text
                ), "Entity offsets (offset=%s,length=%s) are outside the span of the text (%s)" % (
                    str(l.offset), str(l.length), passage.text)

                position.append((startPos, endPos))
                segments.append(text[startPos:endPos])

            # An annotation with several locations gets its segments joined by spaces
            entityText = " ".join(segments)

            assert entityText == a.text, "Mismatch in entity annotation between expected text (%s) and extracted text (%s) using offset info for passage with text: %s" % (
                a.text, entityText, text)

            e = kindred.Entity(entityType,
                               entityText,
                               position,
                               sourceEntityID,
                               metadata=entityMetadata)
            entities.append(e)

        sourceEntityIDToEntity = {
            entity.sourceEntityID: entity
            for entity in entities
        }

        for r in passage.relations:
            assert isinstance(r, bioc.BioCRelation)
            relationType = r.infons['type']

            arguments = []
            for n in r.nodes:
                assert isinstance(n, bioc.BioCNode)
                arguments.append((n.role, n.refid))
            arguments = sorted(arguments)

            argNames = [argName for argName, sourceEntityID in arguments]
            sourceEntityIDs = [
                sourceEntityID for argName, sourceEntityID in arguments
            ]
            for sourceEntityID in sourceEntityIDs:
                assert sourceEntityID in sourceEntityIDToEntity, "Relation references entity %s which does not exist in BioC document id=%s" % (
                    sourceEntityID, str(document.id))

            # Kept in its own variable: reusing `entities` here would replace the
            # passage's entity list with only the arguments of the last relation
            relationEntities = [
                sourceEntityIDToEntity[sourceEntityID]
                for sourceEntityID in sourceEntityIDs
            ]

            relation = kindred.Relation(relationType, relationEntities,
                                        argNames)
            relations.append(relation)

        # Passage infons override document infons; 'id' is always the BioC document id
        metadata = dict(document.infons)
        metadata.update(passage.infons)
        metadata['id'] = document.id
        relData = kindred.Document(text,
                                   entities=entities,
                                   relations=relations,
                                   metadata=metadata)
        kindredDocs.append(relData)

    return kindredDocs
Exemple #30
0
                entityTypes = set([
                    entity.entityType
                    for entity, tokenIndices in sentence.entityAnnotations
                ])
                entityInfo = [(e.entityType, e.text)
                              for e, tokenIndices in sentence.entityAnnotations
                              ]

                hasMutation = "Mutation" in entityTypes
                hasChemical = "Chemical" in entityTypes

                if hasMutation and hasChemical:
                    sentenceStart = sentence.tokens[0].startPos

                    sentenceEntities = [
                        kindred.Entity(e.entityType,
                                       e.text,
                                       [(e.position[0][0] - sentenceStart,
                                         e.position[0][1] - sentenceStart)],
                                       e.sourceEntityID,
                                       e.externalID,
                                       metadata=e.metadata)
                        for e, _ in sentence.entityAnnotations
                    ]
                    newDoc = kindred.Document(sentence.text,
                                              sentenceEntities,
                                              metadata=doc.metadata)
                    sentenceCorpus.addDocument(newDoc)

    kindred.save(sentenceCorpus, 'biocxml', args.outBioc)