import kindred


def test_token_repr():
    t = kindred.Token(word="hat", lemma="hat", partofspeech="NN", startPos=0, endPos=3)
    assert t.__repr__() == "hat"
def test_token_str():
    t = kindred.Token(word="hat", lemma="hat", partofspeech="NN", startPos=0, endPos=3)
    assert str(t) == "hat"
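# A minimal Token sketch consistent with the two tests above. This is an
# illustrative assumption about kindred's internals, not its actual code:
# the tests only require that str() and repr() both return the raw word.
class _TokenSketch:
    def __init__(self, word, lemma, partofspeech, startPos, endPos):
        self.word = word
        self.lemma = lemma
        self.partofspeech = partofspeech
        self.startPos = startPos
        self.endPos = endPos

    def __str__(self):
        # Both string conversions yield the token's surface form
        return self.word

    def __repr__(self):
        return self.__str__()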
def test_sentence_noDependencyInfo(capfd):
    text = 'mutations cause dangerous cancer'
    tokens = [kindred.Token(w, None, None, 0, 0) for w in text.split()]
    s = kindred.Sentence(text, tokens, dependencies=[])

    nodes, edges = s.extractMinSubgraphContainingNodes([0, 2])

    out, err = capfd.readouterr()
    assert err.strip() == 'WARNING. 2 node(s) not found in dependency graph!'
    assert nodes == set()
    assert edges == set()
def test_sentence_workingDependencyPath(capfd):
    text = 'lots of mutations cause dangerous cancer'
    tokens = [kindred.Token(w, None, None, 0, 0) for w in text.split()]
    s = kindred.Sentence(text, tokens, dependencies=[(2, 3, 'a'), (3, 5, 'b'), (4, 5, 'c')])

    nodes, edges = s.extractMinSubgraphContainingNodes([2, 5])

    assert nodes == set([2, 3, 5])
    assert edges == set([(2, 3, 'a'), (3, 5, 'b')])
def test_sentence_brokenDependencyPath(capfd):
    text = 'mutations cause dangerous cancer'
    tokens = [kindred.Token(w, None, None, 0, 0) for w in text.split()]
    s = kindred.Sentence(text, tokens, dependencies=[(0, 1, 'a'), (2, 3, 'b')])

    nodes, edges = s.extractMinSubgraphContainingNodes([0, 2])

    out, err = capfd.readouterr()
    assert err.strip() == 'WARNING. No path found between nodes 0 and 2!'
    assert nodes == set()
    assert edges == set()
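# The three tests above pin down the contract of
# Sentence.extractMinSubgraphContainingNodes: return the nodes and edges of a
# minimal subgraph of the dependency graph connecting the requested token
# indices, or write a warning to stderr and return empty sets when a node is
# missing or no path exists. A sketch of that behavior using networkx (the
# function name and the networkx dependency are assumptions for illustration,
# not necessarily what kindred uses internally):
import itertools
import sys

import networkx as nx


def extractMinSubgraphSketch(dependencies, wantedNodes):
    # Build an undirected graph from (head, child, dependencyType) triples
    graph = nx.Graph()
    for a, b, depType in dependencies:
        graph.add_edge(a, b, depType=depType)

    missing = [n for n in wantedNodes if n not in graph]
    if missing:
        sys.stderr.write("WARNING. %d node(s) not found in dependency graph!\n" % len(missing))
        return set(), set()

    nodes, edges = set(), set()
    for a, b in itertools.combinations(wantedNodes, 2):
        try:
            path = nx.shortest_path(graph, a, b)
        except nx.NetworkXNoPath:
            sys.stderr.write("WARNING. No path found between nodes %d and %d!\n" % (a, b))
            return set(), set()
        nodes.update(path)
        for u, v in zip(path, path[1:]):
            # Store edges in (smaller, larger) index order, matching the fixtures
            edges.add((min(u, v), max(u, v), graph[u][v]['depType']))
    return nodes, edges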
def test_sentence_str(capfd):
    text = 'lots of mutations cause dangerous cancer'
    tokens = [kindred.Token(w, None, None, 0, 0) for w in text.split()]

    e1 = kindred.Entity('thingA', 'mutations', [(0, 1)])
    e2 = kindred.Entity('thingB', 'cancer', [(0, 1)])
    entitiesWithLocations = [(e1, [2]), (e2, [5])]

    s = kindred.Sentence(text, tokens, dependencies=[(2, 3, 'a'), (3, 5, 'b'), (4, 5, 'c')],
                         entitiesWithLocations=entitiesWithLocations)

    assert s.__repr__() == "lots of mutations cause dangerous cancer"
def test_sentence_addEntityWithLocations(capfd):
    text = 'lots of mutations cause dangerous cancer'
    tokens = [kindred.Token(w, None, None, 0, 0) for w in text.split()]
    s = kindred.Sentence(text, tokens, dependencies=[(2, 3, 'a'), (3, 5, 'b'), (4, 5, 'c')])

    e1 = kindred.Entity('thingA', 'mutations', [(0, 1)])
    e2 = kindred.Entity('thingB', 'cancer', [(0, 1)])

    s.addEntityAnnotation(e1, [2])
    s.addEntityAnnotation(e2, [5])

    assert s.entityAnnotations == [(e1, [2]), (e2, [5])]
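# As exercised above, addEntityAnnotation appends (entity, tokenIndices)
# pairs to the sentence's entityAnnotations list, and repr() returns the
# sentence text. A minimal sketch of that contract (an assumption for
# illustration, not kindred's actual class):
class _SentenceSketch:
    def __init__(self, text, tokens, dependencies, sourceFilename=None):
        self.text = text
        self.tokens = tokens
        self.dependencies = dependencies
        self.sourceFilename = sourceFilename
        self.entityAnnotations = []

    def addEntityAnnotation(self, entity, tokenIndices):
        # Record which token indices the entity covers within this sentence
        self.entityAnnotations.append((entity, tokenIndices))

    def __repr__(self):
        # test_sentence_str expects the raw sentence text back
        return self.text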
def parse(self, corpus):
    """
    Parse the corpus. Each document will be split into sentences which are
    then tokenized and parsed for their dependency graph. All parsed
    information is stored within the corpus object.

    :param corpus: Corpus to parse
    :type corpus: kindred.Corpus
    """
    assert isinstance(corpus, kindred.Corpus)

    # Ignore DeprecationWarning from SortedDict which is inside IntervalTree
    import warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    for d in corpus.documents:
        entityIDsToEntities = {entity.entityID: entity for entity in d.entities}

        # Build an interval tree over the character spans of the known
        # entities so tokens can be mapped back to the entities they overlap
        denotationTree = IntervalTree()
        entityTypeLookup = {}
        for e in d.entities:
            entityTypeLookup[e.entityID] = e.entityType
            for a, b in e.position:
                if b > a:
                    denotationTree[a:b] = e.entityID

        for sentence in self._sentencesGenerator(d.text):
            tokens = []
            for t in sentence:
                token = kindred.Token(t.text, t.lemma_, t.pos_, t.idx, t.idx + len(t.text))
                tokens.append(token)

            sentenceStart = tokens[0].startPos
            sentenceEnd = tokens[-1].endPos
            sentenceTxt = d.text[sentenceStart:sentenceEnd]

            # Dependency edges use token indices relative to the sentence start
            indexOffset = sentence[0].i
            dependencies = []
            for t in sentence:
                depName = t.dep_
                dep = (t.head.i - indexOffset, t.i - indexOffset, depName)
                dependencies.append(dep)

            # Map each entity ID to the indices of the tokens it overlaps
            entityIDsToTokenLocs = defaultdict(list)
            for i, t in enumerate(tokens):
                entitiesOverlappingWithToken = denotationTree[t.startPos:t.endPos]
                for interval in entitiesOverlappingWithToken:
                    entityID = interval.data
                    entityIDsToTokenLocs[entityID].append(i)

            sentence = kindred.Sentence(sentenceTxt, tokens, dependencies, d.sourceFilename)

            # Let's gather up the information about the "known" entities in the sentence
            for entityID, entityLocs in sorted(entityIDsToTokenLocs.items()):
                # Get the entity associated with this ID
                e = entityIDsToEntities[entityID]
                sentence.addEntityAnnotation(e, entityLocs)

            d.addSentence(sentence)

    corpus.parsed = True
def parse(self, corpus):
    """
    Parse the corpus. Each document will be split into sentences which are
    then tokenized and parsed for their dependency graph. All parsed
    information is stored within the corpus object.

    :param corpus: Corpus to parse
    :type corpus: kindred.Corpus
    """
    assert isinstance(corpus, kindred.Corpus)

    for d in corpus.documents:
        #for doctokens in self.nlp.pipe(d.text for d in corpus.documents, batch_size=2, n_threads=1):
        entityIDsToEntities = d.getEntityIDsToEntities()

        # Build an interval tree over the character spans of the known
        # entities so tokens can be mapped back to the entities they overlap
        denotationTree = IntervalTree()
        entityTypeLookup = {}
        for e in d.getEntities():
            entityTypeLookup[e.entityID] = e.entityType
            for a, b in e.position:
                if b > a:
                    denotationTree[a:b] = e.entityID

        for sentence in self._sentencesGenerator(d.text):
            tokens = []
            for t in sentence:
                token = kindred.Token(t.text, t.lemma_, t.pos_, t.idx, t.idx + len(t.text))
                tokens.append(token)

            sentenceStart = tokens[0].startPos
            sentenceEnd = tokens[-1].endPos
            sentenceTxt = d.text[sentenceStart:sentenceEnd]

            # Dependency edges use token indices relative to the sentence start
            indexOffset = sentence[0].i
            dependencies = []
            for t in sentence:
                depName = t.dep_
                dep = (t.head.i - indexOffset, t.i - indexOffset, depName)
                dependencies.append(dep)

            # TODO: Should I filter this more or just leave it for simplicity
            entityIDsToTokenLocs = defaultdict(list)
            for i, t in enumerate(tokens):
                entitiesOverlappingWithToken = denotationTree[t.startPos:t.endPos]
                for interval in entitiesOverlappingWithToken:
                    entityID = interval.data
                    entityIDsToTokenLocs[entityID].append(i)

            # Let's gather up the information about the "known" entities in the sentence
            entitiesWithLocations = []
            for entityID, entityLocs in sorted(entityIDsToTokenLocs.items()):
                e = entityIDsToEntities[entityID]
                entityWithLocation = (e, entityLocs)
                entitiesWithLocations.append(entityWithLocation)

            sentence = kindred.Sentence(sentenceTxt, tokens, dependencies, entitiesWithLocations, d.getSourceFilename())
            d.addSentence(sentence)

    corpus.parsed = True
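# Hypothetical end-to-end usage of parse(), assuming the method lives on
# kindred's Parser class (Parser/Corpus/Document follow kindred's public API;
# the exact constructor arguments shown here are assumptions):
import kindred

corpus = kindred.Corpus()
corpus.addDocument(kindred.Document('Mutations in BRCA1 cause cancer.'))

parser = kindred.Parser()
parser.parse(corpus)

for doc in corpus.documents:
    for sentence in doc.sentences:
        # Each sentence now carries tokens, dependency edges and entity annotations
        print(sentence.text, [t.word for t in sentence.tokens])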
def splitIntoSentences(self):
    """
    Create a new corpus with one document for each sentence in this document.

    :return: Corpus with one document per sentence
    :rtype: kindred.Corpus
    """
    sentenceCorpus = kindred.Corpus()

    for sentence in self.sentences:
        sentenceStart = sentence.tokens[0].startPos

        entitiesInSentence = [entity for entity, tokenIndices in sentence.entityAnnotations]

        # Clone each entity, shifting its character offsets so they are
        # relative to the sentence start rather than the document start
        entityMap = OrderedDict()
        for e in entitiesInSentence:
            startPos, endPos = e.position[0]
            newPosition = [(startPos - sentenceStart, endPos - sentenceStart)]
            newE = kindred.Entity(e.entityType, e.text, newPosition, e.sourceEntityID, e.externalID)
            entityMap[e] = newE

        # Keep only relations whose entities all fall inside this sentence
        relationsInSentence = [r for r in self.relations if all(e in entitiesInSentence for e in r.entities)]

        newRelationsInSentence = []
        for r in relationsInSentence:
            newEntitiesInRelation = [entityMap[e] for e in r.entities]
            newRelation = kindred.Relation(r.relationType, newEntitiesInRelation, r.argNames, r.probability)
            newRelationsInSentence.append(newRelation)

        newEntitiesInSentence = list(entityMap.values())
        doc = kindred.Document(sentence.text, newEntitiesInSentence, newRelationsInSentence)

        # Shift token offsets to be sentence-relative as well
        newTokens = [kindred.Token(t.word, t.lemma, t.partofspeech, t.startPos - sentenceStart, t.endPos - sentenceStart) for t in sentence.tokens]

        newSentence = kindred.Sentence(sentence.text, newTokens, sentence.dependencies, sentence.sourceFilename)
        newEntityAnnotations = [(entityMap[e], tokenIndices) for e, tokenIndices in sentence.entityAnnotations]
        newSentence.entityAnnotations = newEntityAnnotations

        doc.sentences = [newSentence]
        sentenceCorpus.addDocument(doc)

    return sentenceCorpus
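# Hypothetical usage of splitIntoSentences(), continuing from the parsed
# corpus above. The document must be parsed first so that self.sentences is
# populated; beyond that, the calls below are assumptions for illustration:
doc = corpus.documents[0]
sentenceCorpus = doc.splitIntoSentences()
# One document per sentence, with entity and token offsets rebased per sentence
print(len(sentenceCorpus.documents))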