Example 1
def test_token_repr():
    t = kindred.Token(word="hat",
                      lemma="hat",
                      partofspeech="NN",
                      startPos=0,
                      endPos=3)

    assert repr(t) == "hat"
Example 2
def test_token_str():
    t = kindred.Token(word="hat",
                      lemma="hat",
                      partofspeech="NN",
                      startPos=0,
                      endPos=3)

    assert str(t) == "hat"
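Both tests above pin down the same behavior: converting a token to a string yields its surface word. A minimal sketch of a Token class consistent with these tests (an assumption; the real kindred.Token may carry more state) could look like this:

class Token:
    def __init__(self, word, lemma, partofspeech, startPos, endPos):
        self.word = word                  # surface form, e.g. "hat"
        self.lemma = lemma                # lemmatized form
        self.partofspeech = partofspeech  # part-of-speech tag, e.g. "NN"
        self.startPos = startPos          # start character offset in the document
        self.endPos = endPos              # end character offset (exclusive)

    def __str__(self):
        # str() and repr() both render the surface word, as the tests assert
        return self.word

    __repr__ = __str__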
Example 3
def test_sentence_noDependencyInfo(capfd):
    text = 'mutations cause dangerous cancer'
    tokens = [kindred.Token(w, None, None, 0, 0) for w in text.split()]

    s = kindred.Sentence(text, tokens, dependencies=[])

    nodes, edges = s.extractMinSubgraphContainingNodes([0, 2])
    out, err = capfd.readouterr()
    assert err.strip() == 'WARNING. 2 node(s) not found in dependency graph!'
    assert nodes == set()
    assert edges == set()
Example 4
def test_sentence_workingDependencyPath(capfd):
    text = 'lots of mutations cause dangerous cancer'
    tokens = [kindred.Token(w, None, None, 0, 0) for w in text.split()]

    s = kindred.Sentence(text,
                         tokens,
                         dependencies=[(2, 3, 'a'), (3, 5, 'b'), (4, 5, 'c')])

    nodes, edges = s.extractMinSubgraphContainingNodes([2, 5])
    assert nodes == set([2, 3, 5])
    assert edges == set([(2, 3, 'a'), (3, 5, 'b')])
Example 5
def test_sentence_brokenDependencyPath(capfd):
    text = 'mutations cause dangerous cancer'
    tokens = [kindred.Token(w, None, None, 0, 0) for w in text.split()]

    s = kindred.Sentence(text, tokens, dependencies=[(0, 1, 'a'), (2, 3, 'b')])

    nodes, edges = s.extractMinSubgraphContainingNodes([0, 2])
    out, err = capfd.readouterr()
    assert err.strip() == 'WARNING. No path found between nodes 0 and 2!'
    assert nodes == set()
    assert edges == set()
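Examples 3 to 5 together specify extractMinSubgraphContainingNodes: it returns the nodes and edges of the smallest connected piece of the dependency graph that spans the requested token indices, and writes a warning to stderr (captured by capfd) when nodes are missing or disconnected. Below is a sketch of one way to implement this with networkx; this is an assumption, not necessarily kindred's actual algorithm:

import sys
import networkx as nx

def extractMinSubgraphContainingNodes(dependencies, requestedNodes):
    G = nx.Graph()
    for gov, dep, depType in dependencies:
        # Keep the original (governor, dependent, type) tuple so returned
        # edges match the orientation of the input dependencies
        G.add_edge(gov, dep, original=(gov, dep, depType))

    missing = [n for n in requestedNodes if not G.has_node(n)]
    if missing:
        sys.stderr.write("WARNING. %d node(s) not found in dependency graph!\n" % len(missing))
        return set(), set()

    nodes, edges = set(), set()
    for i, a in enumerate(requestedNodes):
        for b in requestedNodes[i + 1:]:
            try:
                path = nx.shortest_path(G, a, b)
            except nx.NetworkXNoPath:
                sys.stderr.write("WARNING. No path found between nodes %d and %d!\n" % (a, b))
                return set(), set()
            nodes.update(path)
            edges.update(G.edges[u, v]["original"] for u, v in zip(path, path[1:]))
    return nodes, edges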
Example 6
def test_sentence_str(capfd):
    text = 'lots of mutations cause dangerous cancer'
    tokens = [kindred.Token(w, None, None, 0, 0) for w in text.split()]

    e1 = kindred.Entity('thingA', 'mutations', [(0, 1)])
    e2 = kindred.Entity('thingB', 'cancer', [(0, 1)])

    entitiesWithLocations = [(e1, [2]), (e2, [5])]

    s = kindred.Sentence(text,
                         tokens,
                         dependencies=[(2, 3, 'a'), (3, 5, 'b'), (4, 5, 'c')],
                         entitiesWithLocations=entitiesWithLocations)

    assert str(s) == "lots of mutations cause dangerous cancer"
Example 7
def test_sentence_addEntityWithLocations(capfd):
    text = 'lots of mutations cause dangerous cancer'
    tokens = [kindred.Token(w, None, None, 0, 0) for w in text.split()]

    s = kindred.Sentence(text,
                         tokens,
                         dependencies=[(2, 3, 'a'), (3, 5, 'b'), (4, 5, 'c')])

    e1 = kindred.Entity('thingA', 'mutations', [(0, 1)])
    e2 = kindred.Entity('thingB', 'cancer', [(0, 1)])

    s.addEntityAnnotation(e1, [2])
    s.addEntityAnnotation(e2, [5])

    assert s.entityAnnotations == [(e1, [2]), (e2, [5])]
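Example 7 implies that addEntityAnnotation simply records which token indices an entity covers, preserving call order. A plausible minimal implementation (an assumption about kindred's internals):

    def addEntityAnnotation(self, entity, tokenIndices):
        # Append the (entity, token indices) pair; insertion order is kept,
        # so the test can compare against the literal [(e1, [2]), (e2, [5])]
        self.entityAnnotations.append((entity, tokenIndices))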
Example 8
    def parse(self, corpus):
        """
		Parse the corpus. Each document will be split into sentences which are then tokenized and parsed for their dependency graph. All parsed information is stored within the corpus object.
		
		:param corpus: Corpus to parse
		:type corpus: kindred.Corpus
		"""

        assert isinstance(corpus, kindred.Corpus)

        # Ignore DeprecationWarning from SortedDict which is inside IntervalTree
        import warnings
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        for d in corpus.documents:
            entityIDsToEntities = {
                entity.entityID: entity
                for entity in d.entities
            }

            denotationTree = IntervalTree()
            entityTypeLookup = {}
            for e in d.entities:
                entityTypeLookup[e.entityID] = e.entityType

                for a, b in e.position:
                    if b > a:
                        denotationTree[a:b] = e.entityID

            for sentence in self._sentencesGenerator(d.text):
                tokens = []
                for t in sentence:
                    token = kindred.Token(t.text, t.lemma_, t.pos_, t.idx,
                                          t.idx + len(t.text))
                    tokens.append(token)

                sentenceStart = tokens[0].startPos
                sentenceEnd = tokens[-1].endPos
                sentenceTxt = d.text[sentenceStart:sentenceEnd]

                indexOffset = sentence[0].i
                dependencies = []
                for t in sentence:
                    depName = t.dep_
                    dep = (t.head.i - indexOffset, t.i - indexOffset, depName)
                    dependencies.append(dep)

                entityIDsToTokenLocs = defaultdict(list)
                for i, t in enumerate(tokens):
                    entitiesOverlappingWithToken = denotationTree[t.startPos:t.endPos]
                    for interval in entitiesOverlappingWithToken:
                        entityID = interval.data
                        entityIDsToTokenLocs[entityID].append(i)

                sentence = kindred.Sentence(sentenceTxt, tokens, dependencies,
                                            d.sourceFilename)

                # Let's gather up the information about the "known" entities in the sentence
                for entityID, entityLocs in sorted(entityIDsToTokenLocs.items()):
                    # Get the entity associated with this ID
                    e = entityIDsToEntities[entityID]
                    sentence.addEntityAnnotation(e, entityLocs)

                d.addSentence(sentence)

        corpus.parsed = True
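The method above is an excerpt and relies on module-level imports such as "from collections import defaultdict", "from intervaltree import IntervalTree" and "import kindred". A hedged usage sketch, assuming the public kindred API of Corpus, Document and Parser:

import kindred

corpus = kindred.Corpus()
corpus.addDocument(kindred.Document('Lots of mutations cause dangerous cancer.'))

parser = kindred.Parser()
parser.parse(corpus)

for doc in corpus.documents:
    for sentence in doc.sentences:
        print(sentence.text, [t.word for t in sentence.tokens])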
Example 9
    def parse(self, corpus):
        """
		Parse the corpus. Each document will be split into sentences which are then tokenized and parsed for their dependency graph. All parsed information is stored within the corpus object.
		
		:param corpus: Corpus to parse
		:type corpus: kindred.Corpus
		"""

        assert isinstance(corpus, kindred.Corpus)

        for d in corpus.documents:
            entityIDsToEntities = d.getEntityIDsToEntities()

            denotationTree = IntervalTree()
            entityTypeLookup = {}
            for e in d.getEntities():
                entityTypeLookup[e.entityID] = e.entityType

                for a, b in e.position:
                    if b > a:
                        denotationTree[a:b] = e.entityID

            for sentence in self._sentencesGenerator(d.text):
                tokens = []
                for t in sentence:
                    token = kindred.Token(t.text, t.lemma_, t.pos_, t.idx,
                                          t.idx + len(t.text))
                    tokens.append(token)

                sentenceStart = tokens[0].startPos
                sentenceEnd = tokens[-1].endPos
                sentenceTxt = d.text[sentenceStart:sentenceEnd]

                indexOffset = sentence[0].i
                dependencies = []
                for t in sentence:
                    depName = t.dep_
                    dep = (t.head.i - indexOffset, t.i - indexOffset, depName)
                    dependencies.append(dep)

                # TODO: Should I filter this more or just leave it for simplicity

                entityIDsToTokenLocs = defaultdict(list)
                for i, t in enumerate(tokens):
                    entitiesOverlappingWithToken = denotationTree[t.startPos:t.endPos]
                    for interval in entitiesOverlappingWithToken:
                        entityID = interval.data
                        entityIDsToTokenLocs[entityID].append(i)

                # Let's gather up the information about the "known" entities in the sentence
                entitiesWithLocations = []
                for entityID, entityLocs in sorted(entityIDsToTokenLocs.items()):
                    e = entityIDsToEntities[entityID]
                    entityWithLocation = (e, entityLocs)
                    entitiesWithLocations.append(entityWithLocation)

                sentence = kindred.Sentence(sentenceTxt, tokens, dependencies,
                                            entitiesWithLocations,
                                            d.getSourceFilename())
                d.addSentence(sentence)

        corpus.parsed = True
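Both parse variants use an IntervalTree (from the intervaltree package) as the denotation tree: each entity's character span is stored as an interval, and slicing the tree with a token's character span returns every overlapping interval. A small self-contained illustration:

from intervaltree import IntervalTree

tree = IntervalTree()
tree[8:17] = 'E1'    # entity E1 covers characters 8-17
tree[30:36] = 'E2'   # entity E2 covers characters 30-36

for interval in tree[10:14]:  # query with a token's character span
    print(interval.begin, interval.end, interval.data)  # prints: 8 17 E1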
Example 10
    def splitIntoSentences(self):
        """
		Create a new corpus with one document for each sentence in this document.

		:return: Corpus with one document per sentence
		:rtype: kindred.Corpus
		"""

        sentenceCorpus = kindred.Corpus()

        for sentence in self.sentences:
            sentenceStart = sentence.tokens[0].startPos

            entitiesInSentence = [
                entity for entity, tokenIndices in sentence.entityAnnotations
            ]

            entityMap = OrderedDict()
            for e in entitiesInSentence:
                startPos, endPos = e.position[0]
                newPosition = [(startPos - sentenceStart,
                                endPos - sentenceStart)]
                newE = kindred.Entity(e.entityType, e.text, newPosition,
                                      e.sourceEntityID, e.externalID)
                entityMap[e] = newE

            relationsInSentence = [
                r for r in self.relations
                if all(e in entitiesInSentence for e in r.entities)
            ]
            newRelationsInSentence = []
            for r in relationsInSentence:
                newEntitiesInRelation = [entityMap[e] for e in r.entities]
                newRelation = kindred.Relation(r.relationType,
                                               newEntitiesInRelation,
                                               r.argNames, r.probability)
                newRelationsInSentence.append(newRelation)

            newEntitiesInSentence = list(entityMap.values())
            doc = kindred.Document(sentence.text, newEntitiesInSentence,
                                   newRelationsInSentence)

            newTokens = [
                kindred.Token(t.word, t.lemma, t.partofspeech,
                              t.startPos - sentenceStart,
                              t.endPos - sentenceStart)
                for t in sentence.tokens
            ]

            newSentence = kindred.Sentence(sentence.text, newTokens,
                                           sentence.dependencies,
                                           sentence.sourceFilename)
            newEntityAnnotations = [
                (entityMap[e], tokenIndices)
                for e, tokenIndices in sentence.entityAnnotations
            ]
            newSentence.entityAnnotations = newEntityAnnotations
            doc.sentences = [newSentence]

            sentenceCorpus.addDocument(doc)

        return sentenceCorpus
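A hedged usage sketch for splitIntoSentences, assuming a parsed kindred document as produced by the parse methods above:

import kindred

corpus = kindred.Corpus()
corpus.addDocument(kindred.Document('Mutations cause cancer. Mutations are dangerous.'))
kindred.Parser().parse(corpus)

doc = corpus.documents[0]
sentenceCorpus = doc.splitIntoSentences()
print(len(sentenceCorpus.documents))  # one document per sentence, here 2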