Esempio n. 1
0
def two_incomplete_test():
    """A terminal-punctuated sentence followed by an unterminated one
    should tokenize into exactly two sentences."""
    text = "This should be two sentences.  Second one incomplete"
    expected = ["This should be two sentences.", "Second one incomplete"]

    document = nlp.sent_tokenize(text)
    sentences = list(document.sents())

    for actual, wanted in zip(sentences, expected):
        assert str(actual) == str(wanted)
    assert len(sentences) == len(expected)
Esempio n. 2
0
def nothing_test():
    """Whitespace-only input should yield no sentences at all."""
    text = "  "
    expected = []

    document = nlp.sent_tokenize(text)
    sentences = list(document.sents())

    for got, want in zip(sentences, expected):
        assert str(got) == str(want)
    assert len(sentences) == len(expected)
Esempio n. 3
0
def one_incomplete_test():
    """A single sentence with no terminal punctuation is kept whole."""
    text = "One incomplete sentence"
    expected = ["One incomplete sentence"]

    tokenized = nlp.sent_tokenize(text)
    sentences = [sentence for sentence in tokenized.sents()]

    for pair in zip(sentences, expected):
        assert str(pair[0]) == str(pair[1])
    assert len(sentences) == len(expected)
Esempio n. 4
0
def two_long_test():
    """A run of mixed terminal punctuation (!!!?!!) still ends the
    sentence, producing a split before the following one."""
    text = "This should be two sentences!!!?!!  There is a split."
    expected = ["This should be two sentences!!!?!!", "There is a split."]

    tokenized = nlp.sent_tokenize(text)
    found = list(tokenized.sents())

    for got, want in zip(found, expected):
        assert str(got) == str(want)
    assert len(found) == len(expected)
Esempio n. 5
0
def no_split_test():
    """Input containing exactly one sentence must not be split."""
    text = "This should only be one sentence."
    expected = ["This should only be one sentence."]

    result = nlp.sent_tokenize(text)
    sentences = list(result.sents())

    for produced, reference in zip(sentences, expected):
        assert str(produced) == str(reference)
    assert len(sentences) == len(expected)
Esempio n. 6
0
def mr_test():
    """The period in the abbreviation 'Mr.' must not trigger a split."""
    text = "Mr. White got a loaf of bread"
    expected = ["Mr. White got a loaf of bread"]

    document = nlp.sent_tokenize(text)
    sentences = [item for item in document.sents()]

    for produced, reference in zip(sentences, expected):
        assert str(produced) == str(reference)
    assert len(sentences) == len(expected)
Esempio n. 7
0
def funny_test():
    """Leading quote/paren punctuation stays attached to its sentence."""
    text = "'') Funny stuff joined on."
    expected = ["'') Funny stuff joined on."]

    tokenized = nlp.sent_tokenize(text)
    sentences = list(tokenized.sents())

    for got, want in zip(sentences, expected):
        assert str(got) == str(want)
    assert len(sentences) == len(expected)
Esempio n. 8
0
def parens_quotes_test():
    """Sentence boundaries are detected after a closing parenthesis and
    after a closing quote inside parentheses.

    Fix: removed a stray debug ``print(sent)`` left inside the assertion
    loop — no sibling test prints, and it only added noise to test output.
    """
    string = "(Break after a parenthesis.)  (Or after \"quoted stuff!\")"
    gold = ["(Break after a parenthesis.)", "(Or after \"quoted stuff!\")"]

    doc = nlp.sent_tokenize(string)
    sents = [s for s in doc.sents()]

    # Compare element-wise first, then confirm no extra/missing sentences.
    for sent, gold_sent in zip(sents, gold):
        assert str(sent) == str(gold_sent)
    assert len(sents) == len(gold)
Esempio n. 9
0
    def run(self):
        """Return the first sentence of ``self['fullBody']`` when the body
        contains the lemma 'die'; falls off the end (returning None) when
        it does not.

        NOTE(review): ``word_tokenize(fb)`` re-tokenizes the FULL body on
        every loop iteration, so the condition is identical on each pass
        and the FIRST sentence is always returned whenever 'die' appears
        anywhere in the body. This looks like it was meant to be
        ``word_tokenize(p)`` (search each candidate sentence) — confirm
        intent before changing.
        """

        from nlp import sent_tokenize, word_tokenize, lemmatize
        fb = self['fullBody']

        possibilities = sent_tokenize(fb)

        for i, p in enumerate(possibilities):
            # 'words' becomes a lazy map iterator of lemmas; it is consumed
            # once by the comprehension below.
            words = word_tokenize(fb)
            words = map(lemmatize, words)
            # Truthy when at least one lemma equals 'die'.
            if len([x for x in words if x == 'die']):
                return p