from nlpre import (identify_parenthetical_phrases, dedash, titlecaps,
                   decaps_text, unidecoder, separate_reference,
                   url_replacement, replace_acronyms, pos_tokenizer,
                   token_replacement, replace_from_dictionary)


def clean_text(text):
    if not text:
        return ''

    # Count abbreviations defined in parentheses, e.g. "World Health
    # Organization (WHO)", so replace_acronyms can substitute them below.
    abbreviations = identify_parenthetical_phrases()(text)

    # NLPre parsers applied in order; pre_pos_blacklist and post_pos_blacklist
    # are POS-tag blacklists defined elsewhere in the module.
    parsers = [
        dedash(),
        titlecaps(),
        decaps_text(),
        unidecoder(),
        separate_reference(),
        url_replacement(),
        replace_acronyms(counter=abbreviations, underscore=False),
        pos_tokenizer(pre_pos_blacklist),
        token_replacement(remove=True),
        replace_from_dictionary(),
        pos_tokenizer(post_pos_blacklist),
    ]

    for parser in parsers:
        text = parser(text)

    # remove_stopwords and lemmatize are helpers defined outside this snippet.
    text = remove_stopwords(text)
    text = lemmatize(text)

    return text
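
# Usage sketch (assumption, not from the source): pre_pos_blacklist,
# post_pos_blacklist, remove_stopwords and lemmatize must be defined at
# module level; the sample text below is made up.
raw = ("INFLUENZA SURVEILLANCE REPORT. The World Health Organization (WHO) "
       "published inter-\nmittent updates at https://www.who.int.")
print(clean_text(raw))
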
from nose.tools import assert_equal
from nlpre import titlecaps

# The test-class wrapper and imports are assumed here to make the fragment
# self-contained; the original snippet shows only the method body.
class Titlecaps_Test:

    def long_enough_sentence_test(self):
        # A sentence below min_length is returned unchanged.
        caps = titlecaps(min_length=20)
        doc = "SENTENCE TOO SHORT"
        doc_right = "SENTENCE TOO SHORT"
        doc_new = caps(doc)

        assert_equal(doc_new, doc_right)
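
    # Sketch (not from the original tests): the complementary case, a sentence
    # longer than the threshold. The sample text, min_length value, and method
    # name are assumptions; the exact casing titlecaps produces is only
    # printed, not asserted.
    def long_sentence_sketch(self):
        caps = titlecaps(min_length=2)
        doc = "THIS ALL CAPS SENTENCE IS LONG ENOUGH TO BE PROCESSED"
        doc_new = caps(doc)
        print(doc_new)  # expected to be re-cased once above min_length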
Example #3
    def long_enough_sentence_test(self):
        # The sentence falls below the parser's default minimum length,
        # so it is returned unchanged.
        caps = titlecaps()
        doc = 'THIS SENTENCE SHORT'
        doc_right = 'THIS SENTENCE SHORT'
        doc_new = caps(doc)

        assert_equal(doc_new, doc_right)
    def setup_class(cls):
        cls.parser = titlecaps(min_length=1)

    def setup_class(cls):
        # Fixtures for tests that chain titlecaps and dedash.
        from nlpre import dedash

        cls.parser0 = titlecaps(min_length=1)
        cls.parser1 = dedash()
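
    # Sketch (assumption, not from the source): one way these fixtures might
    # be exercised. The sample text and expectations are made up; titlecaps is
    # applied first, then dedash, mirroring the fixture order above.
    def titlecaps_then_dedash_sketch(self):
        doc = "THE REQUIRE-\nMENT IS LISTED BELOW."
        step1 = self.parser0(doc)    # titlecaps fixture
        step2 = self.parser1(step1)  # dedash fixture
        print(step2)  # "require-\nment" is expected to be rejoined by dedash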