def clean_text(text):
    """Run *text* through the full NLPre cleaning pipeline.

    Returns '' immediately for falsy input. Otherwise the text is
    passed through each parser in order, then stopwords are removed
    and the result is lemmatized.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned, lemmatized text.
    """
    if not text:
        return ''

    # The abbreviation counter feeds replace_acronyms further down.
    abbreviations = identify_parenthetical_phrases()(text)

    pipeline = (
        dedash(),
        titlecaps(),
        decaps_text(),
        unidecoder(),
        separate_reference(),
        url_replacement(),
        replace_acronyms(counter=abbreviations, underscore=False),
        pos_tokenizer(pre_pos_blacklist),
        token_replacement(remove=True),
        replace_from_dictionary(),
        pos_tokenizer(post_pos_blacklist),
    )
    for step in pipeline:
        text = step(text)

    text = remove_stopwords(text)
    return lemmatize(text)
def long_enough_sentence_test(self):
    """With min_length=20, an 18-character all-caps document is left
    unchanged by titlecaps.

    NOTE(review): the name says "long enough" but the fixture is
    shorter than min_length — presumably this exercises the
    pass-through branch; confirm intent.
    """
    parser = titlecaps(min_length=20)
    sample = "SENTENCE TOO SHORT"
    expected = "SENTENCE TOO SHORT"
    result = parser(sample)
    assert_equal(result, expected)
def long_enough_sentence_test(self):
    """A short all-caps document run through titlecaps with default
    settings is expected to come back unchanged.

    NOTE(review): this method shares its name with another
    ``long_enough_sentence_test`` in this chunk — if both live in the
    same class, the later definition shadows the earlier one; verify
    they belong to different test classes.
    """
    parser = titlecaps()
    sample = 'THIS SENTENCE SHORT'
    expected = 'THIS SENTENCE SHORT'
    result = parser(sample)
    assert_equal(result, expected)
def setup_class(cls):
    # Build one shared titlecaps parser for every test in this class;
    # min_length=1 makes even single-character sentences eligible.
    cls.parser = titlecaps(min_length=1)
def setup_class(cls):
    """Create the two shared parsers used by this class's tests:
    a titlecaps parser (min_length=1) and a dedash parser.
    """
    # Import kept local, matching the original: dedash is only needed
    # once test setup actually runs.
    from nlpre import dedash

    cls.parser1 = dedash()
    cls.parser0 = titlecaps(min_length=1)