def clean_text(text):
    if not text:
        return ''
    abbreviations = identify_parenthetical_phrases()(text)
    parsers = [
        dedash(),
        titlecaps(),
        decaps_text(),
        unidecoder(),
        separate_reference(),
        url_replacement(),
        replace_acronyms(counter=abbreviations, underscore=False),
        pos_tokenizer(pre_pos_blacklist),
        token_replacement(remove=True),
        replace_from_dictionary(),
        pos_tokenizer(post_pos_blacklist),
    ]
    for parser in parsers:
        text = parser(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    return text
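Each nlpre parser above is a class instantiated once and then applied as a callable; pos_tokenizer, pre_pos_blacklist, post_pos_blacklist, remove_stopwords, and lemmatize are defined elsewhere in that project. A minimal self-contained sketch of the same call-the-parsers-in-order pattern, with a reduced parser list and a made-up sample string (the exact output depends on the installed nlpre version):

from nlpre import dedash, unidecoder, token_replacement

sample = "Waste-\nwater treatment & reuse"
for parser in [dedash(), unidecoder(), token_replacement()]:
    sample = parser(sample)
# dedash should rejoin the hyphenated line break and token_replacement
# should spell out "&", giving roughly "Wastewater treatment and reuse"
print(sample)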
def clean_text(text):
    # spell out bare digits so the downstream parsers see words
    p = inflect.engine()
    no_digits = []
    for s in text.split(' '):
        if s.isdigit():
            no_digits.append(p.number_to_words(s))
        else:
            no_digits.append(s)
    text = ' '.join(no_digits)

    for f in [nlpre.token_replacement(),
              nlpre.dedash(),
              nlpre.separated_parenthesis(),
              nlpre.replace_acronyms(nlpre.identify_parenthetical_phrases()(text))]:
        # , nlpre.decaps_text(), nlpre.titlecaps()
        text = f(text)

    # the pipeline can append a trailing period; drop it if the original
    # text did not end with one
    if text[-1] == '.' and no_digits[-1][-1] != '.':
        text = text[:-1]
    text = text.replace('\n', ' ')
    return text
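The digit handling above leans on the inflect package; a quick standalone check of number_to_words (these outputs are standard inflect behavior):

import inflect

p = inflect.engine()
print(p.number_to_words("100"))  # one hundred
print(p.number_to_words("42"))   # forty-two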
def call(self, data):
    # ABBR feeds replace_acronyms below (currently commented out)
    ABBR = identify_parenthetical_phrases()(data)
    parsers = [
        dedash(),
        # titlecaps(),
        separate_reference(),
        unidecoder(),
        token_replacement(),
        url_replacement(),
        # replace_acronyms(ABBR, underscore=False),
        # separated_parenthesis(),
        # replace_from_dictionary(prefix="MeSH_")
    ]
    cleansed = data
    for f in parsers:
        cleansed = f(cleansed)
    return cleansed.replace('\n', ' ')
def setup_class(cls):
    cls.parser = token_replacement()
import sys
from argparse import ArgumentParser

from nlpre import titlecaps, dedash, identify_parenthetical_phrases
from nlpre import replace_acronyms, replace_from_dictionary
from nlpre import separated_parenthesis, unidecoder, token_replacement
from nlpre import url_replacement, separate_reference

if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument(
        "-t", "--text",
        dest="text",
        help="The text to clean",
        metavar="TEXT")
    args = parser.parse_args()
    data = args.text or ''

    ABBR = identify_parenthetical_phrases()(data)
    parsers = [
        dedash(),
        # titlecaps(),
        separate_reference(),
        unidecoder(),
        token_replacement(),
        url_replacement(),
        replace_acronyms(ABBR, underscore=False),
        separated_parenthesis(),
        # replace_from_dictionary(prefix="MeSH_")
    ]

    cleansed = data
    for f in parsers:
        cleansed = f(cleansed)

    sys.stdout.write(cleansed.replace('\n', ' '))
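The one non-obvious step here is identify_parenthetical_phrases, whose result feeds replace_acronyms. A sketch of what it returns, on a made-up sentence; per the nlpre README the result is a collections.Counter keyed by (phrase_tuple, acronym) pairs:

from nlpre import identify_parenthetical_phrases

doc = "The Environmental Protection Agency (EPA) was cited. The EPA replied."
print(identify_parenthetical_phrases()(doc))
# Counter({(('Environmental', 'Protection', 'Agency'), 'EPA'): 1})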
def remove_test(self):
    doc = "Working at 100% efficiency"
    doc_new = token_replacement(remove=True)(doc)
    doc_right = "Working at 100 efficiency"
    assert_equal(doc_new, doc_right)
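For contrast with remove=True, a hedged sketch of the default behavior: my understanding is that token_replacement spells symbols out rather than dropping them, so the expected outputs in the comments are assumptions, not verified output.

from nlpre import token_replacement

doc = "Working at 100% efficiency"
print(token_replacement()(doc))
# expected: "Working at 100 percent efficiency"
print(token_replacement(remove=True)(doc))
# expected: "Working at 100 efficiency" (matches the test above)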