def normalize_content(document):
    """Normalize every non-empty sentence of `document` and rejoin with spaces.

    Sentences are produced by `sent_tokenize`; sentences that
    `StringUtils.is_not_empty` rejects are dropped before normalization.
    """
    normalized_sentences = []
    for raw_sentence in sent_tokenize(document):
        # Skip blank tokenizer output before doing any per-sentence work.
        if StringUtils.is_not_empty(raw_sentence):
            normalized_sentences.append(normalize_sentence(raw_sentence))
    return ' '.join(normalized_sentences)
def normalize_sentence(sentence):
    """Normalize one sentence: lowercase, treat '/' as a word separator,
    normalize each non-empty token, strip surrounding whitespace, and
    ensure the result ends with a dot via `add_dot_at_end_of_line`.
    """
    lowered = sentence.lower().replace('/', ' ')
    cleaned_tokens = [
        normalize_word(token)
        for token in word_tokenize(lowered)
        if StringUtils.is_not_empty(token)
    ]
    rejoined = ' '.join(cleaned_tokens).strip(' \t\n\r')
    return add_dot_at_end_of_line(rejoined)