def tokenize(input_text):
    """Tokenize *input_text* into a list of sentences, each a list of Token objects.

    Each Token carries:
      - original_word: the raw whitespace-delimited word,
      - word_without_punctuations: lowercased word with surrounding
        punctuation stripped,
      - is_stopword: True if the cleaned word is an English stopword.

    Parts of speech are filled in afterwards by
    set_parts_of_speech_in_tokens (defined elsewhere in this file).

    Returns:
        list[list[Token]]: one inner list per sentence.
    """
    # quiet=True: this runs on every call, and without it nltk prints a
    # status message each time even when the corpus is already present.
    nltk.download('stopwords', quiet=True)
    # Renamed from `stopwords` to avoid shadowing nltk.corpus.stopwords.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    sentences = split_input_text_into_sentences(input_text)
    all_tokens_of_all_sentences = []
    for sent in sentences:
        tokens_in_this_sentence = []
        for word in sent.split():
            token = Token()
            token.original_word = word
            token.word_without_punctuations = remove_surrounding_punctuations(
                word).lower()
            # Direct boolean assignment replaces the original if/else.
            token.is_stopword = token.word_without_punctuations in stop_words
            tokens_in_this_sentence.append(token)
        all_tokens_of_all_sentences.append(tokens_in_this_sentence)
    set_parts_of_speech_in_tokens(all_tokens_of_all_sentences)
    return all_tokens_of_all_sentences
token.original_word) output_text += replaced_word + " " elif token.original_word: output_text += token.original_word + " " return output_text.strip() if __name__ == '__main__': rep = "excellent" orig = ",Amazing." rep = restore_case(rep, remove_surrounding_punctuations(orig)) rep = restore_punctuations(rep, orig) print(rep) t1 = Token() t1.original_word = 'This' t2 = Token() t2.original_word = 'is' t3 = Token() t3.original_word = 'amazing!' t3.replaced_word = 'awesome' print(generate_output_text_from_tokens([[t1, t2, t3]]))