Example #1
import nltk


def tokenize(input_text):
    # Make sure the NLTK stopword corpus is available, then load it once.
    nltk.download('stopwords', quiet=True)
    stopwords = set(nltk.corpus.stopwords.words('english'))

    sentences = split_input_text_into_sentences(input_text)

    all_tokens_of_all_sentences = []

    for sent_index, sent in enumerate(sentences):
        tokens_in_this_sentence = []

        for word_index, word in enumerate(sent.split()):
            token = Token()
            token.original_word = word
            token.word_without_punctuations = remove_surrounding_punctuations(
                word).lower()

            token.is_stopword = token.word_without_punctuations in stopwords

            tokens_in_this_sentence.append(token)

        all_tokens_of_all_sentences.append(tokens_in_this_sentence)

    set_parts_of_speech_in_tokens(all_tokens_of_all_sentences)

    return all_tokens_of_all_sentences


def generate_output_text_from_tokens(all_tokens_of_all_sentences):
    output_text = ""

    for tokens_in_this_sentence in all_tokens_of_all_sentences:
        for token in tokens_in_this_sentence:
            if token.replaced_word:
                # Carry the original word's casing and punctuation over to
                # the replacement, mirroring the calls in the __main__ block.
                replaced_word = restore_case(
                    token.replaced_word,
                    remove_surrounding_punctuations(token.original_word))
                replaced_word = restore_punctuations(replaced_word,
                                                     token.original_word)

                output_text += replaced_word + " "

            elif token.original_word:
                output_text += token.original_word + " "

    return output_text.strip()
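

# Minimal sketches of the helpers the listing uses but does not define. The
# names and signatures are taken from the call sites above; the bodies are
# plausible stand-ins, not the original implementations.
import string


class Token:
    """One word of the input plus the metadata the pipeline attaches to it."""

    def __init__(self):
        self.original_word = None
        self.word_without_punctuations = None
        self.is_stopword = False
        self.part_of_speech = None
        self.replaced_word = None


def split_input_text_into_sentences(input_text):
    # Sentence-split with NLTK's Punkt model (downloaded on first use).
    nltk.download('punkt', quiet=True)
    return nltk.sent_tokenize(input_text)


def remove_surrounding_punctuations(word):
    # Strip leading and trailing punctuation, leaving inner characters intact.
    return word.strip(string.punctuation)


def restore_case(new_word, old_word):
    # Copy the old word's casing pattern onto the replacement word.
    if old_word.isupper():
        return new_word.upper()
    if old_word[:1].isupper():
        return new_word.capitalize()
    return new_word


def restore_punctuations(new_word, old_word):
    # Re-attach whatever punctuation surrounded the old word.
    stripped = old_word.strip(string.punctuation)
    if not stripped:
        return old_word
    start = old_word.index(stripped)
    return old_word[:start] + new_word + old_word[start + len(stripped):]


def set_parts_of_speech_in_tokens(all_tokens_of_all_sentences):
    # Tag every sentence with NLTK's averaged-perceptron POS tagger.
    nltk.download('averaged_perceptron_tagger', quiet=True)
    for tokens in all_tokens_of_all_sentences:
        words = [t.word_without_punctuations for t in tokens]
        for token, (_, tag) in zip(tokens, nltk.pos_tag(words)):
            token.part_of_speech = tag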


if __name__ == '__main__':
    # Demo: carry the casing and punctuation of ",Amazing." over to the
    # replacement word "excellent".
    rep = "excellent"
    orig = ",Amazing."

    rep = restore_case(rep, remove_surrounding_punctuations(orig))
    rep = restore_punctuations(rep, orig)

    print(rep)

    # Demo: rebuild a sentence in which one token carries a replacement.
    t1 = Token()
    t1.original_word = 'This'

    t2 = Token()
    t2.original_word = 'is'

    t3 = Token()
    t3.original_word = 'amazing!'
    t3.replaced_word = 'awesome'

    print(generate_output_text_from_tokens([[t1, t2, t3]]))
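
# With the sketched helpers above, running this module is expected to print:
#   ,Excellent.
#   This is awesome!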