    # Remove all single-character words
    document = re.sub(r'\s[a-zA-Z]\s', ' ', document)
    # Substituting multiple spaces with a single space
    document = re.sub(r'\s+', ' ', document)
    # Tokenizing
    document = WordPunctTokenizer().tokenize(document)
    # Remove Stopwords
    document = [word for word in document if word not in stopset]
    # Stemming
    stemmer = SnowballStemmer('english')
    document = [stemmer.stem(t) for t in document]
    doc_length.append(len(document))
    document = ' '.join(document)
    # Remove single characters that may have been created by tokenization
    document = re.sub(r'\s[a-zA-Z]\s', ' ', document)
    # Normalising some words of interest ('bp' is matched on word boundaries,
    # and after stemming 'pressure' appears as 'pressur')
    document = re.sub(r'\bbp\b', 'bloodpressure', document)
    document = document.replace('blood pressur', 'bloodpressure')
    document = document.replace('ordered', 'order')
    # Substituting multiple spaces with a single space
    document = re.sub(r'\s+', ' ', document)
    X.append(document)
df['incident'] = X
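# For context, the loop above assumes a setup along these lines earlier in the
# script (a sketch only -- stopset, X and doc_length are the names used in the
# loop body, and the stop word source and lowercasing are assumptions):
#
#     import re
#     import nltk
#     from nltk.corpus import stopwords
#     from nltk.tokenize import WordPunctTokenizer
#     from nltk.stem.snowball import SnowballStemmer
#
#     stopset = set(stopwords.words('english'))
#     X, doc_length = [], []
#     for document in df['incident'].astype(str).str.lower():
#         ...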

# Most common features after the stemming pre-processing
tokens = df.incident.str.cat(sep=' ')
# tokenizing the full corpus shows there are 1,297,146 words in total
tokens = WordPunctTokenizer().tokenize(tokens)
# frequency distribution of the tokens shows there are 21,116 unique words
unique_words = nltk.FreqDist(tokens)
top_words = unique_words.most_common(50)
# plotting the most common words
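# A minimal sketch of such a plot, assuming matplotlib is available
# (top_words is the list of (word, count) pairs built above):
import matplotlib.pyplot as plt

words, counts = zip(*top_words)
plt.figure(figsize=(15, 5))
plt.bar(words, counts)
plt.xticks(rotation=90)
plt.ylabel('Frequency')
plt.title('50 most common words after pre-processing')
plt.tight_layout()
plt.show()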