import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize


def tokenize(text):
    '''Clean and tokenize input messages.

    Replaces URLs with a placeholder, removes English stop words,
    stems and lemmatizes each token, and drops tokens that are not
    purely alphabetic.
    '''

    # replace urls with placeholder
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    # tokenize text
    tokens = word_tokenize(text)

    # process tokens further in a loop; build the stop-word set, stemmer
    # and lemmatizer once instead of on every iteration
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:

        # normalize case and whitespace so the stop-word check also
        # matches capitalized words such as "The"
        tok = tok.lower().strip()

        # Remove stop words
        if tok in stop_words:
            continue

        # Reduce words to their stems
        tok = stemmer.stem(tok)

        # Reduce words to their root form
        tok = lemmatizer.lemmatize(tok)

        # append to list
        clean_tokens.append(tok)

    # Remove tokens that contain any non-alphabetic characters
    clean_tokens = [tok for tok in clean_tokens if tok.isalpha()]

    # return clean and tokenized text
    return clean_tokens
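

# A minimal usage sketch, not part of the original module, showing how
# tokenize() might be exercised on a sample message. The sample text is an
# assumption for illustration. Running it requires the NLTK resources
# 'punkt', 'stopwords', and 'wordnet' to be downloaded beforehand
# (e.g. via nltk.download('punkt')).
if __name__ == "__main__":
    sample = "Water needed urgently, see https://example.com/help for details!"
    tokens = tokenize(sample)
    # The URL is collapsed to a single placeholder token (which the stemmer
    # may shorten), and punctuation-only tokens are dropped by isalpha().
    print(tokens)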