# Normalize text
    #for df in train, test:
    #    df["comment_text"] = normalizeString(df["comment_text"])
    #stemmer = PorterStemmer()
    #def custom_tokenize(text):
    #    tokens = wordpunct_tokenize(text)
    #    tokens = [stemmer.stem(token) for token in tokens]
    #    return tokens
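    # normalizeString is referenced above but not shown in this snippet. A minimal
    # sketch of such a helper (hypothetical: simple lowercasing plus stripping of
    # non-alphanumeric characters) could look like:
    #
    # import re
    # def normalizeString(text):
    #     text = text.lower().strip()
    #     return re.sub(r"[^a-z0-9' ]+", " ", text)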

    # Tokenize comments
    tok = Tokenizer(max_features=MAX_FEATURES,
                    max_len=MAX_LEN,
                    tokenizer=wordpunct_tokenize)
    X = tok.fit_transform(
        pd.concat([
            train_preproc["comment_text"].fillna("na").astype(str),
            test_preproc["comment_text"].fillna("na").astype(str)
        ]))
    X_train = X[:len(train), :]
    X_test = X[len(train):, :]

    print(X_train.shape, X_test.shape)
    print("<+++++++>")
    print("Total words found by tokenizer in train and test are {}".format(
        len(tok.doc_freq)))
    print("Top 10 words in vocab are {}".format(tok.doc_freq.most_common(10)))
    print("Last 10 words to be used vocab with their freq are {}".format(
        tok.doc_freq.most_common(MAX_FEATURES)[-10:]))

    # Initialize embeddings
    embedding_matrix, oov_list = initialize_embeddings(EMBEDDING_FILE, tok)
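
    # initialize_embeddings is defined elsewhere in this project and is not shown
    # here. A minimal sketch of what such a helper might do (hypothetical; assumes
    # a GloVe-style text file of "word v1 v2 ..." lines, a fixed embed_dim, and
    # numpy imported as np) could be:
    #
    # def initialize_embeddings(embedding_file, tok, embed_dim=300):
    #     vectors = {}
    #     with open(embedding_file, encoding="utf8") as f:
    #         for line in f:
    #             word, *coefs = line.rstrip().split(" ")
    #             vectors[word] = np.asarray(coefs, dtype="float32")
    #     vocab = [w for w, _ in tok.doc_freq.most_common(MAX_FEATURES)]
    #     embedding_matrix = np.zeros((len(vocab) + 1, embed_dim))  # row 0 left for padding
    #     oov_list = []
    #     for i, word in enumerate(vocab, start=1):
    #         if word in vectors:
    #             embedding_matrix[i] = vectors[word]
    #         else:
    #             oov_list.append(word)
    #     return embedding_matrix, oov_list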
Example #2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

if __name__ == "__main__":
    # Stop words come from the few dozen most frequent tokens identified by the Tokenizer;
    # all are grammatical function words carrying little semantic meaning
    stop_words_custom = [
        'a', 'and', 'the', 'is', 'am', 'are', 'he', 'she', 'it', 'to', 'an'
    ]

    #priors, training_documents, training_labels = generate_training_samples(sys.argv[1])
    priors, training_documents, training_labels = generate_training_samples(
        "op_spam_training_data/")

    # Build Tokenizer and turn training documents into integer tokens
    tok = Tokenizer(num_tokens=None, stop_words=stop_words_custom)
    tokenized_train = tok.fit_transform(training_documents)

    # Convert training samples and labels to numpy arrays
    X = list_to_numpy(tokenized_train, tok)
    y = np.asarray(training_labels)

    # Split off development data
    # Fixed random_state for reproducible DEBUG runs
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=49)
    # Fit model on training data
    nb_clf = MultinomialNB(alpha=0.9)  # alpha is the Laplace/Lidstone smoothing parameter
    nb_clf.fit(X_train, y_train)
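
    # The original snippet stops after fitting. A hypothetical next step (not part
    # of the source) would be checking the model on the held-out split, assuming
    # nb_clf exposes a standard predict method as sklearn's MultinomialNB does:
    #
    # y_pred = nb_clf.predict(X_test)
    # print("Dev accuracy: {:.3f}".format(np.mean(y_pred == y_test)))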