# Normalize text
# for df in train, test:
#     df["comment_text"] = normalizeString(df["comment_text"])

# stemmer = PorterStemmer()
# def custom_tokenize(text):
#     tokens = wordpunct_tokenize(text)
#     tokens = [stemmer.stem(token) for token in tokens]
#     return tokens

# Tokenize comments
tok = Tokenizer(max_features=MAX_FEATURES, max_len=MAX_LEN,
                tokenizer=wordpunct_tokenize)
X = tok.fit_transform(
    pd.concat([
        train_preproc["comment_text"].astype(str).fillna("na"),
        test_preproc["comment_text"].astype(str).fillna("na")
    ]))
X_train = X[:len(train), :]
X_test = X[len(train):, :]
print(X_train.shape, X_test.shape)
print("<+++++++>")
print("Total words found by tokenizer in train and test are {}".format(
    len(tok.doc_freq)))
print("Top 10 words in vocab are {}".format(tok.doc_freq.most_common(10)))
print("Last 10 words to be used in vocab with their freq are {}".format(
    tok.doc_freq.most_common(MAX_FEATURES)[-10:]))

# Initialize embeddings
embedding_matrix, oov_list = initialize_embeddings(EMBEDDING_FILE, tok)
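
# A minimal sketch of what initialize_embeddings might look like, assuming a
# GloVe-style text file (one word followed by its space-separated floats per
# line) and that the Tokenizer assigns indices 1..MAX_FEATURES to its most
# frequent words, with index 0 reserved for padding. The helper name, the
# vocab construction, and embed_dim are illustrative assumptions, not the
# project's actual implementation.
import numpy as np

def initialize_embeddings_sketch(embedding_file, tok, embed_dim=300):
    # Load pretrained vectors into a dict: word -> vector of shape (embed_dim,)
    vectors = {}
    with open(embedding_file, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            vectors[parts[0]] = np.asarray(parts[1:], dtype="float32")

    # Rows line up with token indices; row 0 stays all-zero for padding
    matrix = np.zeros((MAX_FEATURES + 1, embed_dim), dtype="float32")
    oov = []
    for idx, (word, _) in enumerate(
            tok.doc_freq.most_common(MAX_FEATURES), start=1):
        vec = vectors.get(word)
        if vec is not None:
            matrix[idx] = vec
        else:
            oov.append(word)  # no pretrained vector for this word
    return matrix, oov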
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

if __name__ == "__main__":
    # Stop words come from the top few dozen frequent tokens identified by
    # the Tokenizer; all are grammatical constructs with little semantic meaning
    stop_words_custom = [
        'a', 'and', 'the', 'is', 'am', 'are', 'he', 'she', 'it', 'to', 'an'
    ]

    # priors, training_documents, training_labels = generate_training_samples(sys.argv[1])
    priors, training_documents, training_labels = generate_training_samples(
        "op_spam_training_data/")

    # Build Tokenizer and turn training documents into integer tokens
    tok = Tokenizer(num_tokens=None, stop_words=stop_words_custom)
    tokenized_train = tok.fit_transform(training_documents)

    # Convert training samples and labels to numpy arrays
    X = list_to_numpy(tokenized_train, tok)
    y = np.asarray(training_labels)

    # Split off development data; fixed random_state for reproducibility
    # while debugging
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=49)

    # Fit model on training data; alpha is the Lidstone smoothing parameter
    # and belongs to the constructor, not to fit()
    nb_clf = MultinomialNB(alpha=0.9)
    nb_clf.fit(X_train, y_train)
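
    # A minimal evaluation sketch on the held-out split, using standard
    # scikit-learn calls; this step is an illustration added here, not part
    # of the original script
    from sklearn.metrics import classification_report

    y_pred = nb_clf.predict(X_test)
    print("Held-out accuracy: {:.3f}".format(nb_clf.score(X_test, y_test)))
    print(classification_report(y_test, y_pred))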