def main(): feature_vector = FeatureVector(SMS_COLLECTION) feature_vector.data_process(sep='\t') messages = feature_vector.messages feature_vector.transformer() bow_transformer = feature_vector.bow_transformer messages_bow = feature_vector.messages_bow print "Describing the messages ..." print messages.groupby('label').describe() print 'sparse matrix shape:', messages_bow.shape print 'number of non-zeros:', messages_bow.nnz print 'sparsity: %.2f%%' % (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1])) print "TF_IDF normalization ... " tfidf_transformer = TfidfTransformer().fit(messages_bow) messages_tfidf = tfidf_transformer.transform(messages_bow) print "transform all ham/spam ... ", messages_tfidf.shape spam_detector = MultinomialNB().fit(messages_tfidf, messages['label']) all_predictions = spam_detector.predict(messages_tfidf) print 'accuracy', accuracy_score(messages['label'], all_predictions) print 'confusion matrix\n', confusion_matrix(messages['label'], all_predictions) print '(row=expected, col=predicted)' print classification_report(messages['label'], all_predictions) test_bow_transformer(messages, bow_transformer, tfidf_transformer, spam_detector)
def main(): print ("SVM Approach") print ("Generating messages ...") feature_vector = FeatureVector(SMS_COLLECTION) feature_vector.data_process(sep='\t') messages = feature_vector.messages print "Splitting into train and cross-validation sets ..." msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['label'], test_size=0.2) print len(msg_train), len(msg_test), len(msg_train) + len(msg_test) print msg_train.shape, msg_test.shape print "Creating Pipeline for the analyzing and training ..." pipeline = Pipeline([ ('bow', CountVectorizer(analyzer=split_into_lemmas)), # strings to token integer counts ('tfidf', TfidfTransformer()), # integer counts to weighted TF-IDF scores ('classifier', SVC()), # train on TF-IDF vectors w/ Naive Bayes classifier ]) # pipeline parameters to automatically explore and tune param_svm = [ {'classifier__C': [1, 10, 100, 1000], 'classifier__kernel': ['linear']}, {'classifier__C': [1, 10, 100, 1000], 'classifier__gamma': [0.001, 0.0001], 'classifier__kernel': ['rbf']}, ] print("pipeline:", [name for name, _ in pipeline.steps]) for name, v in pipeline.steps: print name print v grid_svm = GridSearchCV( pipeline, # pipeline from above param_grid=param_svm, # parameters to tune via cross validation refit=True, # fit using all data, on the best detected classifier n_jobs=-1, # number of cores to use for parallelization; -1 for "all cores" scoring='accuracy', # what score are we optimizing? cv=StratifiedKFold(label_train, n_folds=5), # what type of cross validation to use ) svm_detector = grid_svm.fit(msg_train, label_train) # find the best combination from param_svm print "\nScores for various cases ..." for i in xrange(len(svm_detector.grid_scores_)): print svm_detector.grid_scores_[i] curve = plot_learning_curve(pipeline, "accuracy vs. training set size", msg_train, label_train, cv=5) curve.savefig("./plots/acc-vs-trainSize_SVM.png") pipeline.fit(msg_train, label_train) #trained here print "Score in 20% of test dataset" test_predictions = svm_detector.predict(msg_test) print 'accuracy', accuracy_score(label_test, test_predictions) print 'confusion matrix\n', confusion_matrix(label_test, test_predictions) print '(row=expected, col=predicted)' print classification_report(label_test, test_predictions)
def main(): print("DecisionTree Approach") print("Generating messages ...") feature_vector = FeatureVector(SMS_COLLECTION) feature_vector.data_process(sep='\t') messages = feature_vector.messages print "Splitting into train and cross-validation sets ..." msg_train, msg_test, label_train, label_test = train_test_split( messages['message'], messages['label'], test_size=0.2) print len(msg_train), len(msg_test), len(msg_train) + len(msg_test) print msg_train.shape, msg_test.shape print "\nCreating Pipeline for the analyzing and training ..." dt_old = Pipeline([ ('bow', CountVectorizer( analyzer=split_into_lemmas)), # strings to token integer counts ('tfidf', TfidfTransformer()), # integer counts to weighted TF-IDF scores ('classifier', DecisionTreeClassifier(min_samples_split=20, random_state=99) ), # train on TF-IDF vectors w/ DecisionTree classifier ]) print("pipeline:", [name for name, _ in dt_old.steps]) print("-- 10-fold cross-validation , without any grid search") dt_old.fit(msg_train, label_train) scores = cross_val_score(dt_old, msg_train, label_train, cv=10) print "mean: {:.3f} (std: {:.3f})".format(scores.mean(), scores.std()) from sklearn.externals.six import StringIO import pydot dot_data = StringIO() classes = ["ham", "spam"] vocab = dt_old.named_steps['bow'].get_feature_names() vocab1 = [v.encode('ascii', 'ignore') for v in vocab] # print "vocab: ", vocab1 with open("./plots/heme.dot", "w") as f: export_graphviz(dt_old.named_steps['classifier'], out_file=f, max_depth=13, feature_names=vocab1) print("Creating a visualization of decision tree") # graph = pydot.graph_from_dot_data(dot_data.getvalue()) # graph.write_pdf("./plots/heme.pdf") print "\nScore in 20% of test dataset" test_predictions = dt_old.predict(msg_test) print 'accuracy', accuracy_score(label_test, test_predictions) print 'confusion matrix\n', confusion_matrix(label_test, test_predictions) print '(row=expected, col=predicted)' print classification_report(label_test, test_predictions)
def main(): print("Naive-Bayes Approach") print "Generating messages ..." feature_vector = FeatureVector(SMS_COLLECTION) feature_vector.data_process(sep='\t') messages = feature_vector.messages print "Splitting into train and cross-validation sets ..." msg_train, msg_test, label_train, label_test = train_test_split( messages['message'], messages['label'], test_size=0.2) print len(msg_train), len(msg_test), len(msg_train) + len(msg_test) print msg_train.shape, msg_test.shape print "Creating Pipeline for the analyzing and training ..." pipeline = Pipeline([ ('bow', CountVectorizer( analyzer=split_into_lemmas)), # strings to token integer counts ('tfidf', TfidfTransformer()), # integer counts to weighted TF-IDF scores ('classifier', MultinomialNB()), # train on TF-IDF vectors w/ Naive Bayes classifier ]) print(pipeline) curve = plot_learning_curve(pipeline, "accuracy vs. training set size", msg_train, label_train, cv=5) curve.savefig("./plots/acc-vs-trainSize_naive.png") pipeline.fit(msg_train, label_train) #trained here print "Score in 20% of test dataset" test_predictions = pipeline.predict(msg_test) print 'accuracy', accuracy_score(label_test, test_predictions) print 'confusion matrix\n', confusion_matrix(label_test, test_predictions) print '(row=expected, col=predicted)' print classification_report(label_test, test_predictions)