# Assumed module-level imports for the snippets below (scikit-learn 0.17-era API,
# matching the StratifiedKFold(labels, n_folds=...) and grid_scores_ usage):
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.cross_validation import train_test_split, StratifiedKFold, cross_val_score
from sklearn.grid_search import GridSearchCV
# FeatureVector, SMS_COLLECTION, split_into_lemmas, plot_learning_curve and
# test_bow_transformer are project-local helpers imported elsewhere in the repo
# (an illustrative sketch of split_into_lemmas appears at the end of this listing).


def main():
	feature_vector = FeatureVector(SMS_COLLECTION)
	feature_vector.data_process(sep='\t')
	messages = feature_vector.messages
	feature_vector.transformer()
	bow_transformer = feature_vector.bow_transformer
	messages_bow = feature_vector.messages_bow

	print "Describing the messages ..."
	print messages.groupby('label').describe()
	print 'sparse matrix shape:', messages_bow.shape
	print 'number of non-zeros:', messages_bow.nnz
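	# note: the figure below is 100 * nnz / (rows * cols), i.e. the percentage of cells in the bag-of-words matrix that are non-zero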
	print 'sparsity: %.2f%%' % (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))


	print "TF_IDF normalization ... "
	tfidf_transformer = TfidfTransformer().fit(messages_bow)
	messages_tfidf = tfidf_transformer.transform(messages_bow)
	print "transform all ham/spam ... ", messages_tfidf.shape
	spam_detector = MultinomialNB().fit(messages_tfidf, messages['label'])
	all_predictions = spam_detector.predict(messages_tfidf)
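	# note: the classifier is scored on the same messages it was trained on, so the numbers below are optimistic; the later examples hold out 20% of the data for testing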
	print 'accuracy', accuracy_score(messages['label'], all_predictions)
	print 'confusion matrix\n', confusion_matrix(messages['label'], all_predictions)
	print '(row=expected, col=predicted)'
	print classification_report(messages['label'], all_predictions)

	test_bow_transformer(messages, bow_transformer, tfidf_transformer, spam_detector)

Example #2
def main():
	print ("SVM Approach")
	print ("Generating messages ...")
	feature_vector = FeatureVector(SMS_COLLECTION)
	feature_vector.data_process(sep='\t')
	messages = feature_vector.messages

	print "Splitting into train and cross-validation sets ..."
	msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['label'], test_size=0.2)
	print len(msg_train), len(msg_test), len(msg_train) + len(msg_test)
	print msg_train.shape, msg_test.shape

	print "Creating Pipeline for the analyzing and training ..."
	pipeline = Pipeline([
	    ('bow', CountVectorizer(analyzer=split_into_lemmas)),  # strings to token integer counts
	    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
	    ('classifier', SVC()),  # train on TF-IDF vectors w/ an SVM classifier
	])

	# pipeline parameters to automatically explore and tune
	param_svm = [
	  {'classifier__C': [1, 10, 100, 1000], 'classifier__kernel': ['linear']},
	  {'classifier__C': [1, 10, 100, 1000], 'classifier__gamma': [0.001, 0.0001], 'classifier__kernel': ['rbf']},
	]
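	# 4 linear-kernel + 8 RBF-kernel settings = 12 candidate parameter combinations, each scored with the stratified 5-fold cross-validation configured below (60 fits in total)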
	print("pipeline:", [name for name, _ in pipeline.steps])
	for name, v in pipeline.steps:
		print name
		print v

	grid_svm = GridSearchCV(
	    pipeline,  	# pipeline from above
	    param_grid=param_svm,  # parameters to tune via cross validation
	    refit=True,  # fit using all data, on the best detected classifier
	    n_jobs=-1,  # number of cores to use for parallelization; -1 for "all cores"
	    scoring='accuracy',  # what score are we optimizing?
	    cv=StratifiedKFold(label_train, n_folds=5),  # what type of cross validation to use
	)
	svm_detector = grid_svm.fit(msg_train, label_train) # find the best combination from param_svm

	print "\nScores for various cases ..."
	for i in xrange(len(svm_detector.grid_scores_)):
		print svm_detector.grid_scores_[i]


	curve = plot_learning_curve(pipeline, "accuracy vs. training set size", msg_train, label_train, cv=5)
	curve.savefig("./plots/acc-vs-trainSize_SVM.png")
	pipeline.fit(msg_train, label_train)  # refit the default-parameter pipeline; the tuned svm_detector is what scores the test set below

	print "Score in 20% of test dataset"
	test_predictions = svm_detector.predict(msg_test)
	print 'accuracy', accuracy_score(label_test, test_predictions)
	print 'confusion matrix\n', confusion_matrix(label_test, test_predictions)
	print '(row=expected, col=predicted)'
	print classification_report(label_test, test_predictions)
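
Example #3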
def main():
    print("DecisionTree Approach")
    print("Generating messages ...")
    feature_vector = FeatureVector(SMS_COLLECTION)
    feature_vector.data_process(sep='\t')
    messages = feature_vector.messages

    print "Splitting into train and cross-validation sets ..."
    msg_train, msg_test, label_train, label_test = train_test_split(
        messages['message'], messages['label'], test_size=0.2)
    print len(msg_train), len(msg_test), len(msg_train) + len(msg_test)
    print msg_train.shape, msg_test.shape

    print "\nCreating Pipeline for the analyzing and training ..."
    dt_old = Pipeline([
        ('bow', CountVectorizer(analyzer=split_into_lemmas)),  # strings to token integer counts
        ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
        ('classifier', DecisionTreeClassifier(min_samples_split=20, random_state=99)),  # train on TF-IDF vectors w/ DecisionTree classifier
    ])
    print("pipeline:", [name for name, _ in dt_old.steps])
    print("-- 10-fold cross-validation , without any grid search")
    dt_old.fit(msg_train, label_train)
    scores = cross_val_score(dt_old, msg_train, label_train, cv=10)
    print "mean: {:.3f} (std: {:.3f})".format(scores.mean(), scores.std())

    from sklearn.externals.six import StringIO
    import pydot

    dot_data = StringIO()
    classes = ["ham", "spam"]  # class names for the exported tree
    vocab = dt_old.named_steps['bow'].get_feature_names()
    vocab1 = [v.encode('ascii', 'ignore') for v in vocab]
    # print "vocab: ", vocab1
    print("Creating a visualization of the decision tree")
    # Export the fitted tree into dot_data, then persist it as ./plots/heme.dot;
    # the commented pydot lines below can render the same buffer to PDF.
    export_graphviz(dt_old.named_steps['classifier'],
                    out_file=dot_data,
                    max_depth=13,
                    feature_names=vocab1,
                    class_names=classes)
    with open("./plots/heme.dot", "w") as f:
        f.write(dot_data.getvalue())
    # graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # graph.write_pdf("./plots/heme.pdf")

    print "\nScore in 20% of test dataset"
    test_predictions = dt_old.predict(msg_test)
    print 'accuracy', accuracy_score(label_test, test_predictions)
    print 'confusion matrix\n', confusion_matrix(label_test, test_predictions)
    print '(row=expected, col=predicted)'
    print classification_report(label_test, test_predictions)

Example #4
def main():
    print("Naive-Bayes Approach")
    print "Generating messages ..."
    feature_vector = FeatureVector(SMS_COLLECTION)
    feature_vector.data_process(sep='\t')
    messages = feature_vector.messages

    print "Splitting into train and cross-validation sets ..."
    msg_train, msg_test, label_train, label_test = train_test_split(
        messages['message'], messages['label'], test_size=0.2)
    print len(msg_train), len(msg_test), len(msg_train) + len(msg_test)
    print msg_train.shape, msg_test.shape

    print "Creating Pipeline for the analyzing and training ..."
    pipeline = Pipeline([
        ('bow', CountVectorizer(analyzer=split_into_lemmas)),  # strings to token integer counts
        ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
        ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
    ])
    print(pipeline)
    curve = plot_learning_curve(pipeline,
                                "accuracy vs. training set size",
                                msg_train,
                                label_train,
                                cv=5)
    curve.savefig("./plots/acc-vs-trainSize_naive.png")
    pipeline.fit(msg_train, label_train)  # train the pipeline on the 80% training split

    print "Score in 20% of test dataset"
    test_predictions = pipeline.predict(msg_test)
    print 'accuracy', accuracy_score(label_test, test_predictions)
    print 'confusion matrix\n', confusion_matrix(label_test, test_predictions)
    print '(row=expected, col=predicted)'
    print classification_report(label_test, test_predictions)
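

# --- Assumed helper (illustrative sketch only) -------------------------------
# Every pipeline above passes a project-local `split_into_lemmas` function to
# CountVectorizer as its analyzer, but its definition is not part of this
# listing. The version below is only a guess at a typical implementation
# (TextBlob-based lemmatization) so that the snippets can run stand-alone;
# swap in the repo's real helper where available.
from textblob import TextBlob  # assumption: TextBlob is installed


def split_into_lemmas(message):
    """Lowercase a message and return its list of lemmas (base word forms)."""
    words = TextBlob(message.lower()).words
    return [word.lemma for word in words]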