예제 #1
0
def _svc_vectorizers():
	"""Build fresh count vectorizers for the three feature sets.

	Returns a list of (label, vectorizer) pairs in the order they are run:
	unigram counts capped at the 700 most frequent terms, then bigram
	counts, then trigram counts. New instances are created on every call so
	that each dataset fits its own vocabulary.
	"""
	return [
		('Most Common Features', CountVectorizer(
			analyzer="word", tokenizer=None, preprocessor=None,
			ngram_range=(1, 1), binary=False, strip_accents='unicode',
			max_features=700)),
		('BiGram', CountVectorizer(
			analyzer="word", tokenizer=None, preprocessor=None,
			ngram_range=(2, 2), strip_accents='unicode')),
		('TriGram', CountVectorizer(
			analyzer="word", tokenizer=None, preprocessor=None,
			ngram_range=(3, 3), strip_accents='unicode')),
	]


def _svc_print_evaluation(name, true_stars, predictions):
	"""Print precision, recall, F1, accuracy and a classification report.

	name        -- heading printed after "MODEL: ".
	true_stars  -- held-out star labels the predictions are scored against.
	predictions -- labels predicted for the same held-out samples.
	"""
	target_names = ['*', '**', '***', '****', '*****']

	print("MODEL: %s" % name)
	print("")

	# The star labels appear to be multiclass (five target names above).
	# scikit-learn's precision/recall/F1 default to average='binary', which
	# rejects multiclass targets; older releases fell back to weighted
	# averaging with a deprecation warning, so request it explicitly to get
	# the same numbers without the error.
	print('Precision: ' + str(metrics.precision_score(true_stars, predictions, average='weighted')))
	print('Recall: ' + str(metrics.recall_score(true_stars, predictions, average='weighted')))
	print('F1: ' + str(metrics.f1_score(true_stars, predictions, average='weighted')))
	print('Accuracy: ' + str(metrics.accuracy_score(true_stars, predictions)))

	print("")
	print('Classification Report:')
	print(classification_report(true_stars, predictions, target_names=target_names))


def _svc_run_experiment(title, suffix, reviews, stars):
	"""Train and evaluate the three linear SVC models on one dataset.

	title   -- banner text printed between the two hash-mark rules.
	suffix  -- dataset tag appended to every model name in the report.
	reviews -- review texts fed to the vectorizers.
	stars   -- star labels aligned with ``reviews``.

	The data is split 80/20 with a fixed random_state of 42. All three
	prediction arrays are printed first; the evaluation reports follow
	afterwards, in the same order.
	"""
	print("\n##############################################################")
	print("\n%s" % title)
	print("\n##############################################################\n")

	train_text, test_text, train_stars, test_stars = train_test_split(
		reviews, stars, test_size=0.20, random_state=42)

	results = []
	for label, vectorizer in _svc_vectorizers():
		# Fit the vocabulary on the training split only, then reuse it to
		# project the test split into the same feature space.
		train_features = vectorizer.fit_transform(train_text)
		test_features = vectorizer.transform(test_text)

		clf = SVC(C=1.0, kernel='linear')
		clf.fit(train_features, train_stars)
		predicted = clf.predict(test_features)
		print(predicted)

		results.append((label, predicted))

	for label, predicted in results:
		_svc_print_evaluation('%s SVC - %s' % (label, suffix), test_stars, predicted)


def SVCClassifier():
	"""Run the SVC experiments on the preprocessed and the balanced dataset.

	Calls prepareDataset(), which yields two objects exposing TEXT_REVIEW
	and STARS attributes (presumably pandas DataFrames -- confirm against
	prepareDataset). For each of them, linear SVCs are trained on unigram,
	bigram and trigram count features and their predictions and evaluation
	metrics are printed. Returns nothing.
	"""
	df, bal_df = prepareDataset()

	_svc_run_experiment("For preprocessed dataset - SVC", "preprocessed",
						df.TEXT_REVIEW, df.STARS)
	_svc_run_experiment("For balanced dataset - SVC", "balanced",
						bal_df.TEXT_REVIEW, bal_df.STARS)