def nltk_movie_review_accuracy(num_iterations):
    """Try different numbers of features and optimize the number of
    training iterations."""
    (training_documents, dev_documents, test_documents) = load_reviews()

    best_development_accuracy = 0.0
    best_num_features = 0
    best_classifier = None
    best_feature_set = None

    # Test different numbers of features.
    for n in [100, 1000, 10000]:
        print("Training with %d features..." % n)

        # Training set, built from the n most informative features.
        training_set = Dataset.from_document_collection(
            training_documents, num_features=n)
        # Development set, represented over the training feature set.
        development_set = Dataset.from_document_collection(
            dev_documents, feature_set=training_set.feature_set)

        # Train the classifier (Exercise 4). NOTE: the loop below assumes
        # Dataset exposes its instances as `instance_list` and that
        # PerceptronClassifier.update(instance) performs one perceptron
        # update; adapt these names to your implementation.
        classifier = PerceptronClassifier.from_dataset(training_set)
        for _ in range(num_iterations):
            for instance in training_set.instance_list:
                classifier.update(instance)

        # Accuracies of the classifier with n features.
        train_accuracy = classifier.test_accuracy(training_set)
        development_accuracy = classifier.test_accuracy(development_set)
        if development_accuracy > best_development_accuracy:
            best_development_accuracy = development_accuracy
            best_num_features = n
            best_classifier = classifier.copy()
            best_feature_set = training_set.feature_set
        print(
            "Classifier with %d features: \t Train Accuracy: %.4f \t Dev Accuracy: %.4f"
            % (n, train_accuracy, development_accuracy))

    print("Best number of features: %d " % best_num_features)
    print("Top features for positive class:")
    print(best_classifier.features_for_class(True))
    print("Top features for negative class:")
    print(best_classifier.features_for_class(False))

    # Compute the test score for the best setting.
    testing_set = Dataset.from_document_collection(
        test_documents, feature_set=best_feature_set)
    testing_accuracy = best_classifier.test_accuracy(testing_set)
    print("Test score for best setting: %.4f" % testing_accuracy)

    return best_development_accuracy, testing_accuracy
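# A minimal, self-contained sketch of the update step the training loop above
# relies on, assuming a binary perceptron with a single weight vector and
# sparse feature counts. All names here (perceptron_update_sketch, weights,
# instance_features) are hypothetical illustrations, not part of the
# exercise's PerceptronClassifier API.
def perceptron_update_sketch(weights, instance_features, true_label):
    """One binary perceptron update: score the instance with the current
    weights and, on a mistake, move the weights toward (positive class) or
    away from (negative class) the instance's features.

    weights: dict mapping feature -> float, mutated in place
    instance_features: dict mapping feature -> count
    true_label: True for the positive class, False for the negative class
    Returns True if the weights were changed, False otherwise.
    """
    score = sum(weights.get(f, 0.0) * v for f, v in instance_features.items())
    predicted_label = score >= 0.0
    if predicted_label == true_label:
        return False  # correct prediction: no update needed
    direction = 1.0 if true_label else -1.0
    for f, v in instance_features.items():
        weights[f] = weights.get(f, 0.0) + direction * v
    return True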
def setUp(self):
    # Two small training collections and their datasets; each training
    # dataset builds its own feature set from its documents.
    small_collection_train_1 = DocumentCollection.from_document_list(
        train_docs_1)
    self.small_dataset_train_1 = Dataset.from_document_collection(
        small_collection_train_1)
    small_collection_train_2 = DocumentCollection.from_document_list(
        train_docs_2)
    self.small_dataset_train_2 = Dataset.from_document_collection(
        small_collection_train_2)

    # Development and prediction datasets are represented over the
    # feature set of the first training dataset.
    small_collection_dev = DocumentCollection.from_document_list(dev_docs)
    self.small_dataset_dev = Dataset.from_document_collection(
        small_collection_dev,
        feature_set=self.small_dataset_train_1.feature_set)

    small_collection_pred_1 = DocumentCollection.from_document_list(
        pred_docs_1)
    self.small_dataset_pred_test_1 = Dataset.from_document_collection(
        small_collection_pred_1,
        feature_set=self.small_dataset_train_1.feature_set)
    small_collection_pred_2 = DocumentCollection.from_document_list(
        pred_docs_2)
    self.small_dataset_pred_test_2 = Dataset.from_document_collection(
        small_collection_pred_2,
        feature_set=self.small_dataset_train_1.feature_set)

    # Instance lists for testing the perceptron update: documents that
    # should leave the weights unchanged vs. documents that should
    # trigger an update.
    small_collection_no_update = DocumentCollection.from_document_list(
        no_update_docs)
    self.small_instance_list_no_update = [
        DataInstance.from_document(doc,
                                   self.small_dataset_train_1.feature_set)
        for doc in small_collection_no_update.all_documents()
    ]
    small_collection_do_update = DocumentCollection.from_document_list(
        do_update_docs)
    self.small_instance_list_do_update = [
        DataInstance.from_document(doc,
                                   self.small_dataset_train_1.feature_set)
        for doc in small_collection_do_update.all_documents()
    ]
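# A hedged sketch of how the no_update/do_update fixtures above might be
# exercised. It assumes PerceptronClassifier.update(instance) returns True
# iff the weights were changed, which the fixture names suggest but the
# source does not confirm; adjust the assertions to your update() contract.
def test_update_sketch(self):
    classifier = PerceptronClassifier.from_dataset(self.small_dataset_train_1)
    # Instances the classifier already labels correctly should not
    # trigger a weight update...
    for instance in self.small_instance_list_no_update:
        self.assertFalse(classifier.update(instance))
    # ...while misclassified instances should.
    for instance in self.small_instance_list_do_update:
        self.assertTrue(classifier.update(instance))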