def pickle_labeled_sentiment_dataset():
    """
    Build the labeled sentiment dataset and pickle it to disk.

    The object returned by ``get_labeled_sentiment_dataset()`` is written to
    ``data/pickles/sentiment_data_labeled_v1.pickle`` under the project root.

    :return: None
    """
    dataset_with_label = get_labeled_sentiment_dataset()
    # Resolve the project root through ``utils``, like the other helpers in
    # this file do.
    # NOTE(review): get_labeled_sentiment_data() reads
    # 'sentiment_data_labeled.pickle' (no '_v1' suffix) -- confirm which file
    # name is the intended one.
    pickle_as = os.path.join(utils.get_project_root(),
                             'data/pickles/sentiment_data_labeled_v1.pickle')
    with open(pickle_as, 'wb') as f:
        pickle.dump(dataset_with_label, f)
def get_labeled_sentiment_data():
    """
    Read the labeled sentiment data back from its pickle file.

    :return: the object stored in
             ``data/pickles/sentiment_data_labeled.pickle`` under the
             project root.
    """
    project_root = utils.get_project_root()
    source_path = os.path.join(project_root,
                               'data/pickles/sentiment_data_labeled.pickle')
    # pickle.load can execute arbitrary code; only trusted files belong here.
    with open(source_path, 'rb') as pickled:
        labeled_data = pickle.load(pickled)
    return labeled_data
def pickle_word_features(num_feat=5000):
    """
    Pickle the first ``num_feat`` words of the training-data frequency dict.

    The word dictionary is built by ``get_word_freq_dict`` from the default
    training data path, and the first ``num_feat`` keys are saved as a list.

    :param num_feat: maximum number of words to keep, defaulted to 5k.
    :return: None; the list is written to
             ``data/pickles/word_features_5k.pickle``.
    """
    default_training_data = utils.get_training_data_path()
    word_dict = get_word_freq_dict(default_training_data)
    # NOTE(review): this keeps the first ``num_feat`` keys in the dict's own
    # iteration order; whether that is "most frequent first" depends on
    # get_word_freq_dict -- confirm.
    features = list(word_dict.keys())[:num_feat]
    # The file name is fixed to '..._5k' regardless of ``num_feat`` because
    # get_word_features() loads exactly this path.
    pickle_as = os.path.join(utils.get_project_root(),
                             'data/pickles/word_features_5k.pickle')
    with open(pickle_as, 'wb') as f:
        pickle.dump(features, f)
def pickle_feature_sets():
    """
    Build feature sets from the labeled dataset, shuffle them and pickle them.

    Every ``(text, sentiment)`` pair from ``get_labeled_sentiment_dataset()``
    is turned into ``(find_features(text), sentiment)``. The resulting list is
    shuffled in place and written to ``data/pickles/feature_sets.pickle``
    under the project root.

    :return: None
    """
    dataset_with_label = get_labeled_sentiment_dataset()
    # a list of tuples: e.g. [({'good': True, 'silly': False, ...}, 'pos'),
    #                         ({'good': False, 'silly': True, ...}, 'neg'), (...)]
    feature_sets = [(find_features(text), sentiment)
                    for (text, sentiment) in dataset_with_label]
    # Shuffling uses the global ``random`` state, so the order differs per run
    # unless the caller seeds it.
    random.shuffle(feature_sets)
    # Resolve the project root through ``utils``, like the other helpers in
    # this file do.
    pickle_as = os.path.join(utils.get_project_root(),
                             'data/pickles/feature_sets.pickle')
    with open(pickle_as, 'wb') as f:
        pickle.dump(feature_sets, f)
def get_word_features():
    """
    Load the pre-pickled list of word features.

    Takes no arguments: the data is always read from
    ``data/pickles/word_features_5k.pickle`` under the project root, which is
    the file written by ``pickle_word_features``.

    :return: the unpickled object, expected to be a list of feature words.
    """
    pickle_file = os.path.join(utils.get_project_root(),
                               'data/pickles/word_features_5k.pickle')
    with open(pickle_file, 'rb') as f:
        return pickle.load(f)
def train_mnb_clf(training_set, testing_set):
    """
    Fit a MultinomialNB model wrapped in SklearnClassifier, print its
    accuracy on ``testing_set`` and pickle the trained classifier.

    Accuracy noted by the original author: 73.28

    :param training_set: data passed to the classifier's ``train`` method.
    :param testing_set: data passed to ``classify.accuracy`` for scoring.
    :return: None; the classifier is saved to
             ``data/classifiers/mnb_classifier_5k.pickle``.
    """
    clf = SklearnClassifier(MultinomialNB())
    clf.train(training_set)

    accuracy_pct = classify.accuracy(clf, testing_set) * 100
    print("Multinomial NB Classifier accuracy:", accuracy_pct)

    target_path = os.path.join(utils.get_project_root(),
                               'data/classifiers/mnb_classifier_5k.pickle')
    with open(target_path, 'wb') as out_file:
        pickle.dump(clf, out_file)
def get_feature_sets():
    """
    Read the pre-pickled feature sets from disk.

    Expected structure of the stored data, a list of
    ``(features, label)`` tuples, e.g.
        [({'good': True, 'silly': False, ...}, 'pos'),
         ({'good': False, 'silly': True, ...}, 'neg'), (...)]

    :return: the object stored in ``data/pickles/feature_sets.pickle``.
    """
    project_root = utils.get_project_root()
    source_path = os.path.join(project_root, 'data/pickles/feature_sets.pickle')
    with open(source_path, 'rb') as pickled:
        feature_sets = pickle.load(pickled)
    return feature_sets
def train_linear_svc_clf(training_set, testing_set):
    """
    Fit a LinearSVC model wrapped in SklearnClassifier, print its accuracy
    on ``testing_set`` and pickle the trained classifier.

    Accuracy noted by the original author: 72.01

    :param training_set: data passed to the classifier's ``train`` method.
    :param testing_set: data passed to ``classify.accuracy`` for scoring.
    :return: None; the classifier is saved to
             ``data/classifiers/linear_svc_classifier_5k.pickle``.
    """
    clf = SklearnClassifier(LinearSVC())
    clf.train(training_set)

    accuracy_pct = classify.accuracy(clf, testing_set) * 100
    print("LinearSVC Classifier accuracy:", accuracy_pct)

    target_path = os.path.join(
        utils.get_project_root(),
        'data/classifiers/linear_svc_classifier_5k.pickle',
    )
    with open(target_path, 'wb') as out_file:
        pickle.dump(clf, out_file)
def train_bernoulli_nb_clf(training_set, testing_set):
    """
    Fit a BernoulliNB model wrapped in SklearnClassifier, print its accuracy
    on ``testing_set`` and pickle the trained classifier.

    Accuracy noted by the original author: 74.64

    :param training_set: data passed to the classifier's ``train`` method.
    :param testing_set: data passed to ``classify.accuracy`` for scoring.
    :return: None; the classifier is saved to
             ``data/classifiers/bernoulli_nb_classifier_5k.pickle``.
    """
    clf = SklearnClassifier(BernoulliNB())
    clf.train(training_set)

    accuracy_pct = classify.accuracy(clf, testing_set) * 100
    print("Bernoulli NB Classifier accuracy:", accuracy_pct)

    target_path = os.path.join(
        utils.get_project_root(),
        'data/classifiers/bernoulli_nb_classifier_5k.pickle',
    )
    with open(target_path, 'wb') as out_file:
        pickle.dump(clf, out_file)
def train_naive_bayes_clf(training_set, testing_set):
    """
    Train a NaiveBayesClassifier, print its accuracy on ``testing_set``,
    show its 15 most informative features and pickle the trained classifier.

    Accuracy noted by the original author: 74.26

    :param training_set: data passed to ``NaiveBayesClassifier.train``.
    :param testing_set: data passed to ``classify.accuracy`` for scoring.
    :return: None; the classifier is saved to
             ``data/classifiers/naive_bayes_5k.pickle``.
    """
    clf = NaiveBayesClassifier.train(training_set)

    accuracy_pct = classify.accuracy(clf, testing_set) * 100
    print('Naive Bayes model accuracy:', accuracy_pct)
    clf.show_most_informative_features(15)

    target_path = os.path.join(utils.get_project_root(),
                               'data/classifiers/naive_bayes_5k.pickle')
    with open(target_path, 'wb') as out_file:
        pickle.dump(clf, out_file)
def train_logistic_regression_clf(training_set, testing_set):
    """
    Fit a LogisticRegression model wrapped in SklearnClassifier, print its
    accuracy on ``testing_set`` and pickle the trained classifier.

    Accuracy noted by the original author: 74.59

    :param training_set: data passed to the classifier's ``train`` method.
    :param testing_set: data passed to ``classify.accuracy`` for scoring.
    :return: None; the classifier is saved to
             ``data/classifiers/logistic_regression_classifier_5k.pickle``.
    """
    clf = SklearnClassifier(LogisticRegression())
    clf.train(training_set)

    accuracy_pct = classify.accuracy(clf, testing_set) * 100
    print('Logistic Regression Classifier accuracy:', accuracy_pct)

    target_path = os.path.join(
        utils.get_project_root(),
        'data/classifiers/logistic_regression_classifier_5k.pickle',
    )
    with open(target_path, 'wb') as out_file:
        pickle.dump(clf, out_file)
def load_naive_bayes_clf():
    """
    Load the pickled naive Bayes classifier from disk.

    :return: the object stored in
             ``data/classifiers/naive_bayes_5k.pickle`` under the project root.
    """
    project_root = utils.get_project_root()
    clf_path = os.path.join(project_root, 'data/classifiers/naive_bayes_5k.pickle')
    # pickle.load can execute arbitrary code; only trusted files belong here.
    with open(clf_path, 'rb') as pickled:
        classifier = pickle.load(pickled)
    return classifier
def load_linear_svc_clf():
    """
    Load the pickled LinearSVC classifier from disk.

    :return: the object stored in
             ``data/classifiers/linear_svc_classifier_5k.pickle`` under the
             project root.
    """
    project_root = utils.get_project_root()
    clf_path = os.path.join(project_root,
                            'data/classifiers/linear_svc_classifier_5k.pickle')
    # pickle.load can execute arbitrary code; only trusted files belong here.
    with open(clf_path, 'rb') as pickled:
        classifier = pickle.load(pickled)
    return classifier
def load_logistic_regression_clf():
    """
    Load the pickled logistic regression classifier from disk.

    :return: the object stored in
             ``data/classifiers/logistic_regression_classifier_5k.pickle``
             under the project root.
    """
    project_root = utils.get_project_root()
    clf_path = os.path.join(
        project_root,
        'data/classifiers/logistic_regression_classifier_5k.pickle',
    )
    # pickle.load can execute arbitrary code; only trusted files belong here.
    with open(clf_path, 'rb') as pickled:
        classifier = pickle.load(pickled)
    return classifier
def load_bernoulli_nb_clf():
    """
    Load the pickled Bernoulli naive Bayes classifier from disk.

    :return: the object stored in
             ``data/classifiers/bernoulli_nb_classifier_5k.pickle`` under the
             project root.
    """
    project_root = utils.get_project_root()
    clf_path = os.path.join(project_root,
                            'data/classifiers/bernoulli_nb_classifier_5k.pickle')
    # pickle.load can execute arbitrary code; only trusted files belong here.
    with open(clf_path, 'rb') as pickled:
        classifier = pickle.load(pickled)
    return classifier