def main():
    """Train an AdaBoost classifier on bigram TF-IDF features and print test-set stats."""
    # Load data
    X_train, y_train, X_test, y_test = load_data.load_data()

    # Transform data into a vector of TF-IDF values
    count_vect = CountVectorizer(ngram_range=(1, 2))
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer(use_idf=True)
    X_train_dtm = tfidf_transformer.fit_transform(X_train_counts)

    # Transform test data.
    # BUG FIX: use transform(), not fit_transform() — refitting on the test
    # counts recomputes the IDF weights from the test set, so the test matrix
    # would be scaled inconsistently with the training matrix.
    X_test_counts = count_vect.transform(X_test)
    X_test_dtm = tfidf_transformer.transform(X_test_counts)

    # using default params
    clf = AdaBoostClassifier()
    clf.fit(X_train_dtm, y_train)
    y_pred_class = clf.predict(X_test_dtm)

    # utilities.print_misclassified_samples(X_test, y_pred_class, y_test)
    utilities.print_stats(y_pred_class, y_test)
def main():
    """Train a Multinomial Naive Bayes classifier on bigram TF-IDF features and print test-set stats."""
    # Load data
    X_train, y_train, X_test, y_test = load_data.load_data()

    # Transform data into a vector of TF-IDF values
    count_vect = CountVectorizer(ngram_range=(1, 2))
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer(use_idf=True)
    X_train_dtm = tfidf_transformer.fit_transform(X_train_counts)

    # Transform test data.
    # BUG FIX: use transform(), not fit_transform() — refitting on the test
    # counts recomputes the IDF weights from the test set, so the test matrix
    # would be scaled inconsistently with the training matrix.
    X_test_counts = count_vect.transform(X_test)
    X_test_dtm = tfidf_transformer.transform(X_test_counts)

    # Using optimal value of alpha obtained using GridSearchCV
    clf = MultinomialNB(alpha=0.02)
    clf.fit(X_train_dtm, y_train)
    y_pred_class = clf.predict(X_test_dtm)

    # utilities.print_misclassified_samples(X_test, y_pred_class, y_test)
    utilities.print_stats(y_pred_class, y_test)
def main():
    """Train an L1-penalized logistic regression on bigram TF-IDF features and print test-set stats."""
    # Load data
    X_train, y_train, X_test, y_test = load_data.load_data()

    # Transform data into a vector of TF-IDF values
    count_vect = CountVectorizer(ngram_range=(1, 2))
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer(use_idf=True)
    X_train_dtm = tfidf_transformer.fit_transform(X_train_counts)

    # Transform test data.
    # BUG FIX: use transform(), not fit_transform() — refitting on the test
    # counts recomputes the IDF weights from the test set, so the test matrix
    # would be scaled inconsistently with the training matrix.
    X_test_counts = count_vect.transform(X_test)
    X_test_dtm = tfidf_transformer.transform(X_test_counts)

    # Not optimized, probably need to test with l2 penalty also.
    # NOTE(review): solver pinned to 'liblinear' — it was the historical
    # default (so behavior is unchanged on old scikit-learn) and it is one
    # of the few solvers that supports penalty='l1'; the modern default
    # ('lbfgs') raises an error for L1.
    clf = LogisticRegression(penalty='l1', solver='liblinear')
    clf.fit(X_train_dtm, y_train)
    y_pred_class = clf.predict(X_test_dtm)

    # utilities.print_misclassified_samples(X_test, y_pred_class, y_test)
    utilities.print_stats(y_pred_class, y_test)
def main():
    """Fit several SVM variants on bigram TF-IDF features; print stats and timing for each."""
    start_time = time.time()

    # Load data
    X_train, y_train, X_test, y_test = load_data.load_data()

    # Transform data into a vector of TF-IDF values
    count_vect = CountVectorizer(ngram_range=(1, 2))
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer(use_idf=True)
    X_train_dtm = tfidf_transformer.fit_transform(X_train_counts)

    # Transform test data.
    # BUG FIX: use transform(), not fit_transform() — refitting on the test
    # counts recomputes the IDF weights from the test set, so the test matrix
    # would be scaled inconsistently with the training matrix.
    X_test_counts = count_vect.transform(X_test)
    X_test_dtm = tfidf_transformer.transform(X_test_counts)

    data_load_time = time.time() - start_time

    # Not optimized
    C = 1.0
    classifier_dict = {
        "SVC with linear kernel": svm.SVC(kernel='linear', C=C),
        "SVC with RBF kernel": svm.SVC(kernel='rbf', gamma=0.7, C=C),
        "SVC with polynomial (degree 3) kernel": svm.SVC(kernel='poly', degree=3, C=C),
        "LinearSVC (linear kernel)": svm.LinearSVC(C=C)
    }

    # items() instead of the Py2-only iteritems(), and single-argument
    # print(...) calls — both forms run identically under Python 2 and 3.
    for key, clf in classifier_dict.items():
        start_time = time.time()
        clf.fit(X_train_dtm, y_train)
        y_pred_class = clf.predict(X_test_dtm)
        end_time = time.time()
        print(key)
        # utilities.print_misclassified_samples(X_test, y_pred_class, y_test)
        utilities.print_stats(y_pred_class, y_test)
        # Report per-classifier fit+predict time plus the shared data-prep time.
        print("Execution time={0} sec \n".format(end_time - start_time + data_load_time))
# NOTE(review): this chunk starts mid-method — the lines below are the tail of
# a per-label probability loop (apparently inside classify_single_elem) whose
# header is outside this view; the nesting reconstructed here should be
# confirmed against the full file.
                if word in nb_dict_features.keys():
                    # Multiply in the relative frequency of the word for this label.
                    relative_word_occurence = nb_dict_features[word]
                    class_probability *= relative_word_occurence
                else:
                    # Word never seen for this label: the whole product collapses to 0.
                    class_probability *= 0
            Y_dict[label] = class_probability
        # Predicted label = the one with the highest accumulated probability.
        return self.get_max_value_key(Y_dict)

    def predict(self, X):
        """Classify every element of X and return the list of predicted labels.

        Also stores the predictions on self.predicted_Y_values as a side effect.
        """
        self.predicted_Y_values = []
        n = len(X)
        for ii in range(0, n):
            X_elem = X[ii]
            prediction = self.classify_single_elem(X_elem)
            self.predicted_Y_values.append(prediction)
        return self.predicted_Y_values


if __name__ == '__main__':
    # Load raw text data, then tokenize each document by whitespace in place.
    X_train, Y_train, X_test, Y_test = ld.load_data()
    start_time = time.time()
    for i in xrange(len(X_train)):  # Python 2 code path (xrange / print statement)
        X_train[i] = X_train[i].split()
    for i in xrange(len(X_test)):
        X_test[i] = X_test[i].split()
    # Train the hand-rolled Naive Bayes classifier and evaluate on the test set.
    nbc = NaiveBayesTextClassifier()
    nbc.train(X_train, Y_train)
    y_pred_class = nbc.predict(X_test)
    print 'Execution time={0} sec'.format(time.time() - start_time)
    utilities.print_stats(y_pred_class, Y_test)