def auto():
    """End-to-end automatic run: load the BBC data, build per-topic
    vectorizers from the training split, and evaluate on the test split."""
    corpus = get_data(bbc_dataset)
    train_split, test_split = divide_data(corpus)
    # Label encoder is fit on the topic names (the dict keys).
    label_encoder = get_labels(list(corpus.keys()))
    topic_vectorizers = create_vectorizers(train_split)
    features, targets = create_dataset_auto(test_split, label_encoder, topic_vectorizers)
    evaluate_auto(features, targets, label_encoder)
def main():
    """Train and evaluate an SVM topic classifier on the BBC dataset.

    Loads the corpus, encodes topic labels, splits into train/test,
    fits a TF-IDF-style vectorizer on the training text, trains an SVM,
    persists it to disk, and reports evaluation metrics plus a prediction
    for one new example.
    """
    data_dict = get_data(bbc_dataset)
    le = get_labels(list(data_dict.keys()))
    df = create_dataset(data_dict, le)
    (X_train, X_test, y_train, y_test) = split_dataset(df, 'text', 'label')
    vectorizer = create_and_fit_vectorizer(X_train)
    # .toarray() yields a plain ndarray; .todense() returns np.matrix,
    # which recent scikit-learn versions reject as estimator input.
    X_train = vectorizer.transform(X_train).toarray()
    X_test = vectorizer.transform(X_test).toarray()
    clf = train_svm_classifier(X_train, y_train)
    # Context manager guarantees the file handle is closed after dumping.
    with open("ch4/bbc_svm.pkl", "wb") as f:
        pickle.dump(clf, f)
    # To reuse a previously trained model instead of retraining:
    #   with open("ch4/bbc_svm.pkl", "rb") as f:
    #       clf = pickle.load(f)
    evaluate(clf, X_test, y_test, le)
    test_new_example(new_example, clf, vectorizer, le)
def main():
    """Cluster the BBC articles with KMeans and report cluster quality.

    Builds a vectorizer on the combined training text, fits a 5-cluster
    KMeans model, evaluates predictions on the held-out test split, prints
    the most common words per cluster, and persists the model.
    """
    data_dict = get_data(bbc_dataset)
    (train_dict, test_dict) = divide_data(data_dict)
    # Flatten the per-topic lists into single train/test corpora.
    all_training = []
    all_test = []
    for topic in train_dict.keys():
        all_training = all_training + train_dict[topic]
    for topic in test_dict.keys():
        # BUG FIX: the test corpus was previously built from train_dict,
        # so "test" evaluation silently reused training documents.
        all_test = all_test + test_dict[topic]
    vectorizer = create_vectorizer(all_training)
    matrix = vectorizer.transform(all_training)
    # One cluster per BBC topic; fixed random_state for reproducibility.
    km = KMeans(n_clusters=5, init='k-means++', random_state=0)
    km.fit(matrix)
    predicted_data = make_predictions(test_dict, vectorizer, km)
    print_report(predicted_data)
    print_most_common_words_by_cluster(all_training, km)
    # Context manager guarantees the file handle is closed after dumping.
    with open("ch4/bbc_kmeans.pkl", "wb") as f:
        pickle.dump(km, f)
def main():
    """Load the BBC corpus, build the (X, y) dataset, and evaluate it."""
    corpus = get_data(bbc_dataset)
    features, labels = create_dataset(corpus)
    evaluate(features, labels)
def main():
    """Load the BBC corpus, encode its topic labels, and train a model."""
    corpus = get_data(bbc_dataset)
    # Label encoder is fit on the topic names (the dict keys).
    label_encoder = get_labels(list(corpus.keys()))
    frame = create_dataset(corpus, label_encoder)
    train_model(frame, label_encoder)