logging.info('Derived features shape:') logging.info(derived_features.shape) features_tfidf = pandas.DataFrame(tfidfX.todense()) # Assign column names to make it easier to print most useful features later features_tfidf.columns = tfidf.get_feature_names() features_combined = pandas.concat([features_tfidf, derived_features], axis=1) logging.info('Combined features shape:') logging.info(features_combined.shape) svm_object = LinearSVC() clf = CalibratedClassifierCV(svm_object) classifier = balancer.OneVsRestClassifierBalance(clf) logging.info('Training classifier') classifier.fit(features_combined.values, labels_matrix) logging.info('Saving TFIDF vectorizer') joblib.dump(tfidf, vectorizer_filename) logging.info('Saving binarizer') joblib.dump(mlb, binarizer_filename) logging.info('Saving model') joblib.dump(classifier, model_filename) end = time.time() runtime_in_seconds = end - start logging.info('Processing completed in {0}'.format(runtime_in_seconds)) except Error as e: logging.exception(e)
url_corpus, heading_text_corpus, content_corpus) logging.info('Derived features shape:') logging.info(derived_features.shape) features_tfidf = pandas.DataFrame(tfidfX.todense()) # Assign column names to make it easier to print most useful features later features_tfidf.columns = tfidf.get_feature_names() features_combined = pandas.concat([features_tfidf, derived_features], axis=1) logging.info('Combined features shape:') logging.info(features_combined.shape) classifiers_to_test = [ (balancer.OneVsRestClassifierBalance(LinearSVC()), 'SVM (Linear)'), (balancer.OneVsRestClassifierBalance(RandomForestClassifier()), 'Random Forest'), (balancer.OneVsRestClassifierBalance(GaussianNB()), 'Naive Bayes'), (balancer.OneVsRestClassifierBalance(LogisticRegression()), 'Logistic Regression'), (balancer.OneVsRestClassifierBalance(KNeighborsClassifier()), 'k-Nearest Neighbour') ] for classifier, classifier_name in classifiers_to_test: logging.info(f'Running experiment for {classifier_name}') logging.info('Getting per-class scores') y_pred = cross_val_predict(classifier, features_combined.values, labels_matrix, cv=10)
url_corpus, heading_text_corpus, content_corpus) logging.info('Derived features shape:') logging.info(derived_features.shape) features_tfidf = pandas.DataFrame(tfidfX.todense()) # Assign column names to make it easier to print most useful features later features_tfidf.columns = tfidf.get_feature_names() features_combined = pandas.concat([features_tfidf, derived_features], axis=1) logging.info('Combined features shape:') logging.info(features_combined.shape) svm_object = LinearSVC() classifier = balancer.OneVsRestClassifierBalance(svm_object) logging.info('Computing overall results') scores_kappa = cross_val_score(classifier, features_combined.values, labels_matrix, cv=10, scoring=kappa_scorer()).mean() logging.info('Kappa : {0}'.format(scores_kappa)) end = time.time() runtime_in_seconds = end - start logging.info('Processing completed in {0}'.format(runtime_in_seconds)) except Error as e: logging.exception(e)