logging.info('Derived features shape:')
        logging.info(derived_features.shape)

        features_tfidf = pandas.DataFrame(tfidfX.todense())
        # Assign column names to make it easier to print most useful features later
        features_tfidf.columns = tfidf.get_feature_names()
        features_combined = pandas.concat([features_tfidf, derived_features],
                                          axis=1)

        logging.info('Combined features shape:')
        logging.info(features_combined.shape)

        svm_object = LinearSVC()
        clf = CalibratedClassifierCV(svm_object)
        classifier = balancer.OneVsRestClassifierBalance(clf)

        logging.info('Training classifier')
        classifier.fit(features_combined.values, labels_matrix)
        logging.info('Saving TFIDF vectorizer')
        joblib.dump(tfidf, vectorizer_filename)
        logging.info('Saving binarizer')
        joblib.dump(mlb, binarizer_filename)
        logging.info('Saving model')
        joblib.dump(classifier, model_filename)

        end = time.time()
        runtime_in_seconds = end - start
        logging.info('Processing completed in {0}'.format(runtime_in_seconds))
    except Error as e:
        logging.exception(e)
            url_corpus, heading_text_corpus, content_corpus)

        logging.info('Derived features shape:')
        logging.info(derived_features.shape)

        features_tfidf = pandas.DataFrame(tfidfX.todense())
        # Assign column names to make it easier to print most useful features later
        features_tfidf.columns = tfidf.get_feature_names()
        features_combined = pandas.concat([features_tfidf, derived_features],
                                          axis=1)

        logging.info('Combined features shape:')
        logging.info(features_combined.shape)

        classifiers_to_test = [
            (balancer.OneVsRestClassifierBalance(LinearSVC()), 'SVM (Linear)'),
            (balancer.OneVsRestClassifierBalance(RandomForestClassifier()),
             'Random Forest'),
            (balancer.OneVsRestClassifierBalance(GaussianNB()), 'Naive Bayes'),
            (balancer.OneVsRestClassifierBalance(LogisticRegression()),
             'Logistic Regression'),
            (balancer.OneVsRestClassifierBalance(KNeighborsClassifier()),
             'k-Nearest Neighbour')
        ]
        for classifier, classifier_name in classifiers_to_test:
            logging.info(f'Running experiment for {classifier_name}')
            logging.info('Getting per-class scores')
            y_pred = cross_val_predict(classifier,
                                       features_combined.values,
                                       labels_matrix,
                                       cv=10)
Example #3
0
            url_corpus, heading_text_corpus, content_corpus)

        logging.info('Derived features shape:')
        logging.info(derived_features.shape)

        features_tfidf = pandas.DataFrame(tfidfX.todense())
        # Assign column names to make it easier to print most useful features later
        features_tfidf.columns = tfidf.get_feature_names()
        features_combined = pandas.concat([features_tfidf, derived_features],
                                          axis=1)

        logging.info('Combined features shape:')
        logging.info(features_combined.shape)

        svm_object = LinearSVC()
        classifier = balancer.OneVsRestClassifierBalance(svm_object)

        logging.info('Computing overall results')
        scores_kappa = cross_val_score(classifier,
                                       features_combined.values,
                                       labels_matrix,
                                       cv=10,
                                       scoring=kappa_scorer()).mean()

        logging.info('Kappa : {0}'.format(scores_kappa))

        end = time.time()
        runtime_in_seconds = end - start
        logging.info('Processing completed in {0}'.format(runtime_in_seconds))
    except Error as e:
        logging.exception(e)