# Example #1
    def test_prediction_pipeline(self):
        """Test that the prediction pipeline yields candidate-probability pairs."""
        train_data, _ = pipelines.extract_labeled_features(
            self.test_data,
            nvd_attributes=['project', 'description'],
            nltk_feed_attributes=['description'])

        classifier = classifiers.NBClassifier().fit(train_data)

        # three identical queries to predict on
        query = 'Sample project name prediction'
        pred_data = [query] * 3

        pred_pipeline = pipelines.get_prediction_pipeline(classifier)

        num_candidates = 3
        predictions = pred_pipeline.fit_predict(
            pred_data,
            classifier__n=num_candidates,
            classifier__sample=True)

        self.assertIsNotNone(predictions)
        # one row of candidates per query; each candidate paired with a probability
        self.assertEqual(predictions.shape[1], num_candidates)
        self.assertEqual(predictions.shape[-1], 2)  # candidate - proba
    def test_evaluation(self):
        """Test evaluation of extracted features."""
        featuresets, _ = pipelines.extract_labeled_features(
            data=_get_test_data(),
            attributes=['description'],
        )

        classifier = classifiers.NBClassifier().fit(featuresets)
        self.assertIsNotNone(classifier)

        # all-None labels can never match a prediction, so accuracy must be 0.0
        none_labels = [None] * len(featuresets)

        score = classifier.evaluate(featuresets, none_labels, sample=True)
        self.assertIsNotNone(score)
        self.assertEqual(score, 0.0)

        cv_score = classifiers.cross_validate(
            classifier, featuresets, none_labels, sample=True)
        self.assertIsNotNone(cv_score)
        self.assertEqual(cv_score.mean, 0.0)
# Example #3
    def test_extract_labeled_features(self):
        """Test labeled feature extraction."""
        result = pipelines.extract_labeled_features(
            data=self.test_data,
            nvd_attributes=['project', 'description'],
            nltk_feed_attributes=['description'])
        featuresets, labels = result

        # extraction should produce at least one non-empty feature set and label
        self.assertTrue(np.any(featuresets))
        self.assertTrue(any(labels))
    def test_extract_labeled_features(self):
        """Test labeled feature extraction."""
        result = pipelines.extract_labeled_features(
            data=_get_test_data(),
            attributes=['description'],
        )
        featuresets, labels = result

        # extraction should produce at least one truthy feature set and label
        self.assertTrue(any(featuresets))
        self.assertTrue(any(labels))
# Example #5
def _export_classifier():
    """Set up for unit tests by exporting classifier."""
    data, _ = pipelines.extract_labeled_features(
        data=_get_test_data(),
        nvd_attributes=['project', 'description'],
        nltk_feed_attributes=['description'])

    classifier = classifiers.NBClassifier().fit(data)

    # export into a fresh temporary directory; caller is responsible for cleanup
    export_dir = tempfile.mkdtemp(prefix='test_export_')

    return classifier.export(export_dir=export_dir)
    def setUpClass(cls):
        """Return preprocessed extracted labeled features."""
        from nvdlib.nvd import NVD

        # fetch the 'recent' NVD feed and make sure it is up to date
        feed = NVD.from_feeds(feed_names=['recent'])
        feed.update()

        # materialize every CVE record from the feed
        cves = list(feed.cves())

        cls.data, cls.labels = extract_labeled_features(
            data=cves,
            nvd_attributes=['project', 'description'],
            nltk_feed_attributes=['description'])
# Example #7
def _get_extracted_test_data():
    """Return preprocessed data.

    Note: used for tests only."""
    from nvdlib.nvd import NVD

    # download and refresh the 'recent' NVD feed
    feed = NVD.from_feeds(feed_names=['recent'])
    feed.update()

    # take a fixed-size sample of CVE records from the feed
    sample_size = 500
    cve_iter = feed.cves()
    sample = [next(cve_iter) for _ in range(sample_size)]

    # returns the (data, labels) pair produced by the extraction pipeline
    return extract_labeled_features(data=sample,
                                    attributes=['description'])
def _get_extracted_test_data():
    """Return preprocessed data.

    Note: used for tests only.
    """
    from nvdlib.nvd import NVD

    # download and refresh the 'recent' NVD feed
    feed = NVD.from_feeds(feed_names=['recent'])
    feed.update()

    # materialize every CVE record from the feed
    cves = list(feed.cves())

    # returns the (data, labels) pair produced by the extraction pipeline
    return extract_labeled_features(
        data=cves, nvd_attributes=['project', 'description'])
def main():
    """Restore a pre-trained classifier and evaluate / cross-validate it on NVD data."""
    args = __parser.parse_args()

    if args.csv:
        # TODO: support reading pre-exported CSV data
        raise NotImplementedError("The feature has not been implemented yet."
                                  " Sorry for the inconvenience.")
    else:
        print("Getting NVD Feed...")
        feed = NVD.from_feeds(feed_names=args.nvd_feeds)
        feed.update()
        data = feed.cves()  # generator

    # transform the data with the pre-processing pipeline
    print("Preprocessing...")
    features, labels = pipelines.extract_labeled_features(
        data=data,
        feature_hooks=FEATURE_HOOKS,
        attributes=['description'],
    )
    print("Preprocessing done.")

    # FIX: the original checked `if not data:` — but `data` is a generator
    # (always truthy) that has already been consumed by the extraction above,
    # so the guard never fired. Check the extracted feature set instead.
    if features is None or len(features) == 0:
        print("No data left after preprocessing. Check the data provided"
              " or modify preprocessing pipeline.", file=sys.stderr)
        sys.exit(1)  # sys.exit instead of the interactive-only exit() helper

    path_to_classifier = os.path.join(os.getcwd(), args.path_to_classifier)
    classifier = classifiers.NBClassifier.restore(path_to_classifier)

    # hold out 20% of the data for evaluation
    # noinspection PyPep8Naming
    X_train, X_test, y_train, y_test = train_test_split(  # pylint: disable=invalid-name
        features, labels,
        test_size=0.2,
        random_state=np.random.randint(0, 100),
        shuffle=True
    )

    if args.eval:
        score = classifier.evaluate(X_test, y_test, sample=True, n=args.num_candidates)

        print("Evaluation accuracy:", score)

    if args.cross_validate:
        score = classifiers.cross_validate(
            classifier,
            X_train,
            y_train,
            sample=True,
            n=args.num_candidates,
            folds=args.cross_validation_folds,
            shuffle=True
        )

        print("Cross-validation results:")
        print("-------------------------")
        print("\tIntermediate results:\n")
        print(
            "\n".join("\t\tFold {}: {}".format(fold, np.round(value, 2))
                      for fold, value in enumerate(score.values))
        )
        print("\tAccuracy: %.2f (+/- %.4f)" % (np.round(score.mean, 2), np.round(score.std * 2, 4)))