def _get_preprocessed_test_data():
    """Return preprocessed data built from the 'recent' NVD feed.

    The feed is downloaded/updated, at most 500 CVEs are taken from it and
    run through the pipeline returned by ``get_preprocessing_pipeline()``.

    Note: used for tests only.

    :returns: whatever ``pipeline.fit_transform`` produces for the sample
    """
    from itertools import islice

    from nvdlib.nvd import NVD

    feed = NVD.from_feeds(feed_names=['recent'])
    # download and update
    feed.update()

    max_records = 500

    # only first n to speed up tests; islice simply stops when the feed holds
    # fewer records, whereas repeated next() calls would raise StopIteration
    data = list(islice(feed.cves(), max_records))

    pipeline = get_preprocessing_pipeline()
    # pipeline.steps is a sequence of (name, transformer) pairs; only the
    # names are needed to address fit parameters
    steps, _ = list(zip(*pipeline.steps))

    # set up fit parameters (see sklearn fit_params notation),
    # addressed to the third pipeline step
    fit_params = {
        "%s__feed_attributes" % steps[2]: ['description'],
        "%s__output_attributes" % steps[2]: ['label']
    }

    prep_data = pipeline.fit_transform(X=data, **fit_params)

    return prep_data
def _get_preprocessed_test_data():
    """Build a small preprocessed data set from the 'recent' NVD feed.

    Up to 500 CVEs are collected and pushed through the preprocessing
    pipeline configured with the 'project' and 'description' attributes.

    Note: used for tests only.
    """
    recent_feed = NVD.from_feeds(feed_names=['recent'])
    # download and update
    recent_feed.update()

    # collect the sample cves, stopping once the limit has been reached
    limit = 500
    sample = []
    for cve in recent_feed.cves():
        if len(sample) >= limit:
            break
        sample.append(cve)

    pipeline = get_preprocessing_pipeline(
        nvd_attributes=['project', 'description'])
    step_names, _ = list(zip(*pipeline.steps))
    target_step = step_names[2]

    # fit parameters are routed to the third step
    # (see sklearn fit_params notation)
    fit_params = {
        "%s__feed_attributes" % target_step: ['description'],
        "%s__output_attributes" % target_step: ['label']
    }

    return pipeline.fit_transform(X=sample, **fit_params)
Beispiel #3
0
    def setUpClass(cls):
        """Prepare preprocessed data and store it in ``cls.test_data``.

        At most 500 CVEs from the 'recent' NVD feed are run through the
        preprocessing pipeline (created with ``share_hooks=True``).
        """
        from itertools import islice

        feed = NVD.from_feeds(feed_names=['recent'])
        # download and update
        feed.update()

        max_records = 500

        # only first n to speed up tests; islice tolerates a feed with fewer
        # records, where repeated next() calls would raise StopIteration
        data = list(islice(feed.cves(), max_records))

        pipeline = get_preprocessing_pipeline(
            nvd_attributes=['project', 'description'],
            share_hooks=True
        )
        # only the step names are needed to address the fit parameters
        steps, _ = list(zip(*pipeline.steps))

        # set up fit parameters (see sklearn fit_params notation),
        # addressed to the third pipeline step
        fit_params = {
            "%s__feed_attributes" % steps[2]: ['description'],
            "%s__output_attributes" % steps[2]: ['label']
        }

        prep_data = pipeline.fit_transform(
            X=data,
            **fit_params
        )

        cls.test_data = prep_data
Beispiel #4
0
    def test_vendor_product_match_hook(self):
        """Test vendor_product_hook.

        Picks a CVE from the 'recent' NVD feed whose first configuration CPE
        is an application and checks the hook result for: a CVE with empty
        configurations, an unknown CVE id, a matching product, a matching
        vendor and a feature matching neither.
        """
        hook = feature_hooks.vendor_product_match_hook

        from nvdlib.nvd import NVD
        feed = NVD.from_feeds(['recent'])
        feed.update()

        # find application CPE; `cve` and `cpe` are only assigned together on
        # a confirmed match so that a fruitless search cannot leave behind the
        # last CVE paired with a CPE from some earlier one
        cve = cpe = None
        for candidate in feed.cves():
            try:
                candidate_cpe = candidate.configurations[0].cpe[0]
            except IndexError:
                # no configuration or no CPE on this CVE
                continue

            if candidate_cpe.is_application():
                cve, cpe = candidate, candidate_cpe
                break

        if cve is None:
            # raises the test's failure exception even when asserts are
            # stripped (python -O)
            self.fail("Failed to gather test data.")

        vendor, product = cpe.vendor[0], cpe.product[0]

        # mock CVE with empty configurations instead of searching it
        empty_cve = type('emptyCVE', (), {})
        empty_cve.configurations = []

        cve_dict = {cve.cve_id: cve, 'empty': empty_cve}

        # empty configurations
        features = [(product, 'NUM')]
        result = hook.__call__(features, 0, cve_dict, 'empty')

        self.assertFalse(result)

        # non existing ID
        result = hook.__call__(features, 0, cve_dict, 'non-existing-id')

        self.assertFalse(result)

        # matching product
        result = hook.__call__(features, 0, cve_dict, cve.cve_id)

        self.assertTrue(result)

        # matching vendor
        features = [(vendor, 'NUM')]
        result = hook.__call__(features, 0, cve_dict, cve.cve_id)

        self.assertTrue(result)

        # neither of vendor and product match
        features = [('mismatch', 'NUM')]
        result = hook.__call__(features, 0, cve_dict, cve.cve_id)

        self.assertFalse(result)
def main(argv):
    """Fetch the requested NVD feeds and fit the training pipeline on them.

    :param argv: command line arguments, handed over to ``parse_args``
    :raises NotImplementedError: when the ``csv`` option is set
    """
    args = parse_args(argv=argv)

    if args.csv:
        # TODO
        raise NotImplementedError("The feature has not been implemented yet."
                                  " Sorry for the inconvenience.")

    print("Getting NVD Feed...")
    nvd_feed = NVD.from_feeds(feed_names=args.nvd_feeds)
    nvd_feed.update()
    data = list(nvd_feed.cves())  # materialize the generator

    # index the CVEs by their id
    cve_dict = {}
    for cve in data:
        cve_dict[cve.cve_id] = cve

    # the vendor-product feature hook receives the index as default argument
    feature_hooks.vendor_product_match_hook.default_kwargs = dict(
        cve_dict=cve_dict)

    # assemble the pipeline step by step, in execution order
    feed_step = (
        'nvd_feed_preprocessor',
        preprocessing.NVDFeedPreprocessor(
            attributes=['cve_id', 'description']),
    )
    label_hook = transformers.Hook(
        key='label_hook', reuse=True, func=utils.find_)
    label_step = (
        'label_preprocessor',
        preprocessing.LabelPreprocessor(
            feed_attributes=['project', 'description'],
            output_attributes=['cve_id', 'description'],
            hook=label_hook),
    )
    nltk_step = (
        'nltk_preprocessor',
        preprocessing.NLTKPreprocessor(
            feed_attributes=['description'],
            output_attributes=['cve_id', 'label']),
    )
    extractor_step = (
        'feature_extractor',
        transformers.FeatureExtractor(feature_hooks=FEATURE_HOOKS,
                                      share_hooks=True),
    )
    classifier_step = ('classifier', transformers.NBClassifier())

    training_pipeline = Pipeline(steps=[
        feed_step,
        label_step,
        nltk_step,
        extractor_step,
        classifier_step,
    ])

    start_time = time()
    print("Training started")

    # the elapsed time is reported even when fitting raises
    try:
        classifier = training_pipeline.fit_transform(X=data)
    finally:
        print(f"Training finished in {time() - start_time} seconds")

    if args.export:
        classifier.export(args.export_dir)
    def test_nvd_to_dataframe(self):
        """Check that CVEs of an NVD feed convert to a pandas.DataFrame."""
        from pandas import DataFrame

        recent_feed = NVD.from_feeds(['recent'])
        cves = list(recent_feed.cves())

        # conversion without a handler yields a DataFrame
        frame = utils.nvd_to_dataframe(cves)

        self.assertIsNotNone(frame)
        self.assertIsInstance(frame, DataFrame)

        # conversion with the GitHub handler is expected to raise
        # (the missing GitHub token being the anticipated cause)
        with self.assertRaises(StatusError):
            utils.nvd_to_dataframe(cves, handler=GitHubHandler)
Beispiel #7
0
def main():
    """Preprocess the NVD feed, train a classifier and optionally export it.

    Command line arguments are read from the module level parser. The
    process terminates with exit status 1 when preprocessing leaves no data.

    :raises NotImplementedError: when the ``csv`` option is set
    :raises SystemExit: when no data is left after preprocessing
    """
    args = __parser.parse_args()

    if args.csv:
        # TODO
        raise NotImplementedError("The feature has not been implemented yet."
                                  " Sorry for the inconvenience.")
    else:
        print("Getting NVD Feed...")
        feed = NVD.from_feeds(feed_names=args.nvd_feeds)
        feed.update()
        data = feed.cves()  # generator

    # fit and transform the data with the pre-processing pipeline
    print("Preprocessing...")
    prep_pipeline = pipelines.get_preprocessing_pipeline()
    # only the step names are needed to address the fit parameters
    steps, _ = list(zip(*prep_pipeline.steps))
    # fit parameters addressed to the third pipeline step
    fit_params = {
        "%s__feed_attributes" % steps[2]: ['description'],
        "%s__output_attributes" % steps[2]: ['label']
    }

    prep_data = prep_pipeline.fit_transform(X=data, **fit_params)
    print("Preprocessing done.")

    prep_data = np.array(prep_data)
    if prep_data.size == 0:
        print(
            "No data left after preprocessing. Check the data provided"
            " or modify preprocessing pipeline.",
            file=sys.stderr)
        # sys.exit instead of the builtin exit(): the latter is injected by
        # the site module and is not guaranteed to be available
        sys.exit(1)

    # split the data to features (first column) and labels (second column)
    features, labels = prep_data[:, 0], prep_data[:, 1]

    print("Training...")
    # fit and transform the data with the training pipeline
    train_pipeline = pipelines.get_training_pipeline(
        feature_hooks=FEATURE_HOOKS)

    classifier = train_pipeline.fit_transform(X=features, y=labels)
    print("Training done.")

    if args.export:
        classifier.export()
def _get_test_data(n_records=500):
    """Return a sample of CVEs from the 'recent' NVD feed.

    Note: used for tests only.

    :param n_records: maximum number of CVEs to return; fewer are returned
        when the feed does not hold that many
    :returns: list of CVE objects as yielded by ``feed.cves()``
    """
    from itertools import islice

    from nvdlib.nvd import NVD

    feed = NVD.from_feeds(feed_names=['recent'])
    # download and update
    feed.update()

    # only first n to speed up tests; islice stops quietly on a short feed,
    # whereas repeated next() calls would raise StopIteration. The count is
    # clamped at zero because islice rejects negative values, while the
    # previous range() based code returned an empty list for them.
    data = list(islice(feed.cves(), max(n_records, 0)))

    return data
    def setUpClass(cls):
        """Store labeled features extracted from the 'recent' NVD feed.

        The results are kept in ``cls.data`` and ``cls.labels``.
        """
        from nvdlib.nvd import NVD

        recent_feed = NVD.from_feeds(feed_names=['recent'])
        recent_feed.update()  # download and update

        # materialize every CVE of the feed
        cves = list(recent_feed.cves())

        cls.data, cls.labels = extract_labeled_features(
            data=cves,
            nvd_attributes=['project', 'description'],
            nltk_feed_attributes=['description'])
Beispiel #10
0
def _get_extracted_test_data():
    """Return labeled features extracted from the 'recent' NVD feed.

    At most 500 CVEs are taken from the feed and passed to
    ``extract_labeled_features``.

    Note: used for tests only.

    :returns: the ``(data, labels)`` pair
    """
    from itertools import islice

    from nvdlib.nvd import NVD

    feed = NVD.from_feeds(feed_names=['recent'])
    # download and update
    feed.update()

    max_records = 500

    # get the sample cves; islice stops quietly when the feed holds fewer
    # records, whereas repeated next() calls would raise StopIteration
    data = list(islice(feed.cves(), max_records))

    data, labels = extract_labeled_features(data=data,
                                            attributes=['description'])

    return data, labels
def _get_extracted_test_data():
    """Extract labeled features from every CVE of the 'recent' NVD feed.

    Note: used for tests only.
    """
    from nvdlib.nvd import NVD

    recent_feed = NVD.from_feeds(feed_names=['recent'])
    recent_feed.update()  # download and update

    # materialize the whole feed
    cves = list(recent_feed.cves())

    features, labels = extract_labeled_features(
        data=cves,
        nvd_attributes=['project', 'description'],
    )

    return features, labels
Beispiel #12
0
def _get_test_data(n_records=500):
    """Collect up to `n_records` CVEs from the 'recent' NVD feed.

    Note: used for tests only.
    """
    from nvdlib.nvd import NVD

    recent_feed = NVD.from_feeds(feed_names=['recent'])
    recent_feed.update()  # download and update

    # gather the sample cves, stopping once the limit has been reached
    sample = []
    for cve in recent_feed.cves():
        if len(sample) >= n_records:
            break
        sample.append(cve)

    return sample
def main():
    """Evaluate and/or cross-validate a restored classifier on NVD data.

    Command line arguments are read from the module level parser. Labeled
    features are extracted from the requested NVD feeds, the classifier is
    restored from ``args.path_to_classifier`` (relative to the current
    working directory) and scored on a random 80/20 train/test split.

    :raises NotImplementedError: when the ``csv`` option is set
    :raises SystemExit: when no features are left after preprocessing
    """
    args = __parser.parse_args()

    if args.csv:
        # TODO
        raise NotImplementedError("The feature has not been implemented yet."
                                  " Sorry for the inconvenience.")
    else:
        print("Getting NVD Feed...")
        feed = NVD.from_feeds(feed_names=args.nvd_feeds)
        feed.update()
        data = feed.cves()  # generator

    # extract labeled features with the pre-processing pipeline
    print("Preprocessing...")
    features, labels = pipelines.extract_labeled_features(
        data=data,
        feature_hooks=FEATURE_HOOKS,
        attributes=['description'],
    )
    print("Preprocessing done.")

    # Check the extracted features, not `data`: the latter is the raw feed
    # generator, which is always truthy, so testing it could never detect an
    # empty result.
    # NOTE(review): assumes `features` exposes `.shape` or supports len()
    if hasattr(features, 'shape'):
        n_samples = features.shape[0]
    else:
        n_samples = len(features)

    if n_samples == 0:
        print("No data left after preprocessing. Check the data provided"
              " or modify preprocessing pipeline.", file=sys.stderr)
        # sys.exit instead of the builtin exit(): the latter is injected by
        # the site module and is not guaranteed to be available
        sys.exit(1)

    path_to_classifier = os.path.join(os.getcwd(), args.path_to_classifier)
    classifier = classifiers.NBClassifier.restore(path_to_classifier)

    # noinspection PyPep8Naming
    X_train, X_test, y_train, y_test = train_test_split(  # pylint: disable=invalid-name
        features, labels,
        test_size=0.2,
        random_state=np.random.randint(0, 100),
        shuffle=True
    )

    if args.eval:
        score = classifier.evaluate(X_test, y_test, sample=True, n=args.num_candidates)

        print("Evaluation accuracy:", score)

    if args.cross_validate:
        score = classifiers.cross_validate(
            classifier,
            X_train,
            y_train,
            sample=True,
            n=args.num_candidates,
            folds=args.cross_validation_folds,
            shuffle=True
        )

        print("Cross-validation results:")
        print("-------------------------")
        print("\tIntermediate results:\n")
        print(
            "\n".join("\t\tFold {}: {}".format(fold, np.round(value, 2))
                      for fold, value in enumerate(score.values))
        )
        print("\tAccuracy: %.2f (+/- %.4f)" % (np.round(score.mean, 2), np.round(score.std * 2, 4)))