def select(self):
    """Select relevant features for the model.

    Trains a gradient-boosted regressor on the full feature schema and
    keeps only the entries whose importance reaches ``self.threshold``.
    When ``self.force_base_entries`` is set, the base entries are kept
    regardless of their importance.

    Returns:
        The schema filtered down to the selected features.
    """
    schema = Utils.get_available_feature_schema(
        self.data, force_base_entries=self.force_base_entries)
    X = [
        Utils.get_element_feature(schema, event.details, event.date)
        for event in self.data
    ]
    Y = [
        Utils.get_target_function_value(self.data, event)
        for event in self.data
    ]
    gbm = lgb.LGBMRegressor(boosting_type='gbdt',
                            num_leaves=31,
                            max_depth=-1,
                            learning_rate=0.1,
                            n_estimators=100,
                            objective='regression')
    gbm.fit(X, Y)
    # Boolean mask over the schema: keep features whose importance reaches
    # the threshold.  (The previous code mutated the numpy importances
    # array in place, silently coercing True/False back to 1/0; a plain
    # list of booleans avoids that.)
    features = [imp >= self.threshold for imp in gbm.feature_importances_]
    if self.force_base_entries:
        # Base entries must always survive selection.
        features[schema.index('__days_diff')] = True
        features[schema.index('__ref_number')] = True
        features[schema.index('__cvss_expl')] = True
    return Utils.get_filtered_schema(schema, features)
def _get_generator(self, data_gen):
    """Yield an endless stream of ``(X, Y)`` batches of ``self.batch_size``.

    Args:
        data_gen: Either a pre-built ``(X, Y)`` tuple, or an iterable of
            events whose features and targets are computed on the fly.

    Yields:
        Tuples of ``(numpy array of features, numpy array of targets)``.
        Samples are reshuffled each time the data set is exhausted.
    """
    import random
    if isinstance(data_gen, tuple):
        X, Y = data_gen
    else:
        X = [
            np.array(
                Utils.get_element_feature(self.schema, event.details,
                                          event.date))
            for event in data_gen
        ]
        Y = [
            Utils.get_target_function_value(self.data, event)
            for event in data_gen
        ]
    gen_data = list(zip(X, Y))
    random.shuffle(gen_data)
    X, Y = zip(*gen_data)
    # BUG FIX: the wrap-around test used len(data_gen), which is 2 when a
    # pre-built (X, Y) tuple is passed, so only the first two samples were
    # ever served.  Compare against the actual number of samples instead.
    n_samples = len(X)
    i = 0
    while True:
        samples_X = []
        samples_Y = []
        for _ in range(self.batch_size):
            if i == n_samples:
                # Epoch exhausted: reshuffle and start over.
                i = 0
                gen_data = list(zip(X, Y))
                random.shuffle(gen_data)
                X, Y = zip(*gen_data)
            samples_X.append(X[i])
            samples_Y.append(Y[i])
            i += 1
        yield np.array(samples_X), np.array(samples_Y)
def learn_by_data(self):
    """Fit the underlying model on the stored data set.

    Raises:
        ValueError: If either the data or the schema is missing.
    """
    if self.data is None or self.schema is None:
        raise ValueError("You can't fit the model without having a data and schema")
    feature_rows = [
        Utils.get_element_feature(self.schema, event.details, event.date)
        for event in self.data
    ]
    if self.use_reduction:
        # Reduce feature dimensionality before fitting.
        self.pca.fit(feature_rows)
        feature_rows = self.pca.transform(feature_rows)
    target_values = [
        Utils.get_target_function_value(self.data, event)
        for event in self.data
    ]
    self.model.fit(feature_rows, target_values)
def get_perf(data,
             schema,
             n_splits=5,
             selection_method=ValidationMethod.ShuffleSplit,
             is_nn=False,
             epochs=100,
             batch_size=1):
    """Cross-validate a model over *data* and return averaged metrics.

    Args:
        data: The event history to validate on.
        schema: The feature schema used to vectorize events.
        n_splits: Number of cross-validation splits.
        selection_method: ``ValidationMethod.KFold`` or ``ShuffleSplit``.
        is_nn: When True validate a neural model, otherwise an SVR model.
        epochs, batch_size: Training parameters for the neural model.

    Returns:
        Dict of regression metrics, each averaged over the splits.
    """
    ret = {
        'exp_var': 0,
        'max_error': 0,
        'mean_abs_error': 0,
        'mean_squared_error': 0,
        'mean_squared_log_error': 0,
        'median_abs_error': 0,
        'r2': 0
    }
    X = [
        np.array(Utils.get_element_feature(schema, event.details,
                                           event.date)) for event in data
    ]
    Y = [Utils.get_target_function_value(data, event) for event in data]
    X = np.array(X)
    Y = np.array(Y)
    if selection_method == ValidationMethod.KFold:
        selector = KFold(n_splits=n_splits, shuffle=True)
    else:
        selector = ShuffleSplit(n_splits=n_splits,
                                test_size=.25,
                                random_state=0)
    for train_index, test_index in selector.split(X):
        X_train, X_test = X[train_index.astype(int)], X[test_index.astype(
            int)]
        y_train, y_test = Y[train_index.astype(int)], Y[test_index.astype(
            int)]
        if is_nn:
            model = TessNeuralModel(schema=schema,
                                    epochs=epochs,
                                    batch_size=batch_size)
        else:
            model = TessSVRModel(schema=schema)
        model.learn(X_train, y_train)
        partial_res = PerformanceValidator.get_perf_model(
            model, X_test, y_test)
        # Accumulate each metric across splits.
        ret = {k: ret.get(k, 0) + partial_res.get(k, 0) for k in ret.keys()}
    # BUG FIX: the average was computed with a hard-coded 5, which is wrong
    # whenever the caller passes a different n_splits.
    for key in ret.keys():
        ret[key] = ret[key] / n_splits
    return ret
def learn_by_data(self):
    """Train the neural model on the stored data via a batch generator.

    Raises:
        ValueError: If either the data or the schema is missing.
    """
    if self.data is None or self.schema is None:
        raise ValueError(
            "You can't fit the model without having a data and schema")
    # One epoch covers the whole data set, batch_size samples at a time.
    steps = len(self.data) // self.batch_size
    feature_rows = [
        Utils.get_element_feature(self.schema, event.details, event.date)
        for event in self.data
    ]
    if self.use_reduction:
        # Reduce feature dimensionality before training.
        self.pca.fit(feature_rows)
        feature_rows = self.pca.transform(feature_rows)
    target_values = [
        Utils.get_target_function_value(self.data, event)
        for event in self.data
    ]
    batch_source = self._get_generator((feature_rows, target_values))
    self.model.fit_generator(generator=batch_source,
                             epochs=self.epochs,
                             steps_per_epoch=steps)
def _select_schema(data, args):
    """Build the feature schema for *data*, optionally running selection.

    Shared by both CLI modes: returns the full available schema when
    --skip-selection is set, otherwise runs FeatureSelection with the
    configured threshold.
    """
    if args.skip_selection:
        return Utils.get_available_feature_schema(data)
    return FeatureSelection(data, threshold=args.ts).select()


def main():
    """Command-line entry point with two modes: ``evaluate`` and ``learn``."""
    if len(sys.argv) < 2 or sys.argv[1] not in ('evaluate', 'learn'):
        usage()
        sys.exit(1)
    mode = sys.argv[1]
    # Remove the mode so the per-mode parser only sees its own options.
    sys.argv.remove(mode)
    parser = getparser(mode)
    args = parser.parse_args()
    if mode == 'evaluate':
        if args.cm.lower() == 'shuffle':
            cross_mode = ValidationMethod.ShuffleSplit
        else:
            cross_mode = ValidationMethod.KFold
        print('Parsing data...')
        parser = HistoryParser(abspath(args.d))
        parser.load()
        print('Selecting features...')
        schema = _select_schema(parser.data, args)
        print('Starting validation...')
        print(
            PerformanceValidator.get_perf(parser.data,
                                          schema,
                                          selection_method=cross_mode,
                                          n_splits=5,
                                          is_nn=args.nn,
                                          epochs=args.e,
                                          batch_size=args.bs))
    elif mode == 'learn':
        parser = HistoryParser(abspath(args.d))
        parser.load()
        schema = _select_schema(parser.data, args)
        if args.nn:
            model = TessNeuralModel(parser.data,
                                    schema,
                                    epochs=args.e,
                                    batch_size=args.bs,
                                    n_components=args.nc)
        else:
            model = TessSVRModel(parser.data, schema, n_components=args.nc)
        model.learn_by_data()
        model.save(abspath(args.o + '.tess'))
def load(self):
    """Parse the CSV history file into ``self.data``.

    For each row, fetches the CVE publication date, skips events that are
    too recent (younger than ``self.min_age`` days), reuses details already
    fetched for the same CVE id, and appends a VulnerabilityEvent.

    Returns:
        The list of parsed events (cached on subsequent calls).

    Raises:
        AttributeError: If capec, cwe and keyword parsing are all skipped,
            since no feature source would remain.
    """
    if self.skip_capec and self.skip_keywords and self.skip_cwe:
        raise AttributeError(
            "Can't skip capec entries, cwe elements and keywords all together!"
        )
    if self.data is not None:
        return self.data
    self.data = []
    key_parser = KeywordsParser()
    cve = CVESearch()
    with open(self.data_path, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file, delimiter=',')
        today = datetime.now(tz=None)
        for row in csv_reader:
            info = cve.find_cve_by_id(row['id'])
            published = datetime.strptime(info['publishedDate'],
                                          '%Y-%m-%dT%H:%MZ')
            # Skip CVEs published less than min_age days ago — presumably
            # too recent to have reliable outcome data.
            if (today - published).days < self.min_age:
                print('Ignoring event for {}'.format(row['id']))
                continue
            # Reuse details already fetched for an earlier row with the
            # same CVE id instead of querying the service again.
            vuln_details = next(
                (item.details for item in self.data if item.id == row['id']),
                None)
            if vuln_details is None:
                target = row.get('target')
                vuln_details = Utils.get_vulnerability(row['id'],
                                                      cve,
                                                      key_parser,
                                                      self.skip_capec,
                                                      self.skip_keywords,
                                                      self.skip_cwe,
                                                      target=target)
            if vuln_details is None:
                continue
            # NOTE(review): the CSV column is named 'data' — looks like it
            # may mean 'date'; confirm against the file format.
            vuln_event = VulnerabilityEvent(row['id'], row['data'],
                                            row['outcome'], vuln_details)
            self.data.append(vuln_event)
    # Return the parsed data for consistency with the cached branch above
    # (the original returned None on the first call).
    return self.data
def get_exploitability(self, vulnerability, time):
    """Predicted exploitability of *vulnerability* at *time*.

    The model's raw prediction is scaled by the vulnerability's
    exploitability sub-score.

    Raises:
        ValueError: If no model has been set.
    """
    if self.model is None:
        raise ValueError("Model is not set")
    feature = Utils.get_element_feature(self.schema, vulnerability, time)
    prediction = self.model.predict([feature])
    return vulnerability.e_score * prediction
def get_exploitability(self, vulnerability, time):
    """Predicted exploitability of *vulnerability* at *time*.

    The model's raw prediction is scaled by the vulnerability's
    exploitability sub-score.

    Raises:
        ValueError: If no model has been set.
    """
    # Guard added for consistency with the sibling implementation: fail
    # with a clear ValueError instead of an AttributeError from predict().
    if self.model is None:
        raise ValueError("Model is not set")
    feature = Utils.get_element_feature(self.schema, vulnerability, time)
    return vulnerability.e_score * self.model.predict([feature])