def get_predictions_for_test_set(train_text_data, train_tag_data):
    """Train on a stratified split of the data and predict on the held-out fold.

    The number of stratified folds is derived from the ``trainSetRatio``
    script argument as ``1 / (1 - trainSetRatio)``; the last fold produced by
    ``StratifiedKFold`` (without shuffling) is used as the test set and the
    remaining samples as the training set.

    :param train_text_data: sequence of text samples.
    :param train_tag_data: sequence of labels, aligned with
        ``train_text_data``.
    :return: tuple ``(y_test, y_pred)``. ``y_test`` is the list of true labels
        of the test fold; ``y_pred`` is a list of single-entry dicts mapping
        the first element of each prediction tuple to its second element
        (presumably label -> probability; confirm against
        ``demisto_ml.predict``).
    """
    X = pd.Series(train_text_data)
    y = pd.Series(train_tag_data)
    train_set_ratio = float(demisto.args()['trainSetRatio'])
    # Round before truncating: binary floating point makes e.g.
    # 1.0 / (1 - 0.95) evaluate to 19.999999999999982, which a bare int()
    # would turn into 19 folds instead of the intended 20.
    n_splits = int(round(1.0 / (1 - train_set_ratio), 6))
    skf = StratifiedKFold(n_splits=n_splits, shuffle=False, random_state=None)
    train_index, test_index = list(skf.split(X, y))[-1]
    # split() yields positional indices, so select by position (.iloc) rather
    # than by label; this stays correct even if the inputs carry their own
    # non-default index.
    X_train, X_test = list(X.iloc[train_index]), list(X.iloc[test_index])
    y_train, y_test = list(y.iloc[train_index]), list(y.iloc[test_index])
    model = demisto_ml.train_text_classifier(X_train, y_train)
    ft_test_predictions = demisto_ml.predict(model, X_test)
    y_pred = [{y_tuple[0]: y_tuple[1]} for y_tuple in ft_test_predictions]
    return y_test, y_pred
def main():
    """Train, evaluate and optionally store a text classifier.

    All configuration is read from the script arguments. The flow is:

    1. Load the incidents from the given input (by file name or directly).
    2. Abort with an error if no incidents, or fewer than
       ``MIN_INCIDENTS_THRESHOLD`` incidents, were loaded.
    3. Set the tag field, map labels and validate the resulting data.
    4. Optionally output keywords per category (best effort).
    5. Split into train/test sets, train a model and predict on the test set.
    6. Output the evaluation and, if requested, store the model.
    """
    # Fetch the arguments once instead of re-querying them for every value.
    args = demisto.args()
    # Named ``input_value`` so the ``input`` builtin is not shadowed.
    input_value = args['input']
    input_type = args['inputType']
    model_name = args['modelName']
    store_model = args['storeModel'] == 'true'
    model_override = args.get('overrideExistingModel', 'false') == 'true'
    target_accuracy = float(args['targetAccuracy'])
    text_field = args['textField']
    tag_fields = args['tagField'].split(",")
    labels_mapping = get_phishing_map_labels(args['phishingLabels'])
    keyword_min_score = float(args['keywordMinScore'])
    return_predictions_on_test_set = args.get(
        'returnPredictionsOnTestSet', 'false') == 'true'
    original_text_fields = args.get('originalTextFields', '')

    if input_type.endswith("filename"):
        # The file format is the part of the input type before the first "_".
        data = read_files_by_name(input_value,
                                  input_type.split("_")[0].strip())
    else:
        data = read_file(input_value, input_type)
    demisto.results(len(data))

    if len(data) == 0:
        err = [
            'No incidents were received.',
            'Make sure that all arguments are set correctly and that incidents exist in the environment.',
        ]
        return_error(' '.join(err))
    if len(data) < MIN_INCIDENTS_THRESHOLD:
        err = [
            'Only {} incident(s) were received.'.format(len(data)),
            'Minimum number of incidents per label required for training is {}.'
            .format(MIN_INCIDENTS_THRESHOLD),
            'Make sure that all arguments are set correctly and that enough incidents exist in the environment.',
        ]
        return_error('\n'.join(err))

    data = set_tag_field(data, tag_fields)
    data, exist_labels_counter, missing_labels_counter = get_data_with_mapped_label(
        data, labels_mapping, DBOT_TAG_FIELD)
    validate_data_and_labels(data, exist_labels_counter, labels_mapping,
                             missing_labels_counter)

    # print important words for each category
    if args.get('findKeywords') == 'true':
        try:
            find_keywords(data, DBOT_TAG_FIELD, text_field, keyword_min_score)
        except Exception as e:
            # Keyword extraction is best effort and must not abort training,
            # but leave a trace in the log instead of hiding the failure.
            demisto.debug(
                'find_keywords failed, skipping keywords output: {!r}'.format(e))

    X, y = get_X_and_y_from_data(data, text_field)
    test_index, train_index = get_train_and_test_sets_indices(X, y)
    X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
    y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
    model = demisto_ml.train_text_classifier(X_train, y_train)
    ft_test_predictions = demisto_ml.predict(model, X_test)
    y_pred = [{y_tuple[0]: y_tuple[1]} for y_tuple in ft_test_predictions]

    if return_predictions_on_test_set:
        return_file_result_with_predictions_on_test_set(
            data, original_text_fields, test_index, text_field, y_test, y_pred)

    if 'maxBelowThreshold' in args:
        target_recall = 1 - float(args['maxBelowThreshold'])
    else:
        target_recall = 0
    threshold_metrics_entry, per_class_entry = get_ml_model_evaluation(
        y_test, y_pred, target_accuracy, target_recall, detailed=True)
    demisto.results(per_class_entry)

    # show results for the threshold found - last result so it will appear first
    confusion_matrix = output_model_evaluation(
        model_name=model_name,
        y_test=y_test,
        y_pred=y_pred,
        res=threshold_metrics_entry,
        context_field='DBotPhishingClassifier')

    if store_model:
        y_test_pred = [y_tuple[0] for y_tuple in ft_test_predictions]
        y_test_pred_prob = [y_tuple[1] for y_tuple in ft_test_predictions]
        threshold = float(threshold_metrics_entry['Contents']['threshold'])
        # The stored model receives the full data set (X, y), not only the
        # training split, together with the test-set evaluation details.
        store_model_in_demisto(model_name,
                               model_override,
                               X,
                               y,
                               confusion_matrix,
                               threshold,
                               y_test_true=y_test,
                               y_test_pred=y_test_pred,
                               y_test_pred_prob=y_test_pred_prob)
        demisto.results(
            "Done training on {} samples model stored successfully".format(
                len(y)))
    else:
        demisto.results('Skip storing model')