def explain(shap_exp: Explanation, training_df, test_df, explanation_target):
    """Build a SHAP force plot for one trace and return it as SVG markup.

    :param shap_exp: Explanation whose job holds the trained model and encoding
    :param training_df: encoded training split (used to offset the target index)
    :param test_df: encoded test split containing the target trace
    :param explanation_target: trace_id of the trace to explain
    :return: the contents of the rendered SVG file as a string
    """
    job = shap_exp.job
    model = joblib.load(job.predictive_model.model_path)
    # NOTE(review): unconditionally takes the first model — assumes a
    # single-cluster job; siblings guard with len(model) > 1. TODO confirm.
    model = model[0]
    shap.initjs()
    explainer = shap.TreeExplainer(model)
    merged_df = pd.concat([training_df, test_df])
    shap_values = explainer.shap_values(
        merged_df.drop(['trace_id', 'label'], axis=1))
    encoder = retrieve_proper_encoder(job)
    encoder.decode(merged_df, job.encoding)
    encoder.decode(test_df, job.encoding)
    # Position of the target row in the concatenated frame: its index within
    # test_df plus the number of training rows prepended before it.
    explanation_target_int = \
        merged_df[merged_df['trace_id'] == explanation_target].index.item() + \
        training_df.drop(['trace_id', 'label'], axis=1).shape[0]
    explanation_target_vector = test_df[
        test_df['trace_id'] == explanation_target].drop(['trace_id', 'label'],
                                                        axis=1)
    # Classifiers expose one expected value per class; take the first.
    expected_value = explainer.expected_value[0] \
        if explainer.expected_value.size > 1 else explainer.expected_value
    # shap_values is an ndarray for regressors but a per-class list for
    # classifiers; only the ndarray form has a "size" attribute.
    shap_value = shap_values[explanation_target_int, :] \
        if hasattr(shap_values, "size") \
        else shap_values[0][explanation_target_int, :]
    shap.force_plot(expected_value,
                    shap_value,
                    explanation_target_vector,
                    show=False,
                    matplotlib=True).savefig("temporal_shap.svg")
    # Fix: close the file deterministically instead of leaking the handle
    # if read() raises; the temp file is then removed as before.
    with open("temporal_shap.svg", "r") as f:
        response = f.read()
    os.remove("temporal_shap.svg")
    return response
def explain(shap_exp: Explanation, training_df, test_df, explanation_target, prefix_target):
    """SHAP-explain a single prefix (event position) of a single trace.

    :param shap_exp: Explanation whose job holds the trained model and encoding
    :param training_df: encoded training split (unused here)
    :param test_df: encoded test split containing the target trace
    :param explanation_target: trace_id of the trace to explain
    :param prefix_target: URL fragment like ".../prefix_<k>", 1-based
    :return: {trace_id: [("<feature> = <value>", importance), ...]}
    """
    job = shap_exp.job
    model = joblib.load(job.predictive_model.model_path)
    model = model[0]
    # Prefixes are 1-based in the URL; convert to a 0-based row offset.
    prefix_int = int(prefix_target.strip('/').split('_')[1]) - 1
    explainer = _init_explainer(model)
    target_df = test_df[test_df['trace_id'] == explanation_target].iloc[prefix_int]
    shap_values = _get_explanation(explainer,
                                   target_df.drop(['trace_id', 'label'], axis=0))
    encoder = retrieve_proper_encoder(job)
    encoder.decode(test_df, job.encoding)
    # Re-select the row after decoding so reported values are human readable.
    target_df = test_df[test_df['trace_id'] == explanation_target].iloc[prefix_int]
    # NOTE(review): index + 1 appears to skip the leading 'trace_id' entry of
    # the row, and shap_values[1] to be the positive-class attribution — both
    # depend on _get_explanation's layout; TODO confirm.
    response = {
        explanation_target: [
            (target_df.keys()[index + 1] + ' = ' +
             target_df[target_df.keys()[index + 1]], shap_values[1][index])
            for index in range(len(shap_values[1]))
        ]
    }
    return response
def handle(self, *args, **kwargs):
    """Debug command: save a SHAP force plot for one training row of job 68.

    Loads the (single-cluster) model of the hard-coded job, computes SHAP
    values over the whole training split, decodes the frame for readable
    feature values, and writes the plot to 'shap_plot_train_1_3.png'.
    """
    TARGET_MODEL = 68
    job = Job.objects.filter(pk=TARGET_MODEL)[0]
    model = joblib.load(job.predictive_model.model_path)
    model = model[0]
    training_df, test_df = get_encoded_logs(job)
    # Positional row index (underscore literal: 23300) in the training frame.
    EXPLANATION_TARGET = 2_3300
    shap.initjs()
    explainer = shap.TreeExplainer(model)
    training_df = training_df.drop(['trace_id', 'label'], axis=1)
    shap_values = explainer.shap_values(training_df)
    encoder = retrieve_proper_encoder(job)
    encoder.decode(training_df, job.encoding)
    shap.force_plot(explainer.expected_value,
                    shap_values[EXPLANATION_TARGET, :],
                    training_df.iloc[EXPLANATION_TARGET, :],
                    show=False,
                    matplotlib=True).savefig('shap_plot_train_1_3.png')
def _multi_trace_temporal_stability(temporal_stability_exp: Explanation, training_df, test_df):
    """Collect, per trace, the last event's feature values and the model's
    prediction at each event position.

    :param temporal_stability_exp: Explanation whose job holds model + encoding
    :param training_df: encoded training split (unused here)
    :param test_df: encoded test split; mutated in place (gains 'predicted')
    :return: {trace_id: {feature: {'value': str, 'predicted': label}}}
    """
    # Only single-model (no-cluster) jobs are supported.
    if temporal_stability_exp.job.clustering.clustering_method != ClusteringMethods.NO_CLUSTER.value:
        raise NotImplementedError('Models with cluster-based approach are not yet supported')
    # Predict on the whole test split via the registered classification hook.
    test_df['predicted'] = MODEL[PredictiveModels.CLASSIFICATION.value][ModelActions.PREDICT.value](temporal_stability_exp.job, test_df)
    encoder = retrieve_proper_encoder(temporal_stability_exp.job)
    encoder.decode(df=test_df, encoding=temporal_stability_exp.job.encoding)
    # Decode the raw predictions by funnelling them through a one-column
    # frame named 'label', since the encoder only knows that column name.
    temp_df = DataFrame()
    temp_df['label'] = test_df['predicted']
    encoder.decode(df=temp_df, encoding=temporal_stability_exp.job.encoding)
    test_df['predicted'] = temp_df['label']
    exp_list = {}
    for trace_id in set(test_df['trace_id']):
        df = test_df[test_df['trace_id'] == trace_id].drop(['trace_id', 'label'], 1)
        # One prediction per event (row) of this trace.
        exp = list(df['predicted'])
        last_row = df.tail(1)
        # (feature, value) pairs for the trace's final event; note this
        # includes the 'predicted' column itself, since it was not dropped.
        exp_list_1 = [(feat, str(last_row[feat].values[0])) for feat in last_row]
        # NOTE(review): pairs the index-th column with the index-th row
        # prediction; assumes len(exp) <= number of columns, which holds only
        # when traces are no longer than the feature count — TODO confirm.
        exp_list[trace_id] = {
            exp_list_1[index][0]: {'value': exp_list_1[index][1], 'predicted': exp[index]}
            for index in range(len(exp))
        }
    return exp_list
def get_decoded_df(request, pk):
    """Return the decoded training split of job *pk* as an HTTP 200 response."""
    job = Job.objects.filter(pk=pk)[0]
    train_split, _ = get_encoded_logs(job)
    train_split = train_split.drop(['trace_id'], 1)
    retrieve_proper_encoder(job).decode(train_split, job.encoding)
    return Response(train_split, status=200)
def lime_temporal_stability(lime_exp: Explanation, training_df, test_df, explanation_target):
    """LIME temporal stability: per-prefix explanations for one trace, or for
    every trace when *explanation_target* is None.

    :param lime_exp: Explanation whose job holds the model and encoding
    :param training_df: encoded training split used to fit the explainer
    :param test_df: encoded test split containing the target trace
    :param explanation_target: trace_id, or None for the multi-trace variant
    :return: {trace_id: {prefix_key: {feature: {'value', 'importance'}}}}
    """
    if explanation_target is None:
        return _multi_trace_lime_temporal_stability(lime_exp, training_df, test_df)
    model = joblib.load(lime_exp.predictive_model.model_path)
    if len(model) > 1:
        raise NotImplementedError(
            'Models with cluster-based approach are not yet supported')
    # Hoist the repeated column drop; fix: DataFrame.as_matrix() was removed
    # from pandas — .values is the equivalent ndarray accessor.
    feature_df = training_df.drop(['trace_id', 'label'], axis=1)
    features = list(feature_df.columns.values)
    explainer = _init_explainer(df=feature_df.values,
                                features=features,
                                columns=list(feature_df.columns.values),
                                mode=getModeType(model[0]))
    explanation_target_df = test_df[
        test_df['trace_id'] == explanation_target].drop(['trace_id', 'label'],
                                                        axis=1)
    explanation_target_df = explanation_target_df.reset_index(drop=True)
    # Key each row's explanation by its last non-zero 'prefix*' column,
    # i.e. the prefix length reached at that event.
    exp = {
        row.index[max([
            feat for feat in range(len(features))
            if row.index[feat].startswith('prefix') and row[feat] != 0
        ])]:
        _get_explanation(explainer,
                         explanation_target_vector=row,
                         model=model,
                         features=features).as_list()
        for position, row in explanation_target_df.iterrows()
    }
    encoder = retrieve_proper_encoder(lime_exp.job)
    encoder.decode(df=explanation_target_df, encoding=lime_exp.job.encoding)
    # LIME labels look like "feature=value"; report the decoded value of the
    # trace's final event, blank when the encoded value was the 0 padding.
    return {
        explanation_target: {
            index: {
                el[0].split('=')[0]: {
                    'value':
                    explanation_target_df.tail(1)[el[0].split('=')[0]].values[0]
                    if el[0].split('=')[1] != '0' else '',
                    'importance': el[1]
                }
                for el in exp[index]
            }
            for index in exp
        }
    }
def handle(self, *args, **kwargs):
    """Debug command: print ICE/target-plot data for feature 'Age_1' of job 59
    and save the plot to 'ice_plot_train_1_3_CType.png'."""
    # get model
    TARGET_MODEL = 59
    job = Job.objects.filter(pk=TARGET_MODEL)[0]
    # NOTE(review): loaded but unused below; kept so a missing/corrupt model
    # file still fails fast in this debug command.
    model = joblib.load(job.predictive_model.model_path)[0]
    # load data
    training_df, test_df = get_encoded_logs(job)
    training_df['label'] = training_df['label'].astype(bool).astype(int)
    features = list(
        training_df.drop(['trace_id', 'label'], axis=1).columns.values)
    feature = 'Age_1'
    feature_grids, percentile_info = _get_grids(
        feature_values=training_df[feature].values,
        num_grid_points=10,
        grid_type=None,
        percentile_range='percentile',
        grid_range=None)
    # One integer grid point per value in [min, max - 1).
    custom_grids = []
    for x in range(int(feature_grids.min()), int(feature_grids.max() - 1)):
        custom_grids.append(x)
    print(features)
    fig, axes, summary_df = info_plots.target_plot(
        df=training_df,
        feature=feature,
        feature_name='feature value',
        cust_grid_points=custom_grids,
        target='label',
        show_percentile=False)
    fig.savefig('ice_plot_train_1_3_CType.png')
    # First row index at which each grid value occurs (before decoding).
    lists = list(training_df[feature].values)
    indexs = []
    for x in range(int(feature_grids.min()), int(feature_grids.max() - 1)):
        indexs.append(lists.index(x))
    encoder = retrieve_proper_encoder(job)
    encoder.decode(training_df, job.encoding)
    values = training_df[feature].values
    lst = []
    print(summary_df)
    if job.encoding.value_encoding != ValueEncodings.BOOLEAN.value:
        for x in range(len(indexs) - 1):
            lst.append({
                'value': values[indexs[x]],
                'label': summary_df['label'][x],
                'count': summary_df['count'][x],
            })
    else:
        for x in range(summary_df.shape[0]):
            lst.append({
                'value': summary_df['display_column'][x],
                'label': summary_df['label'][x],
                'count': summary_df['count'][x],
            })
    print(lst)
def explain(ice_exp: Explanation, training_df, test_df, explanation_target, prefix_target):
    """Compute ICE target-plot buckets for one feature of the training log.

    :param ice_exp: Explanation whose job holds the encoding
    :param training_df: encoded training split (decoded in place)
    :param test_df: encoded test split (unused)
    :param explanation_target: feature/column name to bucket on
    :param prefix_target: unused
    :return: list of {'value', 'label', 'count'} dicts, one per bucket
    """
    job = ice_exp.job
    training_df = training_df.drop(['trace_id'], 1)
    if job.encoding.value_encoding == ValueEncodings.BOOLEAN.value:
        # Shift boolean labels to {1, 2} so the plot treats them as classes.
        training_df['label'] = training_df['label'].astype(bool).astype(int) + 1
    feature_grids, percentile_info = _get_grids(
        feature_values=training_df[explanation_target].values,
        num_grid_points=10,
        grid_type=None,
        percentile_range='percentile',
        grid_range=None)
    low = int(feature_grids.min())
    high = int(feature_grids.max() - 1)
    custom_grids = list(range(low, high))
    fig, axes, summary_df = info_plots.target_plot(
        df=training_df,
        feature=explanation_target,
        feature_name='feature value',
        cust_grid_points=custom_grids,
        target='label',
        show_percentile=False)
    # First row index at which each grid value occurs, before decoding.
    encoded_column = list(training_df[explanation_target].values)
    indexs = [encoded_column.index(grid) for grid in range(low, high)]
    encoder = retrieve_proper_encoder(job)
    encoder.decode(training_df, job.encoding)
    values = training_df[explanation_target].values
    if job.encoding.value_encoding != ValueEncodings.BOOLEAN.value:
        return [{
            'value': values[indexs[x]],
            'label': summary_df['label'][x],
            'count': int(summary_df['count'][x]),
        } for x in range(len(indexs) - 1)]
    return [{
        'value': summary_df['display_column'][x],
        'label': summary_df['label'][x],
        'count': int(summary_df['count'][x]),
    } for x in range(summary_df.shape[0])]
def explain(lime_exp: Explanation, training_df, test_df, explanation_target=1, prefix_target=None):
    """LIME-explain the final event of one trace.

    :param lime_exp: Explanation whose job holds the model and encoding
    :param training_df: encoded training split used to fit the explainer
    :param test_df: encoded test split containing the target trace
    :param explanation_target: trace_id of the trace to explain
    :param prefix_target: unused
    :return: {feature: (decoded_value_str, importance)}
    """
    model = joblib.load(lime_exp.predictive_model.model_path)
    if len(model) > 1:
        raise NotImplementedError(
            'Models with cluster-based approach are not yet supported')
    # get the actual explanation; fix: DataFrame.as_matrix() was removed from
    # pandas — .values is the equivalent ndarray accessor.
    feature_df = training_df.drop(['trace_id', 'label'], axis=1)
    features = list(feature_df.columns.values)
    explainer = _init_explainer(df=feature_df.values,
                                features=features,
                                columns=list(feature_df.columns.values),
                                mode=getModeType(model[0]))
    # Only the trace's last event (row) is explained.
    explanation_target_vector = test_df[
        test_df['trace_id'] == explanation_target].drop(
            ['trace_id', 'label'], axis=1).tail(1).squeeze()
    exp = _get_explanation(explainer=explainer,
                           explanation_target_vector=explanation_target_vector,
                           model=model,
                           features=features)
    encoder = retrieve_proper_encoder(lime_exp.job)
    exp_list = exp.as_list()
    # Decode a one-row frame so the reported values are human readable.
    explanation_target_df = explanation_target_vector.to_frame().T
    encoder.decode(df=explanation_target_df, encoding=lime_exp.job.encoding)
    # LIME labels look like "feature=value"; strip the value half and pair
    # each feature with its decoded value and importance.
    return {
        e[0].split('=')[0]:
        (str(explanation_target_df[e[0].split('=')[0]].values[0]), e[1])
        for e in exp_list
    }
def shap_temporal_stability(shap_exp: Explanation, training_df, test_df, explanation_target):
    """SHAP temporal stability: per-prefix attributions for one trace, or for
    every trace when *explanation_target* is None.

    :param shap_exp: Explanation whose job holds the model and encoding
    :param training_df: encoded training split (feature names only)
    :param test_df: encoded test split containing the target trace
    :param explanation_target: trace_id, or None for the multi-trace variant
    :return: {trace_id: {prefix_key: {feature: {'value', 'importance'}}}}
    """
    if explanation_target is None:
        return _multi_trace_shap_temporal_stability(shap_exp, training_df, test_df)
    else:
        # NOTE(review): unlike the lime sibling, no len(model) > 1 guard here
        # before unwrapping the first (only) model — TODO confirm intended.
        model = joblib.load(shap_exp.predictive_model.model_path)[0]
        features = list(
            training_df.drop(['trace_id', 'label'], 1).columns.values)
        explainer = _init_explainer(model)
        explanation_target_df = test_df[test_df['trace_id'] == explanation_target].drop(
            ['trace_id', 'label'], 1)
        explanation_target_df = explanation_target_df.reset_index(drop=True)
        # Key each row's explanation by its last non-zero 'prefix*' column,
        # i.e. the prefix length reached at that event.
        exp = {
            row.index[max([
                feat for feat in range(len(features))
                if row.index[feat].startswith('prefix') and row[feat] != 0
            ])]:
            _get_explanation(explainer, row)
            for position, row in explanation_target_df.iterrows()
        }
        encoder = retrieve_proper_encoder(shap_exp.job)
        encoder.decode(df=explanation_target_df, encoding=shap_exp.job.encoding)
        # NOTE(review): values are read from the row at the positional index of
        # the 'prefix_1' column, and exp[index][1] is assumed to be the
        # per-feature attribution vector — both depend on _get_explanation's
        # shap output layout; TODO confirm.
        return {
            explanation_target: {
                index: {
                    explanation_target_df.keys()[idx]: {
                        'value':
                        explanation_target_df.iloc[list(
                            explanation_target_df.keys()).index('prefix_1')][
                                explanation_target_df.keys()[idx]],
                        'importance': exp[index][1][idx]
                    }
                    for idx in range(len(exp[index][1]))
                }
                for index in exp
            }
        }
def compute_confusion_matrix(ts, gold, job_obj):
    """Bucket every predicted trace into tp/tn/fp/fn against the gold labels.

    The prediction used for each trace is the one at its longest available
    prefix in the temporal-stability result.

    :param ts: {trace_id: {PREFIX_<n>: {'predicted': 'true'|'false'}}}
    :param gold: encoded gold DataFrame with 'trace_id' and 'label' columns;
        decoded in place, as before
    :param job_obj: job providing the encoder/encoding used to decode *gold*
    :return: {'tp': [...], 'tn': [...], 'fp': [...], 'fn': [...]} of trace_id
        strings
    """
    encoder = retrieve_proper_encoder(job_obj)
    encoder.decode(df=gold, encoding=job_obj.encoding)

    # Fix: the original repeated the same predicate four times, re-deriving
    # predicted and gold labels per bucket; classify each trace exactly once.
    confusion_matrix = {'tp': [], 'tn': [], 'fp': [], 'fn': []}
    for trace_id in set(gold['trace_id']):
        tid = str(trace_id)
        if tid not in ts:
            # Traces without predictions are skipped, as in the original.
            continue
        # Prediction at the longest prefix recorded for this trace.
        predicted = ts[tid][PREFIX_ + str(len(ts[tid]))]['predicted']
        actual = 'true' if boolean(
            gold[gold['trace_id'] == trace_id]['label'].values[0]) else 'false'
        if predicted == 'true':
            key = 'tp' if predicted == actual else 'fp'
        else:
            key = 'tn' if predicted == actual else 'fn'
        confusion_matrix[key].append(tid)
    return confusion_matrix
def get_unique_values(request, pk):
    """Return, per feature column, a mapping from decoded value to its
    encoded counterpart, as an HTTP 200 response.

    Fix: the original paired the encoded and decoded value lists through two
    independent set() conversions, which destroys the row correspondence (set
    iteration order is arbitrary), and its variable names were swapped
    (result_decoded_df held encoded values and vice versa). Pair the values
    row-wise instead, so each decoded value maps to the encoded value it was
    actually produced from.
    """
    job = Job.objects.filter(pk=pk)[0]
    training_df, test_df = get_encoded_logs(job)
    decoded_training_df = training_df.copy()
    decoded_testing_df = test_df.copy()
    training_df = training_df.drop(['trace_id', 'label'], axis=1)
    encoder = retrieve_proper_encoder(job)
    encoder.decode(df=decoded_training_df, encoding=job.encoding)
    encoder.decode(df=decoded_testing_df, encoding=job.encoding)
    result_df = {}
    for key in training_df.keys():
        # Row-aligned encoded/decoded pairs across both splits; duplicates
        # collapse in the dict, keeping one encoded value per decoded value.
        encoded_values = list(training_df[key]) + list(test_df[key])
        decoded_values = list(decoded_training_df[key]) + list(
            decoded_testing_df[key])
        result_df[key] = dict(zip(decoded_values, encoded_values))
    return Response(result_df, status=200)
def _multi_trace_shap_temporal_stability(shap_exp: Explanation, training_df, test_df):
    """SHAP temporal stability over every trace in the test split.

    #TODO: FIX FROM LIME_WRAPPER TO SHAP_WRAPPER — the response formatting
    below still assumes lime-style ("feature=value", importance) tuples.

    :param shap_exp: Explanation whose job holds the model and encoding
    :param training_df: encoded training split (feature names only)
    :param test_df: encoded test split; decoded in place
    :return: {trace_id: {prefix_key: {feature: {'value', 'importance'}}}}
    """
    model = joblib.load(shap_exp.predictive_model.model_path)
    # Fix: guard on the number of clustered models BEFORE unwrapping the
    # first one; the original indexed [0] first, so len() inspected the
    # unwrapped estimator instead of the model list (cf. the lime sibling).
    if len(model) > 1:
        raise NotImplementedError(
            'Models with cluster-based approach are not yet supported')
    features = list(
        training_df.drop(['trace_id', 'label'], axis=1).columns.values)
    explainer = _init_explainer(model[0])
    exp = {}
    for trace_id in set(test_df['trace_id']):
        df = test_df[test_df['trace_id'] == trace_id].drop(
            ['trace_id', 'label'], axis=1)
        df = df.reset_index(drop=True)
        if not any([feat.startswith('prefix_')
                    for feat in features]) and len(df) == 1:
            # Encodings without per-event prefix columns yield a single row.
            exp[trace_id] = {
                'prefix_': _get_explanation(explainer, row)
                for position, row in df.iterrows()
            }
        else:
            # Key each row by its last non-zero 'prefix*' column, i.e. the
            # prefix length reached at that event.
            exp[trace_id] = {
                row.index[max([
                    feat for feat in range(len(features))
                    if row.index[feat].startswith('prefix') and row[feat] != 0
                ])]:
                _get_explanation(explainer, row)
                for position, row in df.iterrows()
            }
    encoder = retrieve_proper_encoder(shap_exp.job)
    encoder.decode(df=test_df, encoding=shap_exp.job.encoding)
    if shap_exp.job.encoding.value_encoding == ValueEncodings.BOOLEAN.value:
        for col in test_df:
            test_df[col] = test_df[col].apply(
                lambda x: 'False' if x == '0' else x)
    return {
        trace_id: {
            index: {
                el[0].split('=')[0]: {
                    'value':
                    str(test_df[test_df['trace_id'] == trace_id].tail(1)[
                        el[0].split('=')[0]].values[0])
                    if el[0].split('=')[1] != '0' else '',
                    'importance': el[1]
                }
                for el in exp[trace_id][index]
            }
            for index in exp[trace_id]
        }
        for trace_id in set(test_df['trace_id'])
    }
def _multi_trace_lime_temporal_stability(lime_exp: Explanation, training_df, test_df):
    """LIME temporal stability over every trace in the test split.

    :param lime_exp: Explanation whose job holds the model and encoding
    :param training_df: encoded training split used to fit the explainer
    :param test_df: encoded test split; decoded in place
    :return: {trace_id: {prefix_key: {feature: {'value', 'importance'}}}}
    """
    model = joblib.load(lime_exp.predictive_model.model_path)
    if len(model) > 1:
        raise NotImplementedError(
            'Models with cluster-based approach are not yet supported')
    # Hoist the repeated column drop; fix: DataFrame.as_matrix() was removed
    # from pandas — .values is the equivalent ndarray accessor.
    feature_df = training_df.drop(['trace_id', 'label'], axis=1)
    features = list(feature_df.columns.values)
    explainer = _init_explainer(df=feature_df.values,
                                features=features,
                                columns=list(feature_df.columns.values),
                                mode=getModeType(model[0]))
    exp = {}
    for trace_id in set(test_df['trace_id']):
        df = test_df[test_df['trace_id'] == trace_id].drop(
            ['trace_id', 'label'], axis=1)
        df = df.reset_index(drop=True)
        if not any([feat.startswith('prefix_')
                    for feat in features]) and len(df) == 1:
            # Encodings without per-event prefix columns yield a single row.
            exp[trace_id] = {
                'prefix_': _get_explanation(explainer,
                                            explanation_target_vector=row,
                                            model=model,
                                            features=features).as_list()
                for position, row in df.iterrows()
            }
        else:
            # Key each row by its last non-zero 'prefix*' column, i.e. the
            # prefix length reached at that event.
            exp[trace_id] = {
                row.index[max([
                    feat for feat in range(len(features))
                    if row.index[feat].startswith('prefix') and row[feat] != 0
                ])]:
                _get_explanation(explainer,
                                 explanation_target_vector=row,
                                 model=model,
                                 features=features).as_list()
                for position, row in df.iterrows()
            }
    encoder = retrieve_proper_encoder(lime_exp.job)
    encoder.decode(df=test_df, encoding=lime_exp.job.encoding)
    if lime_exp.job.encoding.value_encoding == ValueEncodings.BOOLEAN.value:
        for col in test_df:
            test_df[col] = test_df[col].apply(
                lambda x: 'False' if x == '0' else x)
    return {
        trace_id: {
            index: {
                el[0].split('=')[0]: {
                    'value':
                    str(test_df[test_df['trace_id'] == trace_id].tail(1)[
                        el[0].split('=')[0]].values[0])
                    if el[0].split('=')[1] != '0' else '',
                    'importance': el[1]
                }
                for el in exp[trace_id][index]
            }
            for index in exp[trace_id]
        }
        for trace_id in set(test_df['trace_id'])
    }