def _load_attention_data(
    run_id: str, local_mlflow_dir: str
) -> Optional[
    Tuple[
        Dict[str, Dict[str, float]],
        Dict[str, List[Tuple[str, float]]],
        Dict[str, float],
        Dict[str, List[Tuple[str, float]]],
    ]
]:
    """Load a run's attention weights and derive its importance summaries.

    Returns a tuple of (attention_weights, attention_importances,
    shared_attention_weights, shared_attention_importances), or None when
    no attention weights are stored for the run. Also prints a few
    summary counts as a side effect.
    """
    weights = load_attention_weights(run_id, local_mlflow_dir)
    if weights is None:
        return None

    importances = calculate_attention_importances(weights)
    # A hidden feature is "shared" when more than one entry points at it.
    shared_importances = {
        hidden: entries for hidden, entries in importances.items() if len(entries) > 1
    }
    shared_weights = calculate_shared_attention_weights(weights, shared_importances)

    print("Number of features", len(weights))
    print("Total number of hidden features", len(importances))
    print("Number of shared hidden features", len(shared_importances))
    print(
        "Number of features with >0.5 shared embedding",
        sum(1 for value in shared_weights.values() if float(value) > 0.5),
    )
    return (weights, importances, shared_weights, shared_importances)
def create_graph_visualization_reference(
    run_id: str,
    reference_run_id: str,
    local_mlflow_dir: str,
    threshold: float,
    run_name: str,
    use_node_mapping: bool = True,
    colored_connections_color: str = "red",
):
    """Build a graph visualization for a run, coloring the connections
    that also occur in a reference run.

    Returns None when the run has no stored attention weights.
    """
    weights = load_attention_weights(run_id, local_mlflow_dir)
    if weights is None:
        return None

    node_mapping = convert_to_node_mapping(list(weights), use_node_mapping)
    highlighted = gather_colored_connections(
        reference_run_id=reference_run_id,
        local_mlflow_dir=local_mlflow_dir,
        attention_weights=weights,
        feature_node_mapping=node_mapping,
    )
    return _create_graph_visualization(
        weights,
        threshold=threshold,
        run_name=run_name,
        node_mapping=node_mapping,
        colored_connections=highlighted,
        colored_connections_color=colored_connections_color,
    )
def create_graph_visualization(
    run_id: str,
    local_mlflow_dir: str,
    threshold: float,
    run_name: str,
    use_node_mapping: bool = True,
) -> Optional[Dict[str, str]]:
    """Build a graph visualization for a single run (no reference coloring).

    Returns None when the run has no stored attention weights.
    """
    weights = load_attention_weights(run_id, local_mlflow_dir)
    if weights is None:
        return None

    node_mapping = convert_to_node_mapping(list(weights), use_node_mapping)
    return _create_graph_visualization(
        weights,
        threshold=threshold,
        run_name=run_name,
        node_mapping=node_mapping,
        colored_connections=set(),
    )
def gather_colored_connections(
    reference_run_id: str,
    local_mlflow_dir: str,
    attention_weights: Dict[str, Dict[str, float]],
    feature_node_mapping: Dict[str, str],
) -> Set[Tuple[str, str]]:
    """Collect the (child, parent) connections of a reference run and map
    them onto the current run's graph for highlighting.

    Returns an empty set when no reference run is given or when the
    reference run has no stored attention weights.
    """
    if reference_run_id is None:
        return set()
    reference_attention_weights = load_attention_weights(
        reference_run_id, local_mlflow_dir
    )
    if reference_attention_weights is None:
        return set()
    # Set comprehension instead of set([...]): skips building a throwaway list.
    reference_connections = {
        (child, parent)
        for child, parents in reference_attention_weights.items()
        for parent in parents
    }
    return calculate_colored_connections(
        reference_connections, attention_weights, feature_node_mapping
    )
def load_prediction_df(
    run_id: str,
    local_mlflow_dir: str,
    num_percentiles: int = 10,
    convert_df: bool = True,
    feature_replacements: Optional[Dict[str, str]] = None,
    cluster_threshold: float = 0.9,
) -> Optional[pd.DataFrame]:
    """Load a run's prediction_output.csv from the local MlFlow dir and
    optionally convert it via convert_prediction_df.

    Missing auxiliary artifacts (frequency file, percentile mapping,
    attention file) are reported on stdout but do not abort the load;
    only a missing prediction_output.csv returns None.

    Args:
        run_id: MlFlow run id whose artifacts are read.
        local_mlflow_dir: Base directory containing the run folders.
        num_percentiles: Forwarded to convert_prediction_df.
        convert_df: When True, post-process the raw CSV with
            convert_prediction_df; otherwise return it as read.
        feature_replacements: Mapping child->parent feature; each child's
            frequencies are folded into its parent. None means no
            replacements (avoids the mutable-default-argument pitfall of
            the previous `= {}` default).
        cluster_threshold: Forwarded to convert_prediction_df.

    Returns:
        The (possibly converted) prediction DataFrame, or None when the
        prediction output file does not exist.
    """
    if feature_replacements is None:
        feature_replacements = {}

    run_mlflow_dir = Path(local_mlflow_dir + run_id)
    if not run_mlflow_dir.is_dir():
        # NOTE(review): deliberately only warns — the artifact checks below
        # decide whether the load actually fails.
        print("Run {} is not in local MlFlow dir".format(run_id))

    input_frequency_dict = load_input_frequency_dict(run_id, local_mlflow_dir)
    if input_frequency_dict is None:
        print("No frequency file for run {} in local MlFlow dir".format(run_id))
    elif len(feature_replacements) > 0:
        # Fold each replaced child's frequencies into its parent entry.
        # "absolue_frequency" (sic) is accumulated alongside
        # "absolute_frequency" — presumably older runs wrote the
        # misspelled key; verify before removing.
        for child, parent in feature_replacements.items():
            parent_entry = input_frequency_dict.setdefault(parent, {})
            child_entry = input_frequency_dict.get(child, {})
            for key in (
                "absolute_frequency",
                "absolue_frequency",
                "relative_frequency",
            ):
                parent_entry[key] = parent_entry.get(key, 0) + child_entry.get(key, 0)

    output_percentile_dict = load_output_percentile_mapping_dict(
        run_id, local_mlflow_dir
    )
    if output_percentile_dict is None:
        print("No output percentile file for run {} in local MlFlow dir".format(run_id))

    attention_weights = load_attention_weights(run_id, local_mlflow_dir)
    if attention_weights is None:
        print("No attention file for run {} in local MlFlow dir".format(run_id))
        attention_weights = {}

    run_prediction_output_path = Path(
        local_mlflow_dir + run_id + "/artifacts/prediction_output.csv"
    )
    if not run_prediction_output_path.exists():
        print("No prediction output file for run {} in local MlFlow dir".format(run_id))
        return None

    prediction_output_df = pd.read_csv(run_prediction_output_path)
    if convert_df:
        prediction_output_df = convert_prediction_df(
            prediction_df=prediction_output_df,
            input_frequency_dict=input_frequency_dict,
            output_percentile_dict=output_percentile_dict,
            num_percentiles=num_percentiles,
            feature_replacements=feature_replacements,
            attention_weights=attention_weights,
            cluster_threshold=cluster_threshold,
        )
    return prediction_output_df