def run(features_select, target, run_name, experiment_id):
    """1. Run cross-validation based on features_select
    2. Log model characteristics
    3. Return list of features sorted by importance"""
    ## Log run name
    with mlflow.start_run(run_name=run_name, experiment_id=experiment_id):
        ## Finding score
        X, Y = df_select[features_select], df_select[target]
        score = np.mean(cross_val_score(model, X, Y, cv=5))

        ## feature importance
        clf = model.fit(X, Y)
        feature_importance = [[
            features_select[i], clf.coef_[0][i],
            abs(np.sum(df_select[features_select[i]]) * clf.coef_[0][i])
        ] for i in range(len(features_select))]
        feature_importance_sorted = sorted(feature_importance,
                                           key=itemgetter(2),
                                           reverse=True)

        ## Logging score, model and importance
        mlflow.log_metric("CV score", score)
        mlflow.sklearn.log_model(model, "Logistic Regression")
        mlflow.log_dict(
            [[x[0], f"coefficient = {str(x[1])}", f"importance = {str(x[2])}"]
             for x in feature_importance_sorted], "feature_importance.yml")
        mlflow.end_run()
    return feature_importance_sorted
def _mlflow_log_dict(dictionary: Dict[str, Any],
                     prefix: str = "",
                     log_type: Optional[str] = None):
    """MLflow helper. Recursively logs every value in ``dictionary``, dispatching on its type.

    Args:
        dictionary: Values to log as dictionary.
        prefix: Prefix for parameter name (if the parameter is composite).
        log_type: The entity of logging (param, metric, artifact, image, etc.).

    Raises:
        ValueError: If an unknown type or log_type for logging in MLflow is
            encountered (add a new case if needed).
    """
    for name, value in dictionary.items():
        if name in EXCLUDE_PARAMS:
            continue
        name = name.replace("*", "")
        if prefix not in STAGE_PARAMS and prefix:
            name = f"{prefix}/{name}"
        if log_type == "dict":
            mlflow.log_dict(dictionary, name)
        elif isinstance(value, dict):
            _mlflow_log_dict(value, name, log_type)
        elif log_type == "param":
            try:
                mlflow.log_param(name, value)
            except mlflow.exceptions.MlflowException:
                continue
        elif isinstance(value, (Directory, File)) or log_type == "artifact":
            mlflow.log_artifact(value)
        elif isinstance(value, Number):
            mlflow.log_metric(name, value)
        else:
            raise ValueError(f"Unknown type of logging value: {type(value)}")
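# A minimal usage sketch of the helper above (my assumption of intended use, not
# from the source): EXCLUDE_PARAMS and STAGE_PARAMS must exist in the enclosing
# module, and an MLflow run must be active. Nested keys are flattened to
# "prefix/name" parameters.
import mlflow

config = {"optimizer": {"name": "adam", "lr": 1e-3}, "seed": 42}
with mlflow.start_run():
    _mlflow_log_dict(config, log_type="param")
    # -> params: "optimizer/name"="adam", "optimizer/lr"=0.001, "seed"=42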
def main(cfg: DictConfig) -> None:
    # set up mlflow experiment id
    mlflow.set_tracking_uri(f"file://{to_absolute_path(cfg.path_to_mlflow)}")
    experiment = mlflow.get_experiment_by_name(cfg.experiment_name)
    if experiment is not None:  # fetch existing experiment id
        run_kwargs = {'experiment_id': experiment.experiment_id}
    else:  # create new experiment
        experiment_id = mlflow.create_experiment(cfg.experiment_name)
        run_kwargs = {'experiment_id': experiment_id}

    # run the training with mlflow tracking
    with mlflow.start_run(**run_kwargs) as active_run:
        setup_gpu(cfg.gpu_cfg)
        training_cfg = OmegaConf.to_object(cfg.training_cfg)  # convert to python dictionary
        scaling_cfg = to_absolute_path(cfg.scaling_cfg)
        dataloader = DataLoaderReco.DataLoader(training_cfg, scaling_cfg)
        dl_config = dataloader.config
        model = MyGNN(dl_config)
        input_shape, _ = dataloader.get_shape()
        # print(input_shape[0])
        # compile_build = tf.ones(input_shape[0], dtype=tf.float32, name=None)
        model.build(list(input_shape[0]))
        compile_model(model, dl_config["SetupNN"]["mode"],
                      dl_config["SetupNN"]["learning_rate"])
        fit_hist = run_training(model, dataloader, False, cfg.log_suffix)

        mlflow.log_dict(training_cfg, 'input_cfg/training_cfg.yaml')
        mlflow.log_artifact(scaling_cfg, 'input_cfg')
        mlflow.log_artifact(to_absolute_path("Training_SNNv0.py"), 'input_cfg')
        mlflow.log_artifact(to_absolute_path("../commonReco.py"), 'input_cfg')
        mlflow.log_artifacts('.hydra', 'input_cfg/hydra')
        mlflow.log_artifact('Training_SNNv0.log', 'input_cfg/hydra')
        mlflow.log_param('run_id', active_run.info.run_id)
        print(f'\nTraining has finished! Corresponding MLflow experiment name (ID): '
              f'{cfg.experiment_name}({run_kwargs["experiment_id"]}), '
              f'and run ID: {active_run.info.run_id}\n')
def merge_multiedges(
    graph: object,
    node_attrib_name: str = "weight",
    default_node_weight: float = 0,
    use_aspect_clustering: bool = False,
    n_clusters: int = None,
) -> nx.Graph:
    if use_aspect_clustering and n_clusters is not None:
        aspect_df = pd.DataFrame(
            [(n, data["importance"]) for n, data in graph.nodes(data=True)],
            columns=["text", "importance"],
        )
        aspect_cluster_representants = cluster_embeddings_with_spacy(
            aspect_df, n_clusters)
        mlflow.log_dict(aspect_cluster_representants,
                        "aspect_cluster_representants.json")

    logger.info("Create a new graph without multiple edges between nodes.")
    graph_new = nx.Graph()
    for u, v, data in graph.edges(data=True):
        w = data[node_attrib_name] if node_attrib_name in data else default_node_weight
        if use_aspect_clustering and n_clusters is not None:
            u = aspect_cluster_representants[u]
            v = aspect_cluster_representants[v]
        if graph_new.has_edge(u, v):
            graph_new[u][v][node_attrib_name] += w
        else:
            graph_new.add_edge(u, v, **{node_attrib_name: w})

    logger.info("Copy nodes attributes from multi edge graph to flattened one.")
    nx.set_node_attributes(graph_new, dict(graph.nodes.items()))
    return graph_new
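# A minimal sketch of the flattening behaviour above, with hypothetical data and
# clustering disabled (assumes the module-level `logger` is configured):
import networkx as nx

mg = nx.MultiGraph()
mg.add_edge("battery", "life", weight=1.0)
mg.add_edge("battery", "life", weight=2.5)  # parallel edge

flat = merge_multiedges(mg, node_attrib_name="weight")
assert flat["battery"]["life"]["weight"] == 3.5  # parallel edge weights are summed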
def _log_posttraining_metadata(estimator, spark_model, params):
    if _is_parameter_search_estimator(estimator):
        try:
            # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
            # information is consistent with the parent run
            child_tags = context_registry.resolve_tags()
            child_tags.update({MLFLOW_AUTOLOGGING: AUTOLOGGING_INTEGRATION_NAME})
            _create_child_runs_for_parameter_search(
                parent_estimator=estimator,
                parent_model=spark_model,
                parent_run=mlflow.active_run(),
                child_tags=child_tags,
            )
        except Exception:
            import traceback

            msg = (
                "Encountered exception during creation of child runs for parameter search."
                " Child runs may be missing. Exception: {}".format(
                    traceback.format_exc()))
            _logger.warning(msg)

        estimator_param_maps = _get_tuning_param_maps(
            estimator, estimator._autologging_metadata.uid_to_indexed_name_map)

        metrics_dict, best_index = _get_param_search_metrics_and_best_index(
            estimator, spark_model)
        _log_parameter_search_results_as_artifact(
            estimator_param_maps, metrics_dict, mlflow.active_run().info.run_id)

        # Log best_param_map as JSON artifact
        best_param_map = estimator_param_maps[best_index]
        mlflow.log_dict(best_param_map, artifact_file="best_parameters.json")

        # Log best_param_map as autologging parameters as well
        _log_estimator_params({
            f"best_{param_name}": param_value
            for param_name, param_value in best_param_map.items()
        })

    if log_models:
        if _should_log_model(spark_model):
            # TODO: support model signature
            mlflow.spark.log_model(
                spark_model,
                artifact_path="model",
            )
            if _is_parameter_search_model(spark_model):
                mlflow.spark.log_model(
                    spark_model.bestModel,
                    artifact_path="best_model",
                )
        else:
            _logger.warning(_get_warning_msg_for_skip_log_model(spark_model))
def compile_model(model, mode, learning_rate):
    # opt = tf.keras.optimizers.Nadam(learning_rate=learning_rate, beta_1=1e-4)
    opt = tf.keras.optimizers.Nadam(learning_rate=learning_rate,
                                    schedule_decay=1e-4)
    # opt = tf.keras.optimizers.Adam(learning_rate = learning_rate)
    CustomMSE.mode = mode

    metrics = []
    if "dm" in mode:
        metrics.extend([my_acc, my_mse_ch, my_mse_neu])
    if "p4" in mode:
        metrics.extend([my_mse_pt, my_mse_mass, pt_res, pt_res_rel, m2_res])
    model.compile(loss=CustomMSE(), optimizer=opt, metrics=metrics)

    # log metric names for passing them during model loading
    metric_names = {(m if isinstance(m, str) else m.__name__): '' for m in metrics}
    mlflow.log_dict(metric_names, 'input_cfg/metric_names.json')
def evaluate_ner_seq_eval(self, batch_ner_labels, batch_ner_predictions,
                          labels: List[str], partition, head_identifier):
    id2label = {}
    entity_labels = labels
    for idx, label in enumerate(entity_labels):
        if label.endswith('NP'):
            label = label[:2] + head_identifier.split('_')[-1]
        elif label == 'BERT_TOKEN':
            label = 'O'
        id2label[idx] = label

    ner_ground_truth = [[id2label[idx] for idx in seq] for seq in batch_ner_labels]
    ner_predictions = [[id2label[idx] for idx in seq] for seq in batch_ner_predictions]

    # Get results
    default_results = classification_report(y_true=ner_ground_truth,
                                            y_pred=ner_predictions,
                                            output_dict=True,
                                            digits=3,
                                            mode='default',
                                            scheme=IOB2)
    default_results['performance'] = performance_measure(
        y_true=ner_ground_truth, y_pred=ner_predictions)
    default_results = {
        metric_group1:
        {metric: float(value) for metric, value in metric_group2.items()}
        for metric_group1, metric_group2 in default_results.items()
    }

    strict_results = classification_report(y_true=ner_ground_truth,
                                           y_pred=ner_predictions,
                                           output_dict=True,
                                           digits=3,
                                           mode='strict',
                                           scheme=IOB2)
    strict_results['performance'] = performance_measure(
        y_true=ner_ground_truth, y_pred=ner_predictions)
    strict_results = {
        metric_group1:
        {metric: float(value) for metric, value in metric_group2.items()}
        for metric_group1, metric_group2 in strict_results.items()
    }

    mlflow.log_dict(dict(lenient=default_results, strict=strict_results),
                    f"{partition}/{self.epoch}/{head_identifier}.json")
def our_paper_arrg_to_aht(
    graph: nx.MultiDiGraph,
    max_number_of_nodes: int,
    weight: str = "weight",
    alpha_coefficient: float = 0.5,
    use_aspect_clustering: bool = False,
) -> nx.Graph:
    logger.info("Generate Aspect Hierarchical Tree based on ARRG")
    # aspects_rank = calculate_hits(graph)
    aspects_rank = nx.in_degree_centrality(graph)
    # aspects_rank = calculate_weighted_page_rank(graph, "weight")
    graph = calculate_weight(graph=graph,
                             ranks=aspects_rank,
                             alpha_coefficient=alpha_coefficient)
    graph_flatten = merge_multiedges(
        graph,
        node_attrib_name=weight,
        default_node_weight=0,
        n_clusters=max_number_of_nodes,
        use_aspect_clustering=use_aspect_clustering,
    )

    aspects_rank = [(aspect, score) for aspect, score in aspects_rank.items()
                    if aspect in graph_flatten]
    sorted_nodes = sorted(list(aspects_rank),
                          key=lambda node_value: node_value[1],
                          reverse=True)[:max_number_of_nodes]
    mlflow.log_dict(dict(sorted_nodes), "arrg_aspect_ranks.json")

    arrg_relation_weights = {
        f"{source}->{target}": data[weight]
        for source, target, data in graph_flatten.edges(data=True)
    }
    mlflow.log_dict(
        dict(sorted(arrg_relation_weights.items(),
                    key=lambda t: t[1],
                    reverse=True)),
        "arrg_relation_weights.json",
    )

    maximum_spanning_tree = nx.maximum_spanning_tree(graph_flatten, weight=weight)
    nx.set_node_attributes(maximum_spanning_tree,
                           dict(graph_flatten.nodes.items()))
    return maximum_spanning_tree
def _generate_confusion_artifacts(
    self,
    artifact_dir: str,
    metadata: sequences.SequenceMetadata,
    model: models.BaseModel,
    test_dataset: tf.data.Dataset,
):
    prediction_output_calculator = analysis.PredictionOutputCalculator(
        metadata,
        model.prediction_model,
    )
    prediction_output_calculator.write_prediction_output_for_dataset(
        test_dataset,
        out_file_name=artifact_dir + "prediction_output.csv",
    )

    mlflow.log_dict(metadata.x_vocab, "x_vocab.json")
    mlflow.log_dict(metadata.y_vocab, "y_vocab.json")
def run(features_select, target, run_name, experiment_id):
    with mlflow.start_run(run_name=run_name, experiment_id=experiment_id):
        X, Y = df_select[features_select], df_select[target]
        score = np.mean(cross_val_score(model, X, Y, cv=5))
        mlflow.log_metric("CV score", score)

        clf = model.fit(X, Y)
        feature_importance = sorted([[
            features_select[i],
            abs(np.sum(df_select[features_select[i]]) * clf.coef_[0][i])
        ] for i in range(len(features_select))],
                                    key=itemgetter(1),
                                    reverse=True)

        mlflow.sklearn.log_model(model, "Logistic Regression")
        mlflow.log_dict([[x[0], f"importance = {str(x[1])}"]
                         for x in feature_importance], "feature_importance.yml")
        mlflow.end_run()
    return feature_importance
def _build_model(
    self,
    metadata: sequences.SequenceMetadata,
    base_knowledge: knowledge.BaseKnowledge,
    model: models.BaseModel,
) -> knowledge.BaseKnowledge:
    if (self.config.noise_to_add > 0 or self.config.noise_to_remove > 0
            or self.config.attention_noise_to_remove > 0):
        noise_knowledge = knowledge.NoiseKnowledge(base_knowledge)
        noise_knowledge.remove_lowest_connections(
            percentage=self.config.attention_noise_to_remove,
            connections_reference_file=self.config.attention_weight_reference_file,
        )
        noise_knowledge.add_random_connections(
            percentage=self.config.noise_to_add)
        noise_knowledge.remove_random_connections(
            percentage=self.config.noise_to_remove)

        mlflow.set_tag(
            "noise_type",
            "added{}_removed{}_threshold{}".format(
                self.config.noise_to_add,
                self.config.noise_to_remove,
                self.config.attention_noise_to_remove,
            ),
        )
        (
            original_connections_text,
            noise_connections_text,
        ) = noise_knowledge.get_text_connections()
        mlflow.log_dict(
            original_connections_text,
            "original_knowledge.json",
        )
        mlflow.log_dict(
            noise_connections_text,
            "noise_knowledge.json",
        )

        model.build(metadata, noise_knowledge)
        return noise_knowledge

    model.build(metadata, base_knowledge)
    return base_knowledge
def test_log_dict(subdir, extension):
    dictionary = {"k": "v"}
    filename = "data" + extension
    artifact_file = filename if subdir is None else posixpath.join(subdir, filename)

    with mlflow.start_run():
        mlflow.log_dict(dictionary, artifact_file)

        artifact_path = None if subdir is None else posixpath.normpath(subdir)
        artifact_uri = mlflow.get_artifact_uri(artifact_path)
        run_artifact_dir = local_file_uri_to_path(artifact_uri)
        assert os.listdir(run_artifact_dir) == [filename]

        filepath = os.path.join(run_artifact_dir, filename)
        extension = os.path.splitext(filename)[1]
        with open(filepath) as f:
            loaded = yaml.load(f) if (extension in [".yml", ".yaml"]) else json.load(f)
        assert loaded == dictionary
def compile_model(model, opt_name, learning_rate, strmetrics, schedule_decay=1e-4):
    opt = getattr(tf.keras.optimizers, opt_name)(learning_rate=learning_rate,
                                                 schedule_decay=schedule_decay)
    metrics = []
    for m in strmetrics:
        if "TauLosses" in m:
            m = eval(m)
        metrics.append(m)
    model.compile(loss=None,
                  optimizer=opt,
                  metrics=metrics,
                  weighted_metrics=metrics)  # loss is now defined in DeepTauModel

    # log metric names for passing them during model loading
    metric_names = {(m if isinstance(m, str) else m.__name__): '' for m in metrics}
    mlflow.log_dict(metric_names, 'input_cfg/metric_names.json')
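# A hedged sketch of the intended round trip for metric_names.json (file paths
# and the use of globals() are my assumptions, not from the source): after
# downloading the artifact, resolve the stored names back to metric callables so
# tf.keras.models.load_model can deserialize the compiled model.
import json
import tensorflow as tf

with open('metric_names.json') as f:  # previously downloaded run artifact
    metric_names = json.load(f)
custom_objects = {name: globals()[name]  # e.g. module-level metric functions
                  for name in metric_names if name in globals()}
model = tf.keras.models.load_model('model_checkpoints/latest',
                                   custom_objects=custom_objects)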
def _log_pretraining_metadata(estimator, params):
    if params and isinstance(params, dict):
        estimator = estimator.copy(params)

    autologging_metadata = _gen_estimator_metadata(estimator)

    artifact_dict = {}

    param_map = _get_instance_param_map(
        estimator, autologging_metadata.uid_to_indexed_name_map)
    if _should_log_hierarchy(estimator):
        artifact_dict["hierarchy"] = autologging_metadata.hierarchy

    for param_search_estimator in autologging_metadata.param_search_estimators:
        param_search_estimator_name = (
            f"{autologging_metadata.uid_to_indexed_name_map[param_search_estimator.uid]}"
        )
        artifact_dict[param_search_estimator_name] = {}
        artifact_dict[param_search_estimator_name][
            "tuning_parameter_map_list"
        ] = _get_tuning_param_maps(
            param_search_estimator, autologging_metadata.uid_to_indexed_name_map
        )
        artifact_dict[param_search_estimator_name][
            "tuned_estimator_parameter_map"
        ] = _get_instance_param_map_recursively(
            param_search_estimator.getEstimator(),
            1,
            autologging_metadata.uid_to_indexed_name_map,
        )

    if artifact_dict:
        mlflow.log_dict(artifact_dict, artifact_file="estimator_info.json")

    _log_estimator_params(param_map)

    mlflow.set_tags(_get_estimator_info_tags(estimator))
def test_log_dict(subdir, extension):
    dictionary = {"k": "v"}
    filename = "data" + extension
    artifact_file = filename if subdir is None else posixpath.join(subdir, filename)

    with mlflow.start_run():
        mlflow.log_dict(dictionary, artifact_file)

        artifact_path = None if subdir is None else posixpath.normpath(subdir)
        artifact_uri = mlflow.get_artifact_uri(artifact_path)
        run_artifact_dir = local_file_uri_to_path(artifact_uri)
        assert os.listdir(run_artifact_dir) == [filename]

        filepath = os.path.join(run_artifact_dir, filename)
        extension = os.path.splitext(filename)[1]
        with open(filepath) as f:
            loaded = (
                # Specify `Loader` to suppress the following deprecation warning:
                # https://github.com/yaml/pyyaml/wiki/PyYAML-yaml.load(input)-Deprecation
                yaml.load(f, Loader=yaml.SafeLoader)
                if (extension in [".yml", ".yaml"]) else json.load(f))
        assert loaded == dictionary
def _mlflow_log_params_dict(
    dictionary: Dict[str, Any],
    prefix: Optional[str] = None,
    log_type: Optional[str] = None,
    exclude: Optional[List[str]] = None,
):
    """MLflow helper. Recursively logs every value in ``dictionary``, dispatching on its type.

    Args:
        dictionary: Values to log as dictionary.
        prefix: Prefix for parameter name (if the parameter is composite).
        log_type: The entity of logging (param, metric, artifact, image, etc.).
        exclude: Keys in the dictionary to exclude from logging.

    Raises:
        ValueError: If an unknown type or log_type for logging in MLflow is
            encountered (add a new case if needed).
    """
    for name, value in dictionary.items():
        if exclude is not None and name in exclude:
            continue
        name = re.sub(r"\W", "", name)
        name = f"{prefix}/{name}" if prefix else name
        if log_type == "dict":
            mlflow.log_dict(dictionary, name)
        elif isinstance(value, dict):
            _mlflow_log_params_dict(value, name, log_type, exclude)
        elif log_type == "param":
            try:
                mlflow.log_param(name, value)
            except mlflow.exceptions.MlflowException:
                continue
        else:
            raise ValueError(
                f"Unknown type of logging value: type({value})={type(value)}")
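# A minimal usage sketch with hypothetical values (requires an active MLflow run):
# nested dicts are flattened with "/" separators, non-word characters in keys are
# stripped by the re.sub above, and excluded keys are never logged.
with mlflow.start_run():
    _mlflow_log_params_dict(
        {"model": {"n_layers!": 4, "dropout": 0.1}, "secret_token": "..."},
        log_type="param",
        exclude=["secret_token"],  # never logged
    )
    # -> params: "model/n_layers"=4, "model/dropout"=0.1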
def main(
    n_jobs: int,
    batch_size: int,
    aht_max_number_of_nodes: int,
    alpha_coefficient: float,
    experiment_id: Union[str, int],
    overwrite_neighborhood: bool,
    filter_graphs_to_intersected_vertices: bool,
):
    filter_graphs_to_intersected_vertices = bool(
        filter_graphs_to_intersected_vertices)
    for dataset_path, max_reviews in tqdm(datasets,
                                          desc="Amazon datasets processing..."):
        for experiment_name in [
                experiment_name_enum.GERANI,
                experiment_name_enum.OUR_ALL_RULES,
                experiment_name_enum.OUR_TOP_1_RULES,
        ]:
            with mlflow.start_run(
                    experiment_id=experiment_id,
                    run_name=f"{experiment_name}-{dataset_path.stem}-{max_reviews}",
            ):
                mlflow.log_param("experiment_name", experiment_name)
                aspect_analysis = AspectAnalysis(
                    input_path=dataset_path.as_posix(),
                    output_path=settings.DEFAULT_OUTPUT_PATH / dataset_path.stem,
                    experiment_name=experiment_name,
                    jobs=n_jobs,
                    batch_size=batch_size,
                    max_docs=max_reviews,
                    aht_max_number_of_nodes=aht_max_number_of_nodes,
                    alpha_coefficient=alpha_coefficient,
                )

                if experiment_name == experiment_name_enum.OUR_ALL_RULES:
                    aspect_analysis.our_pipeline()
                elif experiment_name == experiment_name_enum.GERANI:
                    aspect_analysis.gerani_pipeline()
                elif experiment_name == experiment_name_enum.OUR_TOP_1_RULES:
                    aspect_analysis.our_pipeline_top_n_rules_per_discourse_tree()
                else:
                    raise Exception("Wrong experiment type")

                for conceptnet_graph_path in tqdm(
                        CONCEPTNET_GRAPH_TOOL_GRAPHS,
                        desc="Conceptnet graph analysis..."):
                    with mlflow.start_run(
                            experiment_id=experiment_id,
                            run_name=conceptnet_graph_path.stem,
                            nested=True,
                            # run_id=f'{experiment_id}-{conceptnet_graph_path.stem}'
                    ) as run_conceptnet:
                        mlflow.log_param("dataset_path", dataset_path)
                        mlflow.log_param("dataset_name", dataset_path.stem)
                        mlflow.log_param("method", experiment_name)
                        mlflow.log_param("max_docs", max_reviews)
                        mlflow.log_param("batch_size", batch_size)
                        mlflow.log_param("n_jobs", n_jobs)
                        mlflow.log_param("conceptnet_graph_path",
                                         conceptnet_graph_path)
                        mlflow.log_param("conceptnet_graph_name",
                                         conceptnet_graph_path.stem)
                        mlflow.log_param("aht_max_number_of_nodes",
                                         aht_max_number_of_nodes)
                        mlflow.log_param("alpha_coefficient", alpha_coefficient)

                        png_file_path = (
                            aspect_analysis.paths.experiment_path /
                            f"shortest_paths_correlation_{conceptnet_graph_path.stem}.png"
                        )
                        if png_file_path.exists() and not overwrite_neighborhood:
                            logger.info(
                                f"{png_file_path.as_posix()} already exists, "
                                "skipping to the next setting.")
                            mlflow.log_artifact(png_file_path.as_posix())
                        else:
                            df = prepare_hierarchies_neighborhood(
                                experiments_path=aspect_analysis.paths,
                                conceptnet_graph_path=conceptnet_graph_path,
                                filter_graphs_to_intersected_vertices=
                                filter_graphs_to_intersected_vertices,
                            )

                            logger.info(
                                f"Shortest Paths pairs - data frame: {len(df)}")
                            df = df[~(
                                (df.shortest_distance_aspect_graph.isin(VALUES_TO_SKIP))
                                | (df.shortest_distance_conceptnet.isin(VALUES_TO_SKIP))
                            )]
                            # assign the result back; otherwise the de-duplication is discarded
                            df = df.drop_duplicates(subset=["aspect_1", "aspect_2"])
                            mlflow.log_metric("number_of_shortest_paths", len(df))
                            logger.info(
                                "Shortest Paths pairs - data frame, "
                                f"without no paths and duplicates: {len(df)}")

                            mlflow.log_dict(
                                pd.DataFrame(
                                    df.shortest_distance_aspect_graph.value_counts()
                                ).to_dict(orient="index"),
                                "shortest_distance_aspect_graph_distribution.json",
                            )
                            mlflow.log_dict(
                                pd.DataFrame(
                                    df.shortest_distance_conceptnet.value_counts()
                                ).to_dict(orient="index"),
                                "shortest_distance_conceptnet_distribution.json",
                            )

                            df = df[df.shortest_distance_aspect_graph <= 6]

                            matplotlib.rc_file_defaults()
                            ax1 = sns.set_style(style=None, rc=None)
                            fig, ax1 = plt.subplots()
                            sns_plot = sns.lineplot(
                                x=df.shortest_distance_aspect_graph,
                                y=df.shortest_distance_conceptnet,
                                ax=ax1,
                            )
                            ax2 = ax1.twinx()
                            df_aspect_graph_distance_distribution = pd.DataFrame(
                                df.shortest_distance_aspect_graph.value_counts())
                            df_aspect_graph_distance_distribution.reset_index(
                                inplace=True)
                            df_aspect_graph_distance_distribution.sort_values(
                                by="index", inplace=True)
                            sns.barplot(
                                x=df_aspect_graph_distance_distribution["index"],
                                y=df_aspect_graph_distance_distribution.
                                shortest_distance_aspect_graph,
                                alpha=0.5,
                                ax=ax2,
                            )

                            logger.info(
                                "Shortest Paths correlation figure will be saved "
                                f"in {png_file_path}")
                            df.sort_values(by="shortest_distance_conceptnet",
                                           inplace=True)
                            pearson_correlation = (
                                df.shortest_distance_aspect_graph.corr(
                                    df.shortest_distance_conceptnet))
                            spearman_correlation = (
                                df.shortest_distance_aspect_graph.corr(
                                    df.shortest_distance_conceptnet,
                                    method="spearman"))
                            kendall_correlation = (
                                df.shortest_distance_aspect_graph.corr(
                                    df.shortest_distance_conceptnet,
                                    method="kendall"))

                            df_csv_path = (aspect_analysis.paths.experiment_path /
                                           "df.csv")
                            df.to_csv(df_csv_path.as_posix())
                            mlflow.log_artifact(df_csv_path.as_posix())
                            mlflow.log_metrics({
                                "pearson": pearson_correlation,
                                "spearman": spearman_correlation,
                                "kendall": kendall_correlation,
                            })

                            sns_plot.figure.savefig(png_file_path.as_posix())
                            plt.close()
                            mlflow.log_artifact(png_file_path.as_posix())
def _log_posttraining_metadata(estimator, spark_model, params, input_df):
    if _is_parameter_search_estimator(estimator):
        try:
            # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
            # information is consistent with the parent run
            child_tags = context_registry.resolve_tags()
            child_tags.update({MLFLOW_AUTOLOGGING: AUTOLOGGING_INTEGRATION_NAME})
            _create_child_runs_for_parameter_search(
                parent_estimator=estimator,
                parent_model=spark_model,
                parent_run=mlflow.active_run(),
                child_tags=child_tags,
            )
        except Exception:
            import traceback

            msg = (
                "Encountered exception during creation of child runs for parameter search."
                " Child runs may be missing. Exception: {}".format(
                    traceback.format_exc()))
            _logger.warning(msg)

        estimator_param_maps = _get_tuning_param_maps(
            estimator, estimator._autologging_metadata.uid_to_indexed_name_map)

        metrics_dict, best_index = _get_param_search_metrics_and_best_index(
            estimator, spark_model)
        _log_parameter_search_results_as_artifact(
            estimator_param_maps, metrics_dict, mlflow.active_run().info.run_id)

        # Log best_param_map as JSON artifact
        best_param_map = estimator_param_maps[best_index]
        mlflow.log_dict(best_param_map, artifact_file="best_parameters.json")

        # Log best_param_map as autologging parameters as well
        _log_estimator_params({
            f"best_{param_name}": param_value
            for param_name, param_value in best_param_map.items()
        })

    if log_models:
        if _should_log_model(spark_model):
            from mlflow.models import infer_signature
            from mlflow.pyspark.ml._autolog import (
                cast_spark_df_with_vector_to_array,
                get_feature_cols,
            )
            from mlflow.spark import _find_and_set_features_col_as_vector_if_needed
            from pyspark.sql import SparkSession

            spark = SparkSession.builder.getOrCreate()

            def _get_input_example_as_pd_df():
                feature_cols = list(get_feature_cols(input_df.schema, spark_model))
                limited_input_df = input_df.select(feature_cols).limit(
                    INPUT_EXAMPLE_SAMPLE_ROWS)
                return cast_spark_df_with_vector_to_array(
                    limited_input_df).toPandas()

            def _infer_model_signature(input_example_slice):
                input_slice_df = _find_and_set_features_col_as_vector_if_needed(
                    spark.createDataFrame(input_example_slice), spark_model)
                model_output = spark_model.transform(input_slice_df).drop(
                    *input_slice_df.columns)
                return infer_signature(input_example_slice,
                                       model_output.toPandas())

            input_example, signature = resolve_input_example_and_signature(
                _get_input_example_as_pd_df,
                _infer_model_signature,
                log_input_examples,
                log_model_signatures,
                _logger,
            )

            mlflow.spark.log_model(
                spark_model,
                artifact_path="model",
                registered_model_name=registered_model_name,
                input_example=input_example,
                signature=signature,
            )
            if _is_parameter_search_model(spark_model):
                mlflow.spark.log_model(
                    spark_model.bestModel,
                    artifact_path="best_model",
                )
        else:
            _logger.warning(_get_warning_msg_for_skip_log_model(spark_model))
def _log_percentile_mapping_to_mlflow(self):
    percentile_mapping = self._create_percentile_mapping()
    mlflow.log_dict(percentile_mapping, "percentile_mapping.json")
def train_epochs(epochs, batch_size, token_size, hidden_size, embedding_size):
    # Read data
    x_train_full = open("../input/wili-2018/x_train.txt").read().splitlines()
    y_train_full = open("../input/wili-2018/y_train.txt").read().splitlines()
    x_test_full = open("../input/wili-2018/x_test.txt").read().splitlines()
    y_test_full = open("../input/wili-2018/y_test.txt").read().splitlines()

    # Get encoders
    char_vocab = Dictionary().char_dict(x_train_full)
    lang_vocab = Dictionary().lang_dict(y_train_full)

    # Convert data
    x_train_idx, y_train_idx = Encoder().encode_labeled_data(
        x_train_full, y_train_full, char_vocab, lang_vocab)
    x_test_idx, y_test_idx = Encoder().encode_labeled_data(
        x_test_full, y_test_full, char_vocab, lang_vocab)
    x_train, x_val, y_train, y_val = train_test_split(x_train_idx,
                                                      y_train_idx,
                                                      test_size=0.15)

    train_data = [(x, y) for x, y in zip(x_train, y_train)]
    val_data = [(x, y) for x, y in zip(x_val, y_val)]
    test_data = [(x, y) for x, y in zip(x_test_idx, y_test_idx)]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if not torch.cuda.is_available():
        logging.warning("WARNING: CUDA is not available.")

    criterion = torch.nn.CrossEntropyLoss(reduction='sum')
    bidirectional = False
    ntokens = len(char_vocab)
    nlabels = len(lang_vocab)
    pad_index = char_vocab.pad_index
    model, optimizer = get_model(ntokens, embedding_size, hidden_size, nlabels,
                                 bidirectional, pad_index, device)

    with mlflow.start_run():
        mlflow.log_metrics({
            "train samples": len(train_data),
            "val samples": len(val_data),
            "test samples": len(test_data)
        })
        mlflow.log_dict(lang_vocab.token2idx, "lang_vocab.json")
        mlflow.log_dict(char_vocab.token2idx, "char_vocab.json")
        params = {
            'epochs': epochs,
            'batch_size': batch_size,
            'token_size': token_size,
            'hidden_size': hidden_size,
            'embedding_size': embedding_size
        }
        mlflow.log_dict(params, "params.json")

        logging.info(f'Training cross-validation model for {epochs} epochs')
        for epoch in range(epochs):
            train_acc = train(model, optimizer, train_data, batch_size,
                              token_size, criterion, device)
            logging.info(
                f'| epoch {epoch:02d} | train accuracy={train_acc:.1f}%')
            validate(model, val_data, batch_size, token_size, device,
                     lang_vocab, tag='val', epoch=epoch)
            validate(model, test_data, batch_size, token_size, device,
                     lang_vocab, tag='test', epoch=epoch)
            mlflow.pytorch.log_model(model, f'{epoch:02d}.model')
        mlflow.pytorch.log_model(model, 'model')
config.update("jax_debug_nans", True) # TODO: use **kwargs to reduce params if __name__ == "__main__": problem = ModelContClassifier() problem = ProblemWraper(problem) with open(problem.HPARAMS_PATH, "r") as hfile: hparams = json.load(hfile) mlflow.set_tracking_uri(hparams['meta']["mlflow_uri"]) mlflow.set_experiment(hparams['meta']["name"]) with mlflow.start_run(run_name=hparams['meta']["method"] + "-" + hparams["meta"]["optimizer"]) as run: mlflow.log_dict(hparams, artifact_file="hparams/hparams.json") mlflow.log_text("", artifact_file="output/_touch.txt") artifact_uri = mlflow.get_artifact_uri("output/") hparams["meta"]["output_dir"] = artifact_uri print(f"URI: {artifact_uri}") start_time = datetime.now() if hparams["n_perturbs"] > 1: for perturb in range(hparams["n_perturbs"]): print(f"Running perturb {perturb}") continuation = ContinuationCreator( problem=problem, hparams=hparams, key=perturb).get_continuation_method() continuation.run() else: continuation = ContinuationCreator(
import pprint

import mlflow
import pandas_datareader
import pandas
from pandas_profiling import ProfileReport

import great_expectations as ge
from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler

if __name__ == "__main__":
    with mlflow.start_run(run_name="check_verify_data") as run:
        mlflow.set_tag("mlflow.runName", "check_verify_data")
        df = pandas.read_csv("./data/raw/data.csv")
        describe_to_dict = df.describe().to_dict()
        mlflow.log_dict(describe_to_dict, "describe_data.json")

        pd_df_ge = ge.from_pandas(df)
        assert pd_df_ge.expect_column_values_to_match_strftime_format(
            "Date", "%Y-%m-%d").success == True
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "High", "float").success == True
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "Low", "float").success == True
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "Open", "float").success == True
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "Close", "float").success == True
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "Volume", "long").success == True
import mlflow

if __name__ == '__main__':
    dictionary = {
        "conference": "Data + AI",
        "location": "virtual/global",
        "year": 2021,
        "theme": "Future is open"
    }

    with mlflow.start_run():
        # Log a dictionary as a JSON file under the run's root artifact directory
        mlflow.log_dict(dictionary, "data.json")

        # Log a dictionary as a YAML file in a subdirectory of the run's root artifact directory
        mlflow.log_dict(dictionary, "dir/data.yml")

        # If the file extension doesn't exist or match any of [".json", ".yaml", ".yml"],
        # JSON format is used.
        mlflow.log_dict(dictionary, "data.txt")
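# A follow-up sketch (assumes an MLflow version that ships
# mlflow.artifacts.load_dict, available in recent releases, and reuses the
# `dictionary` defined above): round-tripping a logged dictionary.
with mlflow.start_run() as run:
    mlflow.log_dict(dictionary, "data.json")
loaded = mlflow.artifacts.load_dict(f"runs:/{run.info.run_id}/data.json")
assert loaded == dictionary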
# Optional model dump. After all, models can be quite large and blow up storage
if dict_args.pop('log_model'):
    print("Gonna LOG the model...")
    mlflow.sklearn.log_model(pbg, 'pbg_model_spacy')

# ## Dump the theme-topic mapping
temas = json.load(open('temas.json'))
tema_topico = {}
topicos = pbg.get_topics(20)
for tema, topico in pbg.map_class_.items():
    if tema > 0:
        tema = str(tema)
        tema_topico[temas[tema]] = topicos[topico]

mlflow.log_dict(
    tema_topico,
    "tema_topico.json",
)

import dump_temas_e_topicos

tema_topico = dump_temas_e_topicos.get_tema_topico(pbg)
topicos_dict_sem_tema = dump_temas_e_topicos.get_topicos_sem_tema(pbg)

DUMP_FOLDER = 'tema_topico'
os.makedirs(DUMP_FOLDER, exist_ok=True)
tema_topico_path = f'{DUMP_FOLDER}/tema_topico_{RUN_ID}.json'
json.dump(tema_topico,
          open(tema_topico_path, 'w'),
          indent=4 * ' ',
          ensure_ascii=False)
def _log_model_with_mlflow(
    self,
    model: OpenstfRegressor,
    experiment_name: str,
    model_type: str,
    model_specs: ModelSpecificationDataClass,
    report: Report,
    phase: str,
    **kwargs,
) -> None:
    """Log model with MLflow.

    Note: **kwargs has extra information to be logged with mlflow
    """
    # Get previous run id
    models_df = self._find_models(experiment_name,
                                  max_results=1)  # returns latest model
    if not models_df.empty:
        previous_run_id = models_df["run_id"][0]  # Use [0] to only get latest run id
    else:
        self.logger.info("No previous model found in MLflow",
                         experiment_name=experiment_name)
        previous_run_id = None

    # Set tags to the run, can be used to filter on the UI
    mlflow.set_tag("run_id", mlflow.active_run().info.run_id)
    mlflow.set_tag("phase", phase)  # phase can be Training or Hyperparameter_opt
    mlflow.set_tag("Previous_version_id", previous_run_id)
    mlflow.set_tag("model_type", model_type)
    mlflow.set_tag("prediction_job", experiment_name)

    # Add feature names, target, feature modules, metrics and params to the run
    mlflow.set_tag("feature_names",
                   model_specs.feature_names[1:])  # feature names are 1+ columns
    mlflow.set_tag("target", model_specs.feature_names[0])  # target is first column
    mlflow.set_tag("feature_modules", model_specs.feature_modules)
    mlflow.log_metrics(report.metrics)
    model_specs.hyper_params.update(model.get_params())
    mlflow.log_params(model_specs.hyper_params)

    # Process args
    for key, value in kwargs.items():
        if isinstance(value, dict):
            mlflow.log_dict(value, f"{key}.json")
        elif isinstance(value, str) or isinstance(value, int):
            mlflow.set_tag(key, value)
        else:
            self.logger.warning(
                f"Couldn't log {key}, {type(value)} not supported",
                experiment_name=experiment_name,
            )

    # Log the model to the run. Signature describes model input and output scheme
    mlflow.sklearn.log_model(sk_model=model,
                             artifact_path="model",
                             signature=report.signature)
    self.logger.info("Model saved with MLflow", experiment_name=experiment_name)
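# A hypothetical call (argument names invented, issued from within the owning
# class) illustrating the **kwargs dispatch above: dicts become JSON artifacts
# via mlflow.log_dict, strings and ints become run tags, anything else is
# skipped with a warning.
self._log_model_with_mlflow(
    model=model,
    experiment_name="prediction_job_307",
    model_type="xgboost",
    model_specs=model_specs,
    report=report,
    phase="training",
    feature_importance={"T-1d": 0.42, "radiation": 0.31},  # -> feature_importance.json
    trained_by="pipeline-v2",                              # -> tag "trained_by"
)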
def generate_english_graph(
    graph_path: Union[str, Path],
    relation_types: Set[str] = None,
    synonymous_relations: Optional[List[str]] = None,
) -> gt.Graph:
    with mlflow.start_run(
            experiment_id=5,
            run_name=str(graph_path),
            nested=True,
    ):
        df = pd.read_csv(settings.CONCEPTNET_CSV_EN_PATH, index_col=0)

        relation_types = list(relation_types)
        mlflow.log_dict(
            {
                "relations": relation_types or "all",
                "synonymous_relations": synonymous_relations or "None",
                "all_relations": len(df)
            },
            "filter-relations.json",
        )

        if synonymous_relations:
            synonyms_df = df[df.relation.isin(synonymous_relations)]
            all_synonyms = list(
                set(synonyms_df.target.tolist() + synonyms_df.source.tolist()))
            synonyms = defaultdict(list)
            for s, t in tqdm(
                    zip(synonyms_df.source.tolist(), synonyms_df.target.tolist()),
                    desc="Generating synonyms mapping",
                    total=len(synonyms_df),
            ):
                s = str(s)
                t = str(t)
                synonyms[s] += [t]
                synonyms[t] += [s]
            synonyms = {k: set(v).union({k}) for k, v in synonyms.items()}

        if relation_types is not None:
            df = df[df.relation.isin(relation_types)]
            mlflow.log_dict(
                {k: int(v) for k, v in df.relation.value_counts().items()},
                "relations.json")

        g = gt.Graph()
        e_relation = g.new_edge_property("string")
        v_aspect_name = g.new_vertex_property("string")

        if synonymous_relations:
            all_vertices_names = set(df.source.tolist() + df.target.tolist() +
                                     all_synonyms)
        else:
            all_vertices_names = set(df.source.tolist() + df.target.tolist())

        vertices = {}
        for aspect_name in tqdm(all_vertices_names,
                                desc="Vertices adding to the graph..."):
            v = g.add_vertex()
            vertices[aspect_name] = v
            v_aspect_name[v] = aspect_name
        g.vertex_properties["aspect_name"] = v_aspect_name

        def get_synonyms(vertex_name) -> List[str]:
            return synonyms[vertex_name] if vertex_name in synonyms else [
                vertex_name
            ]

        edge_adding_errors = 0
        for row in tqdm(df.itertuples(),
                        desc="Edges adding to the graph...",
                        total=len(df)):
            source = row.source  # satellite
            target = row.target  # nucleus
            # if N->S then reverse
            if row.relation in NUCLEUS_SATELLITE_RELATIONS:
                source, target = target, source
            if synonymous_relations:
                for s, t in product(get_synonyms(source), get_synonyms(target)):
                    try:
                        e = g.add_edge(vertices[s], vertices[t])
                        e_relation[e] = row.relation
                    except KeyError:
                        edge_adding_errors += 1
            else:
                e = g.add_edge(vertices[source], vertices[target])
                e_relation[e] = row.relation

        print(f"{edge_adding_errors} edges with errors skipped")
        mlflow.log_metrics({
            "edge_adding_errors": edge_adding_errors,
            "n_edges": g.num_edges(),
            "n_vertices": g.num_vertices(),
        })
        g.edge_properties["relation"] = e_relation
        g.save(str(graph_path))
        return g
f1_train_norm = compute_f1(model, train_norm, 0)
f1_train_anom = compute_f1(model, train_anom, 1)
f1_test_norm = compute_f1(model, test_norm, 0)
f1_test_anom = compute_f1(model, test_anom, 1)

# log time
end_time = time.time()  # mark end
run_time = end_time - start_time  # calculate runtime based on fitting and scoring

# save metrics
idx = score_df.shape[0]
print(f'P{idx}, n-estimators: {n_estimators}, max-depth: {max_depth}, '
      f'n-bins: {n_bins}, max-samples: {max_samples}, max-features: {max_features}')
score_df.loc[idx] = [n_estimators, max_depth, n_bins, max_samples, max_features,
                     run_time, f1_train_norm, f1_train_anom, f1_test_norm,
                     f1_test_anom]

# log scoring dataframe
mlflow.log_dict(score_df.to_dict(), 'score_df.json')

# plot metrics
ylabel_map = {'f1_train_norm': 'F1-Score Training Normal',
              'f1_train_anom': 'F1-Score Training Anomaly',
              'f1_test_norm': 'F1-Score Testing Normal',
              'f1_test_anom': 'F1-Score Testing Anomaly',
              'run_time': 'Run Time (Seconds)'}

fig = plt.figure(figsize=[20, 20])
idx = 1  # counter for subplots
for metric in ['f1_train_norm', 'f1_train_anom', 'f1_test_norm',
               'f1_test_anom', 'run_time']:
    for param in ['n_estimators', 'max_depth', 'n_bins', 'max_samples',
                  'max_features']:
        # setup figure
        ax = fig.add_subplot(5, 5, idx)
        xs, ys = score_df[param], score_df[metric]
        # scatter
        ax.scatter(xs, ys)
        # fit line
        z = np.polyfit(xs, ys, 1)
def main(cfg: DictConfig) -> None:
    # set up mlflow experiment id
    mlflow.set_tracking_uri(f"file://{to_absolute_path(cfg.path_to_mlflow)}")
    experiment = mlflow.get_experiment_by_name(cfg.experiment_name)
    if experiment is not None:
        run_kwargs = {'experiment_id': experiment.experiment_id}
        if cfg["pretrained"] is not None:
            # initialise with pretrained run, otherwise create a new run
            run_kwargs['run_id'] = cfg["pretrained"]["run_id"]
    else:  # create new experiment
        experiment_id = mlflow.create_experiment(cfg.experiment_name)
        run_kwargs = {'experiment_id': experiment_id}

    # run the training with mlflow tracking
    with mlflow.start_run(**run_kwargs) as main_run:
        if cfg["pretrained"] is not None:
            mlflow.start_run(experiment_id=run_kwargs['experiment_id'],
                             nested=True)
        active_run = mlflow.active_run()
        run_id = active_run.info.run_id

        setup_gpu(cfg.gpu_cfg)
        training_cfg = OmegaConf.to_object(
            cfg.training_cfg)  # convert to python dictionary
        scaling_cfg = to_absolute_path(cfg.scaling_cfg)

        dataloader = DataLoader.DataLoader(training_cfg, scaling_cfg)
        setup = dataloader.config["SetupNN"]
        TauLosses.SetSFs(*setup["TauLossesSFs"])
        print("loss consts:", TauLosses.Le_sf, TauLosses.Lmu_sf,
              TauLosses.Ltau_sf, TauLosses.Ljet_sf)

        if setup["using_new_loss"]:
            tf.config.run_functions_eagerly(True)
        netConf_full = dataloader.get_net_config()

        if dataloader.input_type == "Adversarial":
            model = create_model(netConf_full,
                                 dataloader.model_name,
                                 loss=setup["loss"],
                                 use_newloss=setup["using_new_loss"],
                                 use_AdvDataset=True,
                                 adv_param=dataloader.adversarial_parameter,
                                 n_adv_tau=dataloader.adv_batch_size,
                                 adv_learning_rate=dataloader.adv_learning_rate)
        else:
            model = create_model(netConf_full,
                                 dataloader.model_name,
                                 loss=setup["loss"],
                                 use_newloss=setup["using_new_loss"])

        if cfg.pretrained is None:
            print("Warning: no pretrained NN -> training will be started from scratch")
            old_opt = None
        else:
            print("Warning: training will be started from pretrained model.")
            print(f"Model: run_id={cfg.pretrained.run_id}, "
                  f"experiment_id={cfg.pretrained.experiment_id}, "
                  f"model={cfg.pretrained.starting_model}")
            path_to_pretrain = to_absolute_path(
                f'{cfg.path_to_mlflow}/{cfg.pretrained.experiment_id}/{cfg.pretrained.run_id}/artifacts/'
            )
            old_model = load_model(
                path_to_pretrain +
                f"/model_checkpoints/{cfg.pretrained.starting_model}",
                compile=False,
                custom_objects=None)
            for layer in model.layers:
                weights_found = False
                for old_layer in old_model.layers:
                    if layer.name == old_layer.name:
                        layer.set_weights(old_layer.get_weights())
                        weights_found = True
                        break
                if not weights_found:
                    print(f"Weights for layer '{layer.name}' not found.")
            old_opt = old_model.optimizer
            old_vars = [var.name for var in old_model.trainable_variables]

        compile_model(model, setup["optimizer_name"], setup["learning_rate"],
                      setup["metrics"], setup["schedule_decay"])
        fit_hist = run_training(model, dataloader, False, cfg.log_suffix,
                                setup["using_new_loss"], old_opt=old_opt)

        # log NN params
        for net_type in ['tau_net', 'comp_net', 'comp_merge_net',
                         'conv_2d_net', 'dense_net']:
            mlflow.log_params({
                f'{net_type}_{k}': v
                for k, v in cfg.training_cfg.SetupNN[net_type].items()
            })
        mlflow.log_params({
            f'TauLossesSFs_{i}': v
            for i, v in enumerate(cfg.training_cfg.SetupNN.TauLossesSFs)
        })
        with open(
                to_absolute_path(
                    f'{cfg.path_to_mlflow}/{run_kwargs["experiment_id"]}/{run_id}/artifacts/model_summary.txt'
                )) as f:
            for l in f:
                if (s := 'Trainable params: ') in l:
                    mlflow.log_param('n_train_params',
                                     int(l.split(s)[-1].replace(',', '')))

        # log training related files
        mlflow.log_dict(training_cfg, 'input_cfg/training_cfg.yaml')
        mlflow.log_artifact(scaling_cfg, 'input_cfg')
        mlflow.log_artifact(to_absolute_path("Training_CNN.py"), 'input_cfg')
        mlflow.log_artifact(to_absolute_path("common.py"), 'input_cfg')

        # log hydra files
        mlflow.log_artifacts('.hydra', 'input_cfg/hydra')
        mlflow.log_artifact('Training_CNN.log', 'input_cfg/hydra')

        # log misc. info
        mlflow.log_param('run_id', run_id)
        mlflow.log_param('git_commit', _get_git_commit(to_absolute_path('.')))
        print(f'\nTraining has finished! Corresponding MLflow experiment name (ID): '
              f'{cfg.experiment_name}({run_kwargs["experiment_id"]}), '
              f'and run ID: {run_id}\n')
        mlflow.end_run()

    # Temporary workaround to kill additional subprocesses that have not exited correctly
    try:
        current_process = psutil.Process()
        children = current_process.children(recursive=True)
        for child in children:
            child.kill()
    except:
        pass