def run(features_select, target, run_name, experiment_id):
    """1. Run cross-validation based on features_select
     2. Log model characteristics
     3. Return list of features sorted by importance"""
    ## Log run name
    with mlflow.start_run(run_name=run_name, experiment_id=experiment_id):
        ## Finding score
        X, Y = df_select[features_select], df_select[target]
        score = np.mean(cross_val_score(model, X, Y, cv=5))

        ## feature importance
        clf = model.fit(X, Y)
        feature_importance = [[
            features_select[i], clf.coef_[0][i],
            abs(np.sum(df_select[features_select[i]]) * clf.coef_[0][i])
        ] for i in range(len(features_select))]
        feature_importance_sorted = sorted(feature_importance,
                                           key=itemgetter(2),
                                           reverse=True)

        ## Logging score, model and importance
        mlflow.log_metric("CV score", score)
        mlflow.sklearn.log_model(model, "Logistic Regression")
        mlflow.log_dict(
            [[x[0], f"coefficient = {str(x[1])}", f"importance = {str(x[2])}"]
             for x in feature_importance_sorted], "feature_importance.yml")
    return feature_importance_sorted
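Note that run() relies on df_select and model being defined at module scope. A minimal sketch of that surrounding setup, assuming scikit-learn's LogisticRegression and a pandas DataFrame (the data source below is illustrative):

# Hypothetical module-level setup assumed by run() above.
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from operator import itemgetter
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

df_select = pd.read_csv("training_data.csv")  # placeholder data source
model = LogisticRegression(max_iter=1000)     # clf.coef_ implies a linear model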
Example #2
def _mlflow_log_dict(dictionary: Dict[str, Any], prefix: str = "", log_type: Optional[str] = None):
    """The function of MLflow. Logs any value by its type from dictionary recursively.

    Args:
        dictionary: Values to log as dictionary.
        prefix: Prefix for parameter name (if the parameter is composite).
        log_type: The entity of logging (param, metric, artifact, image, etc.).

    Raises:
        ValueError: If an unknown type or log_type for logging in MLflow is
            encountered (add a new case if needed).
    """
    for name, value in dictionary.items():
        if name in EXCLUDE_PARAMS:
            continue

        name = name.replace("*", "")
        if prefix not in STAGE_PARAMS and prefix:
            name = f"{prefix}/{name}"

        if log_type == "dict":
            mlflow.log_dict(dictionary, name)
        elif isinstance(value, dict):
            _mlflow_log_dict(value, name, log_type)
        elif log_type == "param":
            try:
                mlflow.log_param(name, value)
            except mlflow.exceptions.MlflowException:
                continue
        elif isinstance(value, (Directory, File)) or log_type == "artifact":
            mlflow.log_artifact(value)
        elif isinstance(value, Number):
            mlflow.log_metric(name, value)
        else:
            raise ValueError(f"Unknown type of logging value: {type(value)}")
Example #3
def main(cfg: DictConfig) -> None:
    # set up mlflow experiment id
    mlflow.set_tracking_uri(f"file://{to_absolute_path(cfg.path_to_mlflow)}")
    experiment = mlflow.get_experiment_by_name(cfg.experiment_name)
    if experiment is not None: # fetch existing experiment id
        run_kwargs = {'experiment_id': experiment.experiment_id} 
    else: # create new experiment
        experiment_id = mlflow.create_experiment(cfg.experiment_name)
        run_kwargs = {'experiment_id': experiment_id} 
    
    # run the training with mlflow tracking
    with mlflow.start_run(**run_kwargs) as active_run:
        setup_gpu(cfg.gpu_cfg)
        training_cfg = OmegaConf.to_object(cfg.training_cfg) # convert to python dictionary
        scaling_cfg = to_absolute_path(cfg.scaling_cfg)
        dataloader = DataLoaderReco.DataLoader(training_cfg, scaling_cfg)

        dl_config = dataloader.config
        model = MyGNN(dl_config)
        input_shape, _ = dataloader.get_shape()
        # print(input_shape[0])
        # compile_build = tf.ones(input_shape[0], dtype=tf.float32, name=None)
        model.build(list(input_shape[0]))
        compile_model(model, dl_config["SetupNN"]["mode"], dl_config["SetupNN"]["learning_rate"])
        fit_hist = run_training(model, dataloader, False, cfg.log_suffix)

        mlflow.log_dict(training_cfg, 'input_cfg/training_cfg.yaml')
        mlflow.log_artifact(scaling_cfg, 'input_cfg')
        mlflow.log_artifact(to_absolute_path("Training_SNNv0.py"), 'input_cfg')
        mlflow.log_artifact(to_absolute_path("../commonReco.py"), 'input_cfg')
        mlflow.log_artifacts('.hydra', 'input_cfg/hydra')
        mlflow.log_artifact('Training_SNNv0.log', 'input_cfg/hydra')
        mlflow.log_param('run_id', active_run.info.run_id)
        print(f'\nTraining has finished! Corresponding MLflow experiment name (ID): {cfg.experiment_name}({run_kwargs["experiment_id"]}), and run ID: {active_run.info.run_id}\n')
Example #4
def merge_multiedges(
    graph: object,
    node_attrib_name: str = "weight",
    default_node_weight: float = 0,
    use_aspect_clustering: bool = False,
    n_clusters: int = None,
) -> nx.Graph:
    if use_aspect_clustering and n_clusters is not None:
        aspect_df = pd.DataFrame(
            [(n, data["importance"]) for n, data in graph.nodes(data=True)],
            columns=["text", "importance"],
        )
        aspect_cluster_representants = cluster_embeddings_with_spacy(
            aspect_df, n_clusters)
        mlflow.log_dict(aspect_cluster_representants,
                        "aspect_cluster_representants.json")

    logger.info("Create a new graph without multiple edges between nodes.")
    graph_new = nx.Graph()
    for u, v, data in graph.edges(data=True):
        w = data[
            node_attrib_name] if node_attrib_name in data else default_node_weight
        if use_aspect_clustering and n_clusters is not None:
            u = aspect_cluster_representants[u]
            v = aspect_cluster_representants[v]
        if graph_new.has_edge(u, v):
            graph_new[u][v][node_attrib_name] += w
        else:
            graph_new.add_edge(u, v, **{node_attrib_name: w})

    logger.info(
        "Copy nodes attributes from multi edge graph to flattened one.")
    nx.set_node_attributes(graph_new, dict(graph.nodes.items()))

    return graph_new
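A quick self-contained check of the flattening behavior (the clustering branch is skipped), assuming the module's logger is configured:

import networkx as nx

mg = nx.MultiGraph()
mg.add_edge("a", "b", weight=1.0)
mg.add_edge("a", "b", weight=2.0)  # parallel edge to be merged
flat = merge_multiedges(mg)
assert flat["a"]["b"]["weight"] == 3.0  # parallel weights are summed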
Example #5
    def _log_posttraining_metadata(estimator, spark_model, params):

        if _is_parameter_search_estimator(estimator):
            try:
                # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
                # information is consistent with the parent run
                child_tags = context_registry.resolve_tags()
                child_tags.update(
                    {MLFLOW_AUTOLOGGING: AUTOLOGGING_INTEGRATION_NAME})
                _create_child_runs_for_parameter_search(
                    parent_estimator=estimator,
                    parent_model=spark_model,
                    parent_run=mlflow.active_run(),
                    child_tags=child_tags,
                )
            except Exception:
                import traceback

                msg = (
                    "Encountered exception during creation of child runs for parameter search."
                    " Child runs may be missing. Exception: {}".format(
                        traceback.format_exc()))
                _logger.warning(msg)

            estimator_param_maps = _get_tuning_param_maps(
                estimator,
                estimator._autologging_metadata.uid_to_indexed_name_map)

            metrics_dict, best_index = _get_param_search_metrics_and_best_index(
                estimator, spark_model)
            _log_parameter_search_results_as_artifact(
                estimator_param_maps, metrics_dict,
                mlflow.active_run().info.run_id)

            # Log best_param_map as JSON artifact
            best_param_map = estimator_param_maps[best_index]
            mlflow.log_dict(best_param_map,
                            artifact_file="best_parameters.json")

            # Log best_param_map as autologging parameters as well
            _log_estimator_params({
                f"best_{param_name}": param_value
                for param_name, param_value in best_param_map.items()
            })

        if log_models:
            if _should_log_model(spark_model):
                # TODO: support model signature
                mlflow.spark.log_model(
                    spark_model,
                    artifact_path="model",
                )
                if _is_parameter_search_model(spark_model):
                    mlflow.spark.log_model(
                        spark_model.bestModel,
                        artifact_path="best_model",
                    )
            else:
                _logger.warning(
                    _get_warning_msg_for_skip_log_model(spark_model))
Example #6
def compile_model(model, mode, learning_rate):
    # opt = tf.keras.optimizers.Nadam(learning_rate=learning_rate, beta_1=1e-4)
    opt = tf.keras.optimizers.Nadam(learning_rate=learning_rate, schedule_decay=1e-4)
    # opt = tf.keras.optimizers.Adam(learning_rate = learning_rate)
    CustomMSE.mode = mode
    metrics = []
    if "dm" in mode:
        metrics.extend([my_acc, my_mse_ch, my_mse_neu])
    if "p4" in mode:
        metrics.extend([my_mse_pt, my_mse_mass, pt_res, pt_res_rel, m2_res])
    model.compile(loss=CustomMSE(), optimizer=opt, metrics=metrics)
    
    # log metric names for passing them during model loading
    metric_names = {(m if isinstance(m, str) else m.__name__): '' for m in metrics}
    mlflow.log_dict(metric_names, 'input_cfg/metric_names.json')
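The metric_names.json artifact exists so the custom metric functions can be re-attached when the model is loaded later. A sketch of the loading side, assuming the artifact has been downloaded locally and the metric functions are importable under the same names (paths are illustrative):

import json
import tensorflow as tf

with open("input_cfg/metric_names.json") as f:
    metric_names = json.load(f)
# Map each logged name back to a function object of the same name,
# leaving built-in string metrics to Keras itself.
custom_objects = {name: globals()[name] for name in metric_names if name in globals()}
model = tf.keras.models.load_model("path/to/model", custom_objects=custom_objects)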
Example #7
    def evaluate_ner_seq_eval(self, batch_ner_labels, batch_ner_predictions,
                              labels: List[str], partition, head_identifier):
        id2label = {}
        entity_labels = labels
        for idx, label in enumerate(entity_labels):
            if label.endswith('NP'):
                label = label[:2] + head_identifier.split('_')[-1]
            elif label == 'BERT_TOKEN':
                label = 'O'
            id2label[idx] = label
        ner_ground_truth = [[id2label[idx] for idx in seq]
                            for seq in batch_ner_labels]
        ner_predictions = [[id2label[idx] for idx in seq]
                           for seq in batch_ner_predictions]

        # Get results
        default_results = classification_report(y_true=ner_ground_truth,
                                                y_pred=ner_predictions,
                                                output_dict=True,
                                                digits=3,
                                                mode='default',
                                                scheme=IOB2)
        default_results['performance'] = performance_measure(
            y_true=ner_ground_truth, y_pred=ner_predictions)
        default_results = {
            metric_group1:
            {metric: float(value)
             for metric, value in metric_group2.items()}
            for metric_group1, metric_group2 in default_results.items()
        }

        strict_results = classification_report(y_true=ner_ground_truth,
                                               y_pred=ner_predictions,
                                               output_dict=True,
                                               digits=3,
                                               mode='strict',
                                               scheme=IOB2)
        strict_results['performance'] = performance_measure(
            y_true=ner_ground_truth, y_pred=ner_predictions)
        strict_results = {
            metric_group1:
            {metric: float(value)
             for metric, value in metric_group2.items()}
            for metric_group1, metric_group2 in strict_results.items()
        }

        mlflow.log_dict(dict(lenient=default_results, strict=strict_results),
                        f"{partition}/{self.epoch}/{head_identifier}.json")
Example #8
def our_paper_arrg_to_aht(
    graph: nx.MultiDiGraph,
    max_number_of_nodes: int,
    weight: str = "weight",
    alpha_coefficient: float = 0.5,
    use_aspect_clustering: bool = False,
) -> nx.Graph:
    logger.info("Generate Aspect Hierarchical Tree based on ARRG")
    # aspects_rank = calculate_hits(graph)
    aspects_rank = nx.in_degree_centrality(graph)
    # aspects_rank = calculate_weighted_page_rank(graph, "weight")
    graph = calculate_weight(graph=graph,
                             ranks=aspects_rank,
                             alpha_coefficient=alpha_coefficient)
    graph_flatten = merge_multiedges(
        graph,
        node_attrib_name=weight,
        default_node_weight=0,
        n_clusters=max_number_of_nodes,
        use_aspect_clustering=use_aspect_clustering,
    )
    aspects_rank = [(aspect, score) for aspect, score in aspects_rank.items()
                    if aspect in graph_flatten]
    sorted_nodes = sorted(list(aspects_rank),
                          key=lambda node_value: node_value[1],
                          reverse=True)[:max_number_of_nodes]

    mlflow.log_dict(dict(sorted_nodes), "arrg_aspect_ranks.json")

    arrg_relation_weights = {
        f"{source}->{target}": data[weight]
        for source, target, data in graph_flatten.edges(data=True)
    }
    mlflow.log_dict(
        dict(
            sorted(arrg_relation_weights.items(),
                   key=lambda t: t[1],
                   reverse=True)),
        "arrg_relation_weights.json",
    )

    maximum_spanning_tree = nx.maximum_spanning_tree(graph_flatten,
                                                     weight=weight)
    nx.set_node_attributes(maximum_spanning_tree,
                           dict(graph_flatten.nodes.items()))
    return maximum_spanning_tree
Example #9
    def _generate_confusion_artifacts(
        self,
        artifact_dir: str,
        metadata: sequences.SequenceMetadata,
        model: models.BaseModel,
        test_dataset: tf.data.Dataset,
    ):
        prediction_output_calculator = analysis.PredictionOutputCalculator(
            metadata,
            model.prediction_model,
        )
        prediction_output_calculator.write_prediction_output_for_dataset(
            test_dataset,
            out_file_name=artifact_dir + "prediction_output.csv",
        )

        mlflow.log_dict(metadata.x_vocab, "x_vocab.json")
        mlflow.log_dict(metadata.y_vocab, "y_vocab.json")
Example #10
def run(features_select, target, run_name, experiment_id):
    with mlflow.start_run(run_name=run_name, experiment_id=experiment_id):
        X, Y = df_select[features_select], df_select[target]
        score = np.mean(cross_val_score(model, X, Y, cv=5))
        mlflow.log_metric("CV score", score)
        clf = model.fit(X, Y)
        feature_importance = sorted([[
            features_select[i],
            abs(np.sum(df_select[features_select[i]]) * clf.coef_[0][i])
        ] for i in range(len(features_select))],
                                    key=itemgetter(1),
                                    reverse=True)
        mlflow.sklearn.log_model(model, "Logistic Regression")
        mlflow.log_dict([[x[0], f"importance = {str(x[1])}"]
                         for x in feature_importance],
                        "feature_importance.yml")
    return feature_importance
Example #11
    def _build_model(
        self,
        metadata: sequences.SequenceMetadata,
        base_knowledge: knowledge.BaseKnowledge,
        model: models.BaseModel,
    ) -> knowledge.BaseKnowledge:
        if (self.config.noise_to_add > 0 or self.config.noise_to_remove > 0
                or self.config.attention_noise_to_remove > 0):
            noise_knowledge = knowledge.NoiseKnowledge(base_knowledge)
            noise_knowledge.remove_lowest_connections(
                percentage=self.config.attention_noise_to_remove,
                connections_reference_file=self.config.
                attention_weight_reference_file,
            )
            noise_knowledge.add_random_connections(
                percentage=self.config.noise_to_add)
            noise_knowledge.remove_random_connections(
                percentage=self.config.noise_to_remove)

            mlflow.set_tag(
                "noise_type",
                "added{}_removed{}_threshold{}".format(
                    self.config.noise_to_add,
                    self.config.noise_to_remove,
                    self.config.attention_noise_to_remove,
                ),
            )
            (
                original_connections_text,
                noise_connections_text,
            ) = noise_knowledge.get_text_connections()
            mlflow.log_dict(
                original_connections_text,
                "original_knowledge.json",
            )
            mlflow.log_dict(
                noise_connections_text,
                "noise_knowledge.json",
            )
            model.build(metadata, noise_knowledge)
            return noise_knowledge
        model.build(metadata, base_knowledge)
        return base_knowledge
Example #12
def test_log_dict(subdir, extension):
    dictionary = {"k": "v"}
    filename = "data" + extension
    artifact_file = filename if subdir is None else posixpath.join(
        subdir, filename)

    with mlflow.start_run():
        mlflow.log_dict(dictionary, artifact_file)

        artifact_path = None if subdir is None else posixpath.normpath(subdir)
        artifact_uri = mlflow.get_artifact_uri(artifact_path)
        run_artifact_dir = local_file_uri_to_path(artifact_uri)
        assert os.listdir(run_artifact_dir) == [filename]

        filepath = os.path.join(run_artifact_dir, filename)
        extension = os.path.splitext(filename)[1]
        with open(filepath) as f:
            loaded = (yaml.safe_load(f) if extension in (".yml", ".yaml")
                      else json.load(f))
            assert loaded == dictionary
Example #13
def compile_model(model,
                  opt_name,
                  learning_rate,
                  strmetrics,
                  schedule_decay=1e-4):
    opt = getattr(tf.keras.optimizers, opt_name)(learning_rate=learning_rate,
                                                 schedule_decay=schedule_decay)

    metrics = []
    for m in strmetrics:
        if "TauLosses" in m: m = eval(m)
        metrics.append(m)
    model.compile(
        loss=None, optimizer=opt, metrics=metrics,
        weighted_metrics=metrics)  # loss is now defined in DeepTauModel

    # log metric names for passing them during model loading
    metric_names = {(m if isinstance(m, str) else m.__name__): ''
                    for m in metrics}
    mlflow.log_dict(metric_names, 'input_cfg/metric_names.json')
Example #14
    def _log_pretraining_metadata(estimator, params):

        if params and isinstance(params, dict):
            estimator = estimator.copy(params)

        autologging_metadata = _gen_estimator_metadata(estimator)

        artifact_dict = {}

        param_map = _get_instance_param_map(estimator, autologging_metadata.uid_to_indexed_name_map)
        if _should_log_hierarchy(estimator):
            artifact_dict["hierarchy"] = autologging_metadata.hierarchy

        for param_search_estimator in autologging_metadata.param_search_estimators:
            param_search_estimator_name = (
                f"{autologging_metadata.uid_to_indexed_name_map[param_search_estimator.uid]}"
            )
            artifact_dict[param_search_estimator_name] = {}

            artifact_dict[param_search_estimator_name][
                "tuning_parameter_map_list"
            ] = _get_tuning_param_maps(
                param_search_estimator, autologging_metadata.uid_to_indexed_name_map
            )

            artifact_dict[param_search_estimator_name][
                "tuned_estimator_parameter_map"
            ] = _get_instance_param_map_recursively(
                param_search_estimator.getEstimator(),
                1,
                autologging_metadata.uid_to_indexed_name_map,
            )

        if artifact_dict:
            mlflow.log_dict(artifact_dict, artifact_file="estimator_info.json")

        _log_estimator_params(param_map)

        mlflow.set_tags(_get_estimator_info_tags(estimator))
Example #15
def test_log_dict(subdir, extension):
    dictionary = {"k": "v"}
    filename = "data" + extension
    artifact_file = filename if subdir is None else posixpath.join(
        subdir, filename)

    with mlflow.start_run():
        mlflow.log_dict(dictionary, artifact_file)

        artifact_path = None if subdir is None else posixpath.normpath(subdir)
        artifact_uri = mlflow.get_artifact_uri(artifact_path)
        run_artifact_dir = local_file_uri_to_path(artifact_uri)
        assert os.listdir(run_artifact_dir) == [filename]

        filepath = os.path.join(run_artifact_dir, filename)
        extension = os.path.splitext(filename)[1]
        with open(filepath) as f:
            loaded = (
                # Specify `Loader` to suppress the following deprecation warning:
                # https://github.com/yaml/pyyaml/wiki/PyYAML-yaml.load(input)-Deprecation
                yaml.load(f, Loader=yaml.SafeLoader) if
                (extension in [".yml", ".yaml"]) else json.load(f))
            assert loaded == dictionary
Example #16
def _mlflow_log_params_dict(
    dictionary: Dict[str, Any],
    prefix: Optional[str] = None,
    log_type: Optional[str] = None,
    exclude: Optional[List[str]] = None,
):
    """The function of MLflow. Logs any value by its type from dictionary recursively.

    Args:
        dictionary: Values to log as dictionary.
        prefix: Prefix for parameter name (if the parameter is composite).
        log_type: The entity of logging (param, metric, artifact, image, etc.).
        exclude: Keys in the dictionary to exclude from logging.

    Raises:
        ValueError: If an unknown type or log_type for logging in MLflow is
            encountered (add a new case if needed).
    """
    for name, value in dictionary.items():
        if exclude is not None and name in exclude:
            continue

        name = re.sub(r"\W", "", name)
        name = f"{prefix}/{name}" if prefix else name

        if log_type == "dict":
            mlflow.log_dict(dictionary, name)
        elif isinstance(value, dict):
            _mlflow_log_params_dict(value, name, log_type, exclude)
        elif log_type == "param":
            try:
                mlflow.log_param(name, value)
            except mlflow.exceptions.MlflowException:
                continue
        else:
            raise ValueError(
                f"Unknown type of logging value: type({value})={type(value)}")
Example #17
def main(
    n_jobs: int,
    batch_size: int,
    aht_max_number_of_nodes: int,
    alpha_coefficient: float,
    experiment_id: Union[str, int],
    overwrite_neighborhood: bool,
    filter_graphs_to_intersected_vertices: bool,
):
    filter_graphs_to_intersected_vertices = bool(
        filter_graphs_to_intersected_vertices)
    for dataset_path, max_reviews in tqdm(
            datasets, desc="Amazon datasets processing..."):
        for experiment_name in [
                experiment_name_enum.GERANI,
                experiment_name_enum.OUR_ALL_RULES,
                experiment_name_enum.OUR_TOP_1_RULES,
        ]:

            with mlflow.start_run(
                    experiment_id=experiment_id,
                    run_name=
                    f"{experiment_name}-{dataset_path.stem}-{max_reviews}",
            ):
                mlflow.log_param("experiment_name", experiment_name)

                aspect_analysis = AspectAnalysis(
                    input_path=dataset_path.as_posix(),
                    output_path=settings.DEFAULT_OUTPUT_PATH /
                    dataset_path.stem,
                    experiment_name=experiment_name,
                    jobs=n_jobs,
                    batch_size=batch_size,
                    max_docs=max_reviews,
                    aht_max_number_of_nodes=aht_max_number_of_nodes,
                    alpha_coefficient=alpha_coefficient,
                )

                if experiment_name == experiment_name_enum.OUR_ALL_RULES:
                    aspect_analysis.our_pipeline()
                elif experiment_name == experiment_name_enum.GERANI:
                    aspect_analysis.gerani_pipeline()
                elif experiment_name == experiment_name_enum.OUR_TOP_1_RULES:
                    aspect_analysis.our_pipeline_top_n_rules_per_discourse_tree(
                    )
                else:
                    raise Exception("Wrong experiment type")

                for conceptnet_graph_path in tqdm(
                        CONCEPTNET_GRAPH_TOOL_GRAPHS,
                        desc="Conceptnet graph analysis..."):

                    with mlflow.start_run(
                            experiment_id=experiment_id,
                            run_name=conceptnet_graph_path.stem,
                            nested=True,
                            # run_id=f'{experiment_id}-{conceptnet_graph_path.stem}'
                    ) as run_conceptnet:

                        mlflow.log_param("dataset_path", dataset_path)
                        mlflow.log_param("dataset_name", dataset_path.stem)
                        mlflow.log_param("method", experiment_name)
                        mlflow.log_param("max_docs", max_reviews)
                        mlflow.log_param("batch_size", batch_size)
                        mlflow.log_param("n_jobs", n_jobs)
                        mlflow.log_param("conceptnet_graph_path",
                                         conceptnet_graph_path)
                        mlflow.log_param("conceptnet_graph_name",
                                         conceptnet_graph_path.stem)
                        mlflow.log_param("aht_max_number_of_nodes",
                                         aht_max_number_of_nodes)
                        mlflow.log_param("alpha_coefficient",
                                         alpha_coefficient)

                        png_file_path = (
                            aspect_analysis.paths.experiment_path /
                            f"shortest_paths_correlation_{conceptnet_graph_path.stem}.png"
                        )

                        if png_file_path.exists() and not overwrite_neighborhood:
                            logger.info(
                                f"{png_file_path.as_posix()} already exists, skipping to the next setting."
                            )
                            mlflow.log_artifact(png_file_path.as_posix())
                        else:
                            df = prepare_hierarchies_neighborhood(
                                experiments_path=aspect_analysis.paths,
                                conceptnet_graph_path=conceptnet_graph_path,
                                filter_graphs_to_intersected_vertices=
                                filter_graphs_to_intersected_vertices,
                            )

                            logger.info(
                                f"Shortest Paths pairs - data frame: {len(df)}"
                            )
                            df = df[~((df.shortest_distance_aspect_graph.
                                       isin(VALUES_TO_SKIP))
                                      | (df.shortest_distance_conceptnet.
                                         isin(VALUES_TO_SKIP)))]
                            df = df.drop_duplicates(subset=["aspect_1", "aspect_2"])
                            mlflow.log_metric("number_of_shortest_paths",
                                              len(df))
                            logger.info(
                                f"Shortest Paths pairs - data frame, without no paths and duplicates: {len(df)}"
                            )

                            mlflow.log_dict(
                                pd.DataFrame(
                                    df.shortest_distance_aspect_graph.
                                    value_counts()).to_dict(orient="index"),
                                "shortest_distance_aspect_graph_distribution.json",
                            )

                            mlflow.log_dict(
                                pd.DataFrame(
                                    df.shortest_distance_conceptnet.
                                    value_counts()).to_dict(orient="index"),
                                "shortest_distance_conceptnet_distribution.json",
                            )

                            df = df[df.shortest_distance_aspect_graph <= 6]

                            matplotlib.rc_file_defaults()
                            sns.set_style(style=None, rc=None)
                            fig, ax1 = plt.subplots()
                            sns_plot = sns.lineplot(
                                x=df.shortest_distance_aspect_graph,
                                y=df.shortest_distance_conceptnet,
                                ax=ax1,
                            )
                            ax2 = ax1.twinx()
                            df_aspect_graph_distance_distribution = pd.DataFrame(
                                df.shortest_distance_aspect_graph.value_counts(
                                ))
                            df_aspect_graph_distance_distribution.reset_index(
                                inplace=True)
                            df_aspect_graph_distance_distribution.sort_values(
                                by="index", inplace=True)
                            sns.barplot(
                                x=df_aspect_graph_distance_distribution[
                                    "index"],
                                y=df_aspect_graph_distance_distribution.
                                shortest_distance_aspect_graph,
                                alpha=0.5,
                                ax=ax2,
                            )
                            logger.info(
                                f"Shortest Paths correlation figure will be saved in {png_file_path}"
                            )
                            df.sort_values(by="shortest_distance_conceptnet",
                                           inplace=True)
                            pearson_correlation = df.shortest_distance_aspect_graph.corr(
                                df.shortest_distance_conceptnet)
                            spearman_correlation = df.shortest_distance_aspect_graph.corr(
                                df.shortest_distance_conceptnet,
                                method="spearman")
                            kendall_correlation = df.shortest_distance_aspect_graph.corr(
                                df.shortest_distance_conceptnet,
                                method="kendall")
                            df_csv_path = (
                                aspect_analysis.paths.experiment_path /
                                "df.csv")
                            df.to_csv(df_csv_path.as_posix())
                            mlflow.log_artifact(df_csv_path.as_posix())
                            mlflow.log_metrics({
                                "pearson": pearson_correlation,
                                "spearman": spearman_correlation,
                                "kendall": kendall_correlation,
                            })
                            sns_plot.figure.savefig(png_file_path.as_posix())
                            plt.close()

                            mlflow.log_artifact(png_file_path.as_posix())
Example #18
    def _log_posttraining_metadata(estimator, spark_model, params, input_df):

        if _is_parameter_search_estimator(estimator):
            try:
                # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
                # information is consistent with the parent run
                child_tags = context_registry.resolve_tags()
                child_tags.update(
                    {MLFLOW_AUTOLOGGING: AUTOLOGGING_INTEGRATION_NAME})
                _create_child_runs_for_parameter_search(
                    parent_estimator=estimator,
                    parent_model=spark_model,
                    parent_run=mlflow.active_run(),
                    child_tags=child_tags,
                )
            except Exception:
                import traceback

                msg = (
                    "Encountered exception during creation of child runs for parameter search."
                    " Child runs may be missing. Exception: {}".format(
                        traceback.format_exc()))
                _logger.warning(msg)

            estimator_param_maps = _get_tuning_param_maps(
                estimator,
                estimator._autologging_metadata.uid_to_indexed_name_map)

            metrics_dict, best_index = _get_param_search_metrics_and_best_index(
                estimator, spark_model)
            _log_parameter_search_results_as_artifact(
                estimator_param_maps, metrics_dict,
                mlflow.active_run().info.run_id)

            # Log best_param_map as JSON artifact
            best_param_map = estimator_param_maps[best_index]
            mlflow.log_dict(best_param_map,
                            artifact_file="best_parameters.json")

            # Log best_param_map as autologging parameters as well
            _log_estimator_params({
                f"best_{param_name}": param_value
                for param_name, param_value in best_param_map.items()
            })

        if log_models:
            if _should_log_model(spark_model):
                from mlflow.models import infer_signature
                from mlflow.pyspark.ml._autolog import (
                    cast_spark_df_with_vector_to_array,
                    get_feature_cols,
                )
                from mlflow.spark import _find_and_set_features_col_as_vector_if_needed
                from pyspark.sql import SparkSession

                spark = SparkSession.builder.getOrCreate()

                def _get_input_example_as_pd_df():
                    feature_cols = list(
                        get_feature_cols(input_df.schema, spark_model))
                    limited_input_df = input_df.select(feature_cols).limit(
                        INPUT_EXAMPLE_SAMPLE_ROWS)
                    return cast_spark_df_with_vector_to_array(
                        limited_input_df).toPandas()

                def _infer_model_signature(input_example_slice):
                    input_slice_df = _find_and_set_features_col_as_vector_if_needed(
                        spark.createDataFrame(input_example_slice),
                        spark_model)
                    model_output = spark_model.transform(input_slice_df).drop(
                        *input_slice_df.columns)
                    return infer_signature(input_example_slice,
                                           model_output.toPandas())

                input_example, signature = resolve_input_example_and_signature(
                    _get_input_example_as_pd_df,
                    _infer_model_signature,
                    log_input_examples,
                    log_model_signatures,
                    _logger,
                )

                mlflow.spark.log_model(
                    spark_model,
                    artifact_path="model",
                    registered_model_name=registered_model_name,
                    input_example=input_example,
                    signature=signature,
                )
                if _is_parameter_search_model(spark_model):
                    mlflow.spark.log_model(
                        spark_model.bestModel,
                        artifact_path="best_model",
                    )
            else:
                _logger.warning(
                    _get_warning_msg_for_skip_log_model(spark_model))
Example #19
    def _log_percentile_mapping_to_mlflow(self):
        percentile_mapping = self._create_percentile_mapping()
        mlflow.log_dict(percentile_mapping, "percentile_mapping.json")
Example #20
def train_epochs(epochs, batch_size, token_size, hidden_size, embedding_size):

    # Read data

    x_train_full = open("../input/wili-2018/x_train.txt").read().splitlines()
    y_train_full = open("../input/wili-2018/y_train.txt").read().splitlines()

    x_test_full = open("../input/wili-2018/x_test.txt").read().splitlines()
    y_test_full = open("../input/wili-2018/y_test.txt").read().splitlines()

    # Get encoders

    char_vocab = Dictionary().char_dict(x_train_full)
    lang_vocab = Dictionary().lang_dict(y_train_full)

    # Convert data

    x_train_idx, y_train_idx = Encoder().encode_labeled_data(
        x_train_full,
        y_train_full,
        char_vocab,
        lang_vocab)
    x_test_idx, y_test_idx = Encoder().encode_labeled_data(
        x_test_full,
        y_test_full,
        char_vocab,
        lang_vocab)

    x_train, x_val, y_train, y_val = train_test_split(x_train_idx, y_train_idx, test_size=0.15)

    train_data = [(x, y) for x, y in zip(x_train, y_train)]
    val_data = [(x, y) for x, y in zip(x_val, y_val)]
    test_data = [(x, y) for x, y in zip(x_test_idx, y_test_idx)]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if not torch.cuda.is_available():
        logging.warning("WARNING: CUDA is not available.")

    criterion = torch.nn.CrossEntropyLoss(reduction='sum')

    bidirectional = False
    ntokens = len(char_vocab)
    nlabels = len(lang_vocab)
    pad_index = char_vocab.pad_index

    model, optimizer = get_model(
        ntokens,
        embedding_size,
        hidden_size,
        nlabels,
        bidirectional,
        pad_index,
        device)

    with mlflow.start_run():

        mlflow.log_metrics(
            {
                "train samples": len(train_data),
                "val samples": len(val_data),
                "test samples": len(test_data)
                }
            )

        mlflow.log_dict(lang_vocab.token2idx, "lang_vocab.json")
        mlflow.log_dict(char_vocab.token2idx, "char_vocab.json")
        params = {'epochs': epochs, 'batch_size': batch_size, 'token_size': token_size, 'hidden_size': hidden_size, 'embedding_size': embedding_size}
        mlflow.log_dict(params, "params.json")

        logging.info(f'Training cross-validation model for {epochs} epochs')

        for epoch in range(epochs):
            train_acc = train(model, optimizer, train_data, batch_size, token_size, criterion, device)
            logging.info(f'| epoch {epoch:02d} | train accuracy={train_acc:.1f}%')

            validate(model, val_data, batch_size, token_size, device, lang_vocab, tag='val', epoch=epoch)
            validate(model, test_data, batch_size, token_size, device, lang_vocab, tag='test', epoch=epoch)

            mlflow.pytorch.log_model(model, f'{epoch:02d}.model')

    mlflow.pytorch.log_model(model, 'model')
Example #21
config.update("jax_debug_nans", True)

# TODO: use **kwargs to reduce params

if __name__ == "__main__":
    problem = ModelContClassifier()
    problem = ProblemWraper(problem)

    with open(problem.HPARAMS_PATH, "r") as hfile:
        hparams = json.load(hfile)
    mlflow.set_tracking_uri(hparams['meta']["mlflow_uri"])
    mlflow.set_experiment(hparams['meta']["name"])

    with mlflow.start_run(run_name=hparams['meta']["method"] + "-" +
                          hparams["meta"]["optimizer"]) as run:
        mlflow.log_dict(hparams, artifact_file="hparams/hparams.json")
        mlflow.log_text("", artifact_file="output/_touch.txt")
        artifact_uri = mlflow.get_artifact_uri("output/")
        hparams["meta"]["output_dir"] = artifact_uri
        print(f"URI: {artifact_uri}")
        start_time = datetime.now()

        if hparams["n_perturbs"] > 1:
            for perturb in range(hparams["n_perturbs"]):
                print(f"Running perturb {perturb}")
                continuation = ContinuationCreator(
                    problem=problem, hparams=hparams,
                    key=perturb).get_continuation_method()
                continuation.run()
        else:
            continuation = ContinuationCreator(
                problem=problem,
                hparams=hparams).get_continuation_method()
            continuation.run()
Example #22
import pprint
import pandas_datareader
import pandas
import mlflow
from pandas_profiling import ProfileReport
import great_expectations as ge
from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler

if __name__ == "__main__":
    with mlflow.start_run(run_name="check_verify_data") as run:

        mlflow.set_tag("mlflow.runName", "check_verify_data")

        df = pandas.read_csv("./data/raw/data.csv")

        describe_to_dict = df.describe().to_dict()
        mlflow.log_dict(describe_to_dict, "describe_data.json")

        pd_df_ge = ge.from_pandas(df)

        assert pd_df_ge.expect_column_values_to_match_strftime_format(
            "Date", "%Y-%m-%d").success
        assert pd_df_ge.expect_column_values_to_be_of_type("High", "float").success
        assert pd_df_ge.expect_column_values_to_be_of_type("Low", "float").success
        assert pd_df_ge.expect_column_values_to_be_of_type("Open", "float").success
        assert pd_df_ge.expect_column_values_to_be_of_type("Close", "float").success
        assert pd_df_ge.expect_column_values_to_be_of_type("Volume", "long").success
Example #23
import mlflow

if __name__ == '__main__':
    dictionary = {
        "conference": "Data + AI",
        "location": "virtual/global",
        "year": 2021,
        "theme": "Future is open"
    }
    with mlflow.start_run():
        # Log a dictionary as a JSON file under the run's root artifact directory
        mlflow.log_dict(dictionary, "data.json")

        # Log a dictionary as a YAML file in a subdirectory of the run's root artifact directory
        mlflow.log_dict(dictionary, "dir/data.yml")

        # If the file extension doesn't exist or match any of [".json", ".yaml", ".yml"],
        # JSON format is used.
        mlflow.log_dict(dictionary, "data.txt")
Example #24
# Optional model dump; after all, models can be quite large and blow up storage
if dict_args.pop('log_model'):
    print("Gonna LOG the model...")
    mlflow.sklearn.log_model(pbg, 'pbg_model_spacy')

# ## Dump the theme-topic mapping

temas = json.load(open('temas.json'))
tema_topico = {}
topicos = pbg.get_topics(20)
for tema, topico in pbg.map_class_.items():
    if tema > 0:
        tema = str(tema)
        tema_topico[temas[tema]] = topicos[topico]
mlflow.log_dict(
    tema_topico,
    "tema_topico.json",
)

import dump_temas_e_topicos

tema_topico = dump_temas_e_topicos.get_tema_topico(pbg)
topicos_dict_sem_tema = dump_temas_e_topicos.get_topicos_sem_tema(pbg)

DUMP_FOLDER = 'tema_topico'
os.makedirs(DUMP_FOLDER, exist_ok=True)

tema_topico_path = f'{DUMP_FOLDER}/tema_topico_{RUN_ID}.json'
json.dump(tema_topico,
          open(tema_topico_path, 'w'),
          indent=4 * ' ',
          ensure_ascii=False)
Example #25
    def _log_model_with_mlflow(
        self,
        model: OpenstfRegressor,
        experiment_name: str,
        model_type: str,
        model_specs: ModelSpecificationDataClass,
        report: Report,
        phase: str,
        **kwargs,
    ) -> None:
        """Log model with MLflow.
        Note: **kwargs has extra information to be logged with mlflow
        """
        # Get previous run id
        models_df = self._find_models(
            experiment_name, max_results=1
        )  # returns latest model
        if not models_df.empty:
            previous_run_id = models_df["run_id"][
                0
            ]  # Use [0] to only get latest run id
        else:
            self.logger.info(
                "No previous model found in MLflow", experiment_name=experiment_name
            )
            previous_run_id = None

        # Set tags to the run, can be used to filter on the UI
        mlflow.set_tag("run_id", mlflow.active_run().info.run_id)
        mlflow.set_tag("phase", phase)  # phase can be Training or Hyperparameter_opt
        mlflow.set_tag("Previous_version_id", previous_run_id)
        mlflow.set_tag("model_type", model_type)
        mlflow.set_tag("prediction_job", experiment_name)

        # Add feature names, target, feature modules, metrics and params to the run
        mlflow.set_tag(
            "feature_names", model_specs.feature_names[1:]
        )  # feature names are 1+ columns
        mlflow.set_tag("target", model_specs.feature_names[0])  # target is first column
        mlflow.set_tag("feature_modules", model_specs.feature_modules)
        mlflow.log_metrics(report.metrics)
        model_specs.hyper_params.update(model.get_params())
        mlflow.log_params(model_specs.hyper_params)

        # Process args
        for key, value in kwargs.items():
            if isinstance(value, dict):
                mlflow.log_dict(value, f"{key}.json")
            elif isinstance(value, str) or isinstance(value, int):
                mlflow.set_tag(key, value)
            else:
                self.logger.warning(
                    f"Couldn't log {key}, {type(key)} not supported",
                    experiment_name=experiment_name,
                )

        # Log the model to the run. Signature describes model input and output scheme
        mlflow.sklearn.log_model(
            sk_model=model, artifact_path="model", signature=report.signature
        )
        self.logger.info("Model saved with MLflow", experiment_name=experiment_name)
Example #26
def generate_english_graph(
    graph_path: Union[str, Path],
    relation_types: Set[str] = None,
    synonymous_relations: Optional[List[str]] = None,
) -> gt.Graph:
    with mlflow.start_run(
            experiment_id=5,
            run_name=str(graph_path),
            nested=True,
    ):
        df = pd.read_csv(settings.CONCEPTNET_CSV_EN_PATH, index_col=0)
        relation_types = list(relation_types) if relation_types else None
        mlflow.log_dict(
            {
                "relations": relation_types or "all",
                "synonymous_relations": synonymous_relations or "None",
                "all_relations": len(df)
            },
            "filter-relations.json",
        )

        if synonymous_relations:
            synonyms_df = df[df.relation.isin(synonymous_relations)]
            all_synonyms = list(
                set(synonyms_df.target.tolist() + synonyms_df.source.tolist()))

            synonyms = defaultdict(list)
            for s, t in tqdm(
                    zip(synonyms_df.source.tolist(),
                        synonyms_df.target.tolist()),
                    desc="Generating synonyms mapping",
                    total=len(synonyms_df),
            ):
                s = str(s)
                t = str(t)
                synonyms[s] += [t]
                synonyms[t] += [s]

            synonyms = {k: set(v).union({k}) for k, v in synonyms.items()}

        if relation_types is not None:
            df = df[df.relation.isin(relation_types)]

        mlflow.log_dict(
            {k: int(v)
             for k, v in df.relation.value_counts().items()}, "relations.json")

        g = gt.Graph()
        e_relation = g.new_edge_property("string")
        v_aspect_name = g.new_vertex_property("string")

        if synonymous_relations:
            all_vertices_names = set(df.source.tolist() + df.target.tolist() +
                                     all_synonyms)
        else:
            all_vertices_names = set(df.source.tolist() + df.target.tolist())

        vertices = {}
        for aspect_name in tqdm(all_vertices_names,
                                desc="Vertices adding to the graph..."):
            v = g.add_vertex()
            vertices[aspect_name] = v
            v_aspect_name[v] = aspect_name

        g.vertex_properties["aspect_name"] = v_aspect_name

        def get_synonyms(vertex_name) -> List[str]:
            return synonyms[vertex_name] if vertex_name in synonyms else [
                vertex_name
            ]

        edge_adding_errors = 0
        for row in tqdm(df.itertuples(),
                        desc="Edges adding to the graph...",
                        total=len(df)):
            source = row.source  # satellite
            target = row.target  # nucleus

            # if N->S the reverse
            if row.relation in NUCLEUS_SATELLITE_RELATIONS:
                source, target = target, source

            if synonymous_relations:
                for s, t in product(get_synonyms(source),
                                    get_synonyms(target)):
                    try:
                        e = g.add_edge(vertices[s], vertices[t])
                        e_relation[e] = row.relation
                    except KeyError:
                        edge_adding_errors += 1
            else:
                e = g.add_edge(vertices[source], vertices[target])
                e_relation[e] = row.relation

        print(f"{edge_adding_errors} edges with errors skipped")
        mlflow.log_metrics({
            "edge_adding_errors": edge_adding_errors,
            "n_edges": g.num_edges(),
            "n_vertices": g.num_vertices(),
        })
        g.edge_properties["relation"] = e_relation
        g.save(str(graph_path))

        return g
Example #27
                        f1_train_norm = compute_f1(model, train_norm, 0)
                        f1_train_anom = compute_f1(model, train_anom, 1)
                        f1_test_norm = compute_f1(model, test_norm, 0)
                        f1_test_anom = compute_f1(model, test_anom, 1)

                        # log time
                        end_time = time.time()  # mark end
                        run_time = end_time - start_time  # calculate runtime based on fitting and scoring

                        # save metrics
                        idx = score_df.shape[0]
                        print(f'P{idx}, n-estimators: {n_estimators}, max-depth: {max_depth}, n-bins: {n_bins}, max-samples: {max_samples}, max-features: {max_features}')
                        score_df.loc[idx] = [n_estimators,max_depth,n_bins,max_samples,max_features,run_time,
                                             f1_train_norm,f1_train_anom,f1_test_norm,f1_test_anom]
    # log scoring dataframe
    mlflow.log_dict(score_df.to_dict(), 'score_df.json')
    
    # plot metrics
    ylabel_map = {'f1_train_norm':'F1-Score Training Normal', 'f1_train_anom':'F1-Score Training Anomaly',
                  'f1_test_norm':'F1-Score Testing Normal', 'f1_test_anom':'F1-Score Testing Anomaly', 'run_time': 'Run Time (Seconds)'}
    fig = plt.figure(figsize=[20,20])
    idx = 1  # counter for subplots
    for metric in ['f1_train_norm','f1_train_anom','f1_test_norm','f1_test_anom','run_time']:
        for param in ['n_estimators','max_depth','n_bins','max_samples','max_features']:
            # setup figure
            ax = fig.add_subplot(5, 5, idx)
            xs,ys = score_df[param], score_df[metric]
            # scatter
            ax.scatter(xs, ys)
            # fit line
            z = np.polyfit(xs, ys, 1)
Example #28
def main(cfg: DictConfig) -> None:
    # set up mlflow experiment id
    mlflow.set_tracking_uri(f"file://{to_absolute_path(cfg.path_to_mlflow)}")
    experiment = mlflow.get_experiment_by_name(cfg.experiment_name)

    if experiment is not None:
        run_kwargs = {'experiment_id': experiment.experiment_id}
        if cfg["pretrained"] is not None:  # initialise with pretrained run, otherwise create a new run
            run_kwargs['run_id'] = cfg["pretrained"]["run_id"]
    else:  # create new experiment
        experiment_id = mlflow.create_experiment(cfg.experiment_name)
        run_kwargs = {'experiment_id': experiment_id}

    # run the training with mlflow tracking
    with mlflow.start_run(**run_kwargs) as main_run:
        if cfg["pretrained"] is not None:
            mlflow.start_run(experiment_id=run_kwargs['experiment_id'],
                             nested=True)
        active_run = mlflow.active_run()
        run_id = active_run.info.run_id

        setup_gpu(cfg.gpu_cfg)
        training_cfg = OmegaConf.to_object(
            cfg.training_cfg)  # convert to python dictionary
        scaling_cfg = to_absolute_path(cfg.scaling_cfg)
        dataloader = DataLoader.DataLoader(training_cfg, scaling_cfg)
        setup = dataloader.config["SetupNN"]
        TauLosses.SetSFs(*setup["TauLossesSFs"])
        print("loss consts:", TauLosses.Le_sf, TauLosses.Lmu_sf,
              TauLosses.Ltau_sf, TauLosses.Ljet_sf)

        if setup["using_new_loss"]: tf.config.run_functions_eagerly(True)
        netConf_full = dataloader.get_net_config()

        if dataloader.input_type == "Adversarial":
            model = create_model(
                netConf_full,
                dataloader.model_name,
                loss=setup["loss"],
                use_newloss=setup["using_new_loss"],
                use_AdvDataset=True,
                adv_param=dataloader.adversarial_parameter,
                n_adv_tau=dataloader.adv_batch_size,
                adv_learning_rate=dataloader.adv_learning_rate)
        else:
            model = create_model(netConf_full,
                                 dataloader.model_name,
                                 loss=setup["loss"],
                                 use_newloss=setup["using_new_loss"])

        if cfg.pretrained is None:
            print(
                "Warning: no pretrained NN -> training will be started from scratch"
            )
            old_opt = None
        else:
            print("Warning: training will be started from pretrained model.")
            print(
                f"Model: run_id={cfg.pretrained.run_id}, experiment_id={cfg.pretrained.experiment_id}, model={cfg.pretrained.starting_model}"
            )

            path_to_pretrain = to_absolute_path(
                f'{cfg.path_to_mlflow}/{cfg.pretrained.experiment_id}/{cfg.pretrained.run_id}/artifacts/'
            )
            old_model = load_model(
                path_to_pretrain +
                f"/model_checkpoints/{cfg.pretrained.starting_model}",
                compile=False,
                custom_objects=None)
            for layer in model.layers:
                weights_found = False
                for old_layer in old_model.layers:
                    if layer.name == old_layer.name:
                        layer.set_weights(old_layer.get_weights())
                        weights_found = True
                        break
                if not weights_found:
                    print(f"Weights for layer '{layer.name}' not found.")
            old_opt = old_model.optimizer
            old_vars = [var.name for var in old_model.trainable_variables]

        compile_model(model, setup["optimizer_name"], setup["learning_rate"],
                      setup["metrics"], setup["schedule_decay"])
        fit_hist = run_training(model,
                                dataloader,
                                False,
                                cfg.log_suffix,
                                setup["using_new_loss"],
                                old_opt=old_opt)

        # log NN params
        for net_type in [
                'tau_net', 'comp_net', 'comp_merge_net', 'conv_2d_net',
                'dense_net'
        ]:
            mlflow.log_params({
                f'{net_type}_{k}': v
                for k, v in cfg.training_cfg.SetupNN[net_type].items()
            })
        mlflow.log_params({
            f'TauLossesSFs_{i}': v
            for i, v in enumerate(cfg.training_cfg.SetupNN.TauLossesSFs)
        })
        with open(
                to_absolute_path(
                    f'{cfg.path_to_mlflow}/{run_kwargs["experiment_id"]}/{run_id}/artifacts/model_summary.txt'
                )) as f:
            for l in f:
                if (s := 'Trainable params: ') in l:
                    mlflow.log_param('n_train_params',
                                     int(l.split(s)[-1].replace(',', '')))

        # log training related files
        mlflow.log_dict(training_cfg, 'input_cfg/training_cfg.yaml')
        mlflow.log_artifact(scaling_cfg, 'input_cfg')
        mlflow.log_artifact(to_absolute_path("Training_CNN.py"), 'input_cfg')
        mlflow.log_artifact(to_absolute_path("common.py"), 'input_cfg')

        # log hydra files
        mlflow.log_artifacts('.hydra', 'input_cfg/hydra')
        mlflow.log_artifact('Training_CNN.log', 'input_cfg/hydra')

        # log misc. info
        mlflow.log_param('run_id', run_id)
        mlflow.log_param('git_commit', _get_git_commit(to_absolute_path('.')))
        print(
            f'\nTraining has finished! Corresponding MLflow experiment name (ID): {cfg.experiment_name}({run_kwargs["experiment_id"]}), and run ID: {run_id}\n'
        )

        # Temporary workaround to kill additional subprocesses that have not exited correctly
        try:
            current_process = psutil.Process()
            children = current_process.children(recursive=True)
            for child in children:
                child.kill()
        except Exception:
            pass