def layer_conductance(net, test_input_tensor):
    """
    To use Layer Conductance, we create a LayerConductance object passing in the model as well as the module (layer) whose output we would like to understand.
    In this case, we choose net.sigmoid1, the output of the first hidden layer.
    Now obtain the conductance values for all the test examples by calling attribute on the LayerConductance object.
    LayerConductance also requires a target index for networks with multiple outputs, defining the index of the output for which gradients are computed.
    Similar to feature attributions, we provide target = 1, corresponding to survival.
    LayerConductance also utilizes a baseline, but we simply use the default zero baseline as in integrated gradients.
    """

    cond = LayerConductance(net, net.sigmoid1)

    cond_vals = cond.attribute(test_input_tensor, target=1)
    cond_vals = cond_vals.detach().numpy()
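    # LayerConductance.attribute also accepts an explicit `baselines` argument;
    # a minimal (hypothetical) sketch using an all-zero baseline tensor, which
    # should be equivalent to the default used above:
    #
    #   zero_baseline = torch.zeros_like(test_input_tensor)
    #   cond_vals_zero = cond.attribute(test_input_tensor,
    #                                   baselines=zero_baseline,
    #                                   target=1)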
    # We can begin by visualizing the average conductance for each neuron.
    neuron_names = ["neuron " + str(x) for x in range(12)]
    avg_neuron_imp, neuron_imp_dict = visualize_importances(
        neuron_names,
        np.mean(cond_vals, axis=0),
        title="Average Neuron Importances",
        axis_title="Neurons",
    )
    mlflow.log_metrics(neuron_imp_dict)
    mlflow.log_text(str(avg_neuron_imp), "neuron_imp_summary.txt")
    # We can also look at the distribution of each neuron's attributions. Below we look at the distributions for neurons 7 and 9,
    # and we can confirm that their attribution distributions are very close to 0, suggesting they are not learning substantial features.
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(9, 6))
    fig.tight_layout(pad=3)
    ax1.hist(cond_vals[:, 9], 100)
    ax1.set(title="Neuron 9 Distribution")
    ax2.hist(cond_vals[:, 7], 100)
    ax2.set(title="Neuron 7 Distribution")
    mlflow.log_figure(fig, "Neurons_Distribution.png")
def neuron_conductance(net, test_input_tensor, neuron_selector=None):
    """
    We have identified that some of the neurons are not learning important features, while others are.
    Can we now understand what each of these important neurons is looking at in the input?
    For instance, are they identifying different features in the input or similar ones?

    To answer these questions, we can apply the third type of attributions available in Captum, **Neuron Attributions**.
    This allows us to understand what parts of the input contribute to activating a particular hidden neuron. For this example,
    we will apply Neuron Conductance, which divides the neuron's total conductance value into the contribution from each individual input feature.

    To use Neuron Conductance, we create a NeuronConductance object, analogously to LayerConductance,
    passing in the model as well as the module (layer) whose output we would like to understand, in this case, net.sigmoid1, as before.
    """
    if neuron_selector is None:
        neuron_selector = 0
    neuron_cond = NeuronConductance(net, net.sigmoid1)

    # We can now obtain the neuron conductance values for all the test examples by calling attribute on the NeuronConductance object.
    # Neuron Conductance requires the neuron index in the target layer for which attributions are requested as well as the target index for networks with multiple outputs,
    # similar to layer conductance. As before, we provide target = 1, corresponding to survival, and compute conductance for the selected neuron (neuron 0 by default), one of the significant neurons identified above.
    # The neuron index can be provided either as a tuple or as just an integer if the layer output is 1-dimensional.

    neuron_cond_vals = neuron_cond.attribute(test_input_tensor,
                                             neuron_selector=neuron_selector,
                                             target=1)
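    # As noted above, for layers whose output has more than one dimension the
    # neuron index can instead be passed as a tuple; a hypothetical sketch for
    # a layer with 2-D per-example output would be:
    #
    #   neuron_cond.attribute(test_input_tensor, neuron_selector=(0, 3), target=1)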
    neuron_cond, _ = visualize_importances(
        feature_names,
        neuron_cond_vals.mean(dim=0).detach().numpy(),
        title="Average Feature Importances for Neuron {}".format(
            neuron_selector),
    )
    mlflow.log_text(
        str(neuron_cond),
        "Avg_Feature_Importances_Neuron_" + str(neuron_selector) + ".txt")
Example #3
def feature_conductance(test_input_tensor):
    ig = IntegratedGradients(net)
    test_input_tensor.requires_grad_()
    attr, _ = ig.attribute(test_input_tensor, target=1, return_convergence_delta=True)
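    # The second return value (discarded here) is the convergence delta, which
    # measures how far the attributions are from satisfying the completeness
    # axiom and can serve as a sanity check on the approximation quality.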
    attr = attr.detach().numpy()
    # To understand these attributions, we can first average them across all the inputs
    # and print and visualize the average attribution for each feature.
    feature_imp, feature_imp_dict = visualize_importances(feature_names, np.mean(attr, axis=0))
    mlflow.log_metrics(feature_imp_dict)
    mlflow.log_text(str(feature_imp), "feature_imp_summary.txt")
    fig, (ax1, ax2) = plt.subplots(2, 1)
    fig.tight_layout(pad=3)
    ax1.hist(attr[:, 1], 100)
    ax1.set(title="Distribution of Sibsp Attribution Values")

    # we can bucket the examples by the value of the sibsp feature and
    # plot the average attribution for the feature.
    # In the plot below, the size of the dot is proportional to
    # the number of examples with that value.

    bin_means, bin_edges, _ = stats.binned_statistic(
        test_features[:, 1], attr[:, 1], statistic="mean", bins=6
    )
    bin_count, _, _ = stats.binned_statistic(
        test_features[:, 1], attr[:, 1], statistic="count", bins=6
    )

    bin_width = bin_edges[1] - bin_edges[0]
    bin_centers = bin_edges[1:] - bin_width / 2
    ax2.scatter(bin_centers, bin_means, s=bin_count)
    ax2.set(xlabel="Average Sibsp Feature Value", ylabel="Average Attribution")
    mlflow.log_figure(fig, "Average_Sibsp_Feature_Value.png")
Example #4
    def on_train_begin(self, logs=None):  # pylint: disable=unused-argument
        config = self.model.optimizer.get_config()
        for attribute in config:
            mlflow.log_param("opt_" + attribute, config[attribute])

        sum_list = []
        self.model.summary(print_fn=sum_list.append)
        summary = "\n".join(sum_list)
        mlflow.log_text(summary, artifact_file="model_summary.txt")
Example #5
def train_model(model, X_train, y_train, name, config):
    """train
    train a single model.

    # Arguments
        model: Model, NN model to train.
        X_train: ndarray(number, lags), Input data for train.
        y_train: ndarray(number, ), result data for train.
        name: String, name of model.
        config: Dict, parameter for train.
    """
    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    tracking_uri = mlflow.get_tracking_uri()
    print("Current tracking uri: {}".format(tracking_uri))

    tags = {
        "usuario": "Anonymous"
    }
    
    mlflow.set_experiment("traffic_flow-saes")    
    with mlflow.start_run() as run:
        mlflow.set_tags(tags)
        mlflow.keras.autolog()
        
        model.compile(loss="mse", optimizer="rmsprop", metrics=['mape'])
        #early = EarlyStopping(monitor='val_loss', patience=30, verbose=0, mode='auto')
        hist = model.fit(
            X_train, y_train,
            batch_size=config["batch"],
            epochs=config["epochs"],
            validation_split=0.05)

        model.save('model/' + name + '.h5')
        df = pd.DataFrame.from_dict(hist.history)
        df.to_csv('model/' + name + ' loss.csv', encoding='utf-8', index=False)
        
        mlflow.log_param("Run_id", run.info.run_id)

        """ Save dvc as parameter """
        """
        with open('/home/VICOMTECH/icejudo/PROYECTOS/MiRepoGithub/dvc_repo/data.dvc', 'r') as file:
            data_version = file.read().replace('\n', '')
        mlflow.log_param('Data Version dvc', data_version)
        """
        """ Save dvc as artifact """
        
        with open('/home/VICOMTECH/icejudo/PROYECTOS/MiRepoGithub/dvc_repo/data.dvc', 'r') as file:
            data_version = file.read()
        mlflow.log_text(data_version, "data_version.txt")
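# A minimal usage sketch for train_model (the model object, data arrays and the
# config values below are hypothetical placeholders for the caller's own objects):
#
#   config = {"batch": 256, "epochs": 600}
#   train_model(saes_model, X_train, y_train, "saes", config)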
Example #6
    def on_train_begin(self, logs=None):  # pylint: disable=unused-argument
        config = self.model.optimizer.get_config()
        for attribute in config:
            mlflow.log_param("opt_" + attribute, config[attribute])

        sum_list = []
        try:
            self.model.summary(print_fn=sum_list.append)
            summary = "\n".join(sum_list)
            mlflow.log_text(summary, artifact_file="model_summary.txt")
        except ValueError as ex:
            if "This model has not yet been built" in str(ex):
                warnings.warn(str(ex))
            else:
                raise ex
Example #7
def test_log_text(subdir):
    filename = "file.txt"
    text = "a"
    artifact_file = filename if subdir is None else posixpath.join(subdir, filename)

    with mlflow.start_run():
        mlflow.log_text(text, artifact_file)

        artifact_path = None if subdir is None else posixpath.normpath(subdir)
        artifact_uri = mlflow.get_artifact_uri(artifact_path)
        run_artifact_dir = local_file_uri_to_path(artifact_uri)
        assert os.listdir(run_artifact_dir) == [filename]

        filepath = os.path.join(run_artifact_dir, filename)
        with open(filepath) as f:
            assert f.read() == text
Example #8
    def wrapper_fit(original, self, *args, **kwargs):

        should_autolog = False
        if AutologHelpers.should_autolog:
            AutologHelpers.should_autolog = False
            should_autolog = True
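        # Temporarily clearing the shared flag presumably prevents duplicate
        # logging if the patched fit() is re-entered (e.g. via nested internal
        # fit calls) while this outer call is still in progress.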

        try:
            if should_autolog:
                # This may generate warnings due to collisions in already-logged param names
                log_fn_args_as_params(original, args, kwargs)

            # training model
            model = original(self, *args, **kwargs)

            if should_autolog:
                # Log the model
                if get_autologging_config(FLAVOR_NAME, "log_models", True):
                    global _save_model_called_from_autolog
                    _save_model_called_from_autolog = True
                    registered_model_name = get_autologging_config(
                        FLAVOR_NAME, "registered_model_name", None
                    )
                    try:
                        log_model(
                            model,
                            artifact_path="model",
                            registered_model_name=registered_model_name,
                        )
                    finally:
                        _save_model_called_from_autolog = False

                # Log the most common metrics
                if isinstance(model, statsmodels.base.wrapper.ResultsWrapper):
                    metrics_dict = _get_autolog_metrics(model)
                    mlflow.log_metrics(metrics_dict)

                    model_summary = model.summary().as_text()
                    mlflow.log_text(model_summary, "model_summary.txt")

            return model

        finally:
            # Clean the shared flag for future calls in case it had been set here ...
            if should_autolog:
                AutologHelpers.should_autolog = True
Example #9
def train(USE_PRETRAINED_MODEL=False):
    net = TitanicSimpleNNModel()
    train_features, train_labels, test_features, test_labels, feature_names = prepare()
    USE_PRETRAINED_MODEL = dict_args["use_pretrained_model"]
    if USE_PRETRAINED_MODEL:
        net.load_state_dict(torch.load("models/titanic_state_dict.pt"))
        net.eval()
        print("Model Loaded!")
    else:
        criterion = nn.CrossEntropyLoss()
        num_epochs = dict_args["max_epochs"]
        mlflow.log_param("epochs", num_epochs)
        mlflow.log_param("lr", dict_args["lr"])

        optimizer = torch.optim.Adam(net.parameters(), lr=dict_args["lr"])
        input_tensor = torch.from_numpy(train_features).type(torch.FloatTensor)
        label_tensor = torch.from_numpy(train_labels)
        for epoch in range(num_epochs):
            output = net(input_tensor)
            loss = criterion(output, label_tensor)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if epoch % 50 == 0:
                print(
                    "Epoch {}/{} => Train Loss: {:.2f}".format(epoch + 1, num_epochs, loss.item())
                )
                mlflow.log_metric(
                    "Epoch {} Loss".format(str(epoch + 1)),
                    float(loss.item()),
                    step=epoch,
                )
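                # Note: this creates a separate metric key for each logged epoch;
                # a single metric name (e.g. "train_loss") logged with step=epoch
                # is the more common MLflow pattern.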
        if not os.path.isdir("models"):
            os.makedirs("models")
        torch.save(net.state_dict(), "models/titanic_state_dict.pt")
    summary, _ = count_model_parameters(net)
    mlflow.log_text(str(summary), "model_summary.txt")
    return (
        net,
        train_features,
        train_labels,
        test_features,
        test_labels,
        feature_names,
    )
def patched_fit(original, self, *args, **kwargs):
    run_id = mlflow.active_run().info.run_id
    tracking_uri = mlflow.get_tracking_uri()
    client = MlflowAutologgingQueueingClient(tracking_uri)
    metrics_logger = BatchMetricsLogger(run_id, tracking_uri)

    log_models = get_autologging_config(mlflow.paddle.FLAVOR_NAME,
                                        "log_models", True)
    log_every_n_epoch = get_autologging_config(mlflow.paddle.FLAVOR_NAME,
                                               "log_every_n_epoch", 1)

    early_stop_callback = None
    mlflow_callback = __MLflowPaddleCallback(client, metrics_logger, run_id,
                                             log_models, log_every_n_epoch)
    if "callbacks" in kwargs:
        callbacks = kwargs["callbacks"]
        for callback in callbacks:
            if isinstance(callback, paddle.callbacks.EarlyStopping):
                early_stop_callback = callback
                _log_early_stop_params(early_stop_callback, client, run_id)
                break
        kwargs["callbacks"].append(mlflow_callback)
    else:
        kwargs["callbacks"] = [mlflow_callback]
    client.flush(synchronous=False)

    result = original(self, *args, **kwargs)

    if early_stop_callback is not None:
        _log_early_stop_metrics(early_stop_callback, client, run_id)

    mlflow.log_text(str(self.summary()), "model_summary.txt")
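    # Note: paddle's Model.summary() (presumably) returns a dict of layer and
    # parameter statistics; its string form is what was logged just above.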

    if log_models:
        mlflow.paddle.log_model(pd_model=self, artifact_path="model")

    client.flush(synchronous=True)

    return result
Example #11
    def train_dataset(
        self,
        train_dataset: tf.data.Dataset,
        test_dataset: tf.data.Dataset,
        multilabel_classification: bool,
        n_epochs: int,
    ):
        with self.strategy.scope():
            if self.metadata.full_y_prediction:
                self._compile_full_prediction(train_dataset)
            elif len(self.metadata.y_vocab) == 1:
                self._compile_singleclass()
            elif multilabel_classification:
                self._compile_multilabel(train_dataset)
            else:
                self._compile_multiclass(train_dataset)

            model_summary = []
            self.prediction_model.summary(
                print_fn=lambda x: model_summary.append(x))
            mlflow.log_text("\n".join(model_summary),
                            artifact_file="model_summary.txt")

            self.history = self.prediction_model.fit(
                train_dataset,
                validation_data=test_dataset,
                epochs=n_epochs,
                callbacks=[
                    MLFlowCallback(),
                    BestModelRestoreCallback(
                        metric=self.config.best_model_metric,
                        minimize=self.config.best_model_metric_minimize,
                        early_stopping_epochs=self.config.early_stopping_epochs,
                    ),
                ],
            )
def aloha_workflow(base_dir,  # base tool path
                   use_cache=1,  # whether to skip already executed runs (in cache) or not (1/0)
                   ignore_git=0):  # whether to ignore git version or not (1/0)
    """ ALOHA MLflow workflow.

    Args:
        base_dir: Base tool path
        use_cache: Whether to skip already executed runs (in cache) or not (1/0) (default: 1)
        ignore_git: Whether to ignore git version or not (1/0) (default: 0)
    """

    # get some needed variables from config file
    runs = int(config['general']['runs'])
    workers = int(config['general']['workers'])

    batch_size = int(config['aloha']['batch_size'])
    epochs = int(config['aloha']['epochs'])
    use_malicious_labels = int(config['aloha']['use_malicious_labels'])
    use_count_labels = int(config['aloha']['use_count_labels'])
    use_tag_labels = int(config['aloha']['use_tag_labels'])
    gen_type = config['aloha']['gen_type']
    net_type = 'aloha'

    training_n_samples = int(config['sorel20mDataset']['training_n_samples'])
    validation_n_samples = int(config['sorel20mDataset']['validation_n_samples'])
    test_n_samples = int(config['sorel20mDataset']['test_n_samples'])

    min_n_anchor_samples = int(config['freshDataset']['min_n_anchor_samples'])
    max_n_anchor_samples = int(config['freshDataset']['max_n_anchor_samples'])
    fresh_n_queries = int(config['freshDataset']['n_queries'])
    n_evaluations = int(config['freshDataset']['n_evaluations'])

    # initialize Hash object
    ch = Hash()

    # update hash with the content of the config file (for the sorel20mDataset section)
    ch.update(json.dumps(dict(config.items('sorel20mDataset'))))
    # get config file sha256 digest
    dataset_config_sha = ch.get_b64()

    # update hash with the content of the config file (for the current net type)
    ch.update(json.dumps(dict(config.items(net_type))))
    # get config file sha256 digest
    config_sha = ch.get_b64()

    # update hash with the content of the config file (for the freshDataset)
    ch.update(json.dumps(dict(config.items('freshDataset'))))
    # get config file sha256 digest
    fresh_dataset_config_sha = ch.get_b64()
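    # Note: the Hash object is updated cumulatively, so each digest above
    # (presumably) covers its own config section together with every section
    # hashed before it.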

    # instantiate key-n_samples dict
    n_samples_dict = {'train': training_n_samples,
                      'validation': validation_n_samples,
                      'test': test_n_samples}

    # Note: The entrypoint names are defined in MLproject. The artifact directories
    # are documented by each step's .py file.

    # start mlflow run
    with mlflow.start_run() as active_run:
        # get code git commit version
        git_commit = active_run.data.tags.get(mlflow_tags.MLFLOW_GIT_COMMIT)

        # log config file
        mlflow.log_text(json.dumps({s: dict(config.items(s)) for s in config.sections()}), 'config.txt')

        # set dataset destination dir
        dataset_dir = os.path.join(base_dir, 'dataset')
        # set dataset base path (directory containing 'meta.db')
        dataset_base_path = os.path.join(dataset_dir, '09-DEC-2020', 'processed-data')
        # set pre-processed dataset base path (directory containing .dat files)
        pre_processed_dataset_dir = os.path.join(dataset_dir, '09-DEC-2020', 'pre-processed_dataset')
        # set fresh dataset base path (directory containing .dat files)
        fresh_dataset_dir = os.path.join(dataset_dir, 'fresh_dataset')

        # if pre-processed dataset files for this run parameters are not present, generate them
        if not preproc_check_files(destination_dir=pre_processed_dataset_dir,
                                   n_samples_dict=n_samples_dict):
            logger.info("Pre-processed dataset not found.")

            # if the original Sorel20M dataset is not present, download it
            if not download_check_files(dataset_dir):
                logger.info("Dataset not found.")

                # run dataset downloader
                download_dataset_run = run("download_dataset", {
                    'destination_dir': dataset_dir
                }, config_sha=dataset_config_sha)

            # pre-process dataset
            preprocess_dataset_run = run("preprocess_dataset", {
                'ds_path': dataset_base_path,
                'destination_dir': pre_processed_dataset_dir,
                'training_n_samples': training_n_samples,
                'validation_n_samples': validation_n_samples,
                'test_n_samples': test_n_samples,
                'batch_size': batch_size,
                'remove_missing_features': str(os.path.join(dataset_base_path, "shas_missing_ember_features.json"))
            }, config_sha=dataset_config_sha)

        # if the fresh dataset is not present, generate it
        if not fresh_check_files(fresh_dataset_dir):
            logger.info("Fresh dataset not found.")

            # generate fresh dataset
            build_fresh_dataset_run = run("build_fresh_dataset", {
                'dataset_dest_dir': fresh_dataset_dir
            }, config_sha=fresh_dataset_config_sha)

        results_files = {}

        # instantiate common (between consecutive training runs) training parameters
        common_training_params = {
            'ds_path': pre_processed_dataset_dir,
            'net_type': net_type,
            'gen_type': gen_type,
            'batch_size': batch_size,
            'epochs': epochs,
            'training_n_samples': training_n_samples,
            'validation_n_samples': validation_n_samples,
            'use_malicious_labels': use_malicious_labels,
            'use_count_labels': use_count_labels,
            'use_tag_labels': use_tag_labels,
            'workers': workers
        }

        # instantiate common (between consecutive training runs) evaluation parameters
        common_evaluation_params = {
            'ds_path': pre_processed_dataset_dir,
            'net_type': net_type,
            'gen_type': gen_type,
            'batch_size': batch_size,
            'test_n_samples': test_n_samples,
            'evaluate_malware': use_malicious_labels,
            'evaluate_count': use_count_labels
        }

        # for each training run
        for training_run_id in range(runs):
            logger.info("initiating training run n. {}".format(str(training_run_id)))

            # set training parameters
            training_params = common_training_params
            training_params.update({'training_run': training_run_id})

            # train network (get or run) on Sorel20M dataset
            training_run = get_or_run("train_network",
                                      training_params,
                                      git_commit,
                                      ignore_git=bool(ignore_git),
                                      use_cache=bool(use_cache),
                                      resume=True,
                                      config_sha=config_sha)

            # get model checkpoints path
            checkpoint_path = parse.unquote(parse.urlparse(os.path.join(training_run.info.artifact_uri,
                                                                        "model_checkpoints")).path)

            # set model checkpoint filename
            checkpoint_file = os.path.join(checkpoint_path, "epoch_{}.pt".format(epochs))

            # set evaluation parameters
            evaluation_params = common_evaluation_params
            evaluation_params.update({'checkpoint_file': checkpoint_file})

            # evaluate model against Sorel20M dataset
            evaluation_run = get_or_run("evaluate_network",
                                        evaluation_params,
                                        git_commit,
                                        ignore_git=bool(ignore_git),
                                        use_cache=bool(use_cache),
                                        config_sha=config_sha)

            # get model evaluation results path
            results_path = parse.unquote(parse.urlparse(os.path.join(evaluation_run.info.artifact_uri,
                                                                     "model_results")).path)

            # set model evaluation results filename
            results_file = os.path.join(results_path, "results.csv")

            # add file path to results_files dictionary (used for plotting mean results)
            results_files["run_id_" + str(training_run_id)] = results_file

            # compute (and plot) all tagging results
            all_tagging_results_run = get_or_run("compute_all_run_results", {
                'results_file': results_file,
                'use_malicious_labels': use_malicious_labels,
                'use_tag_labels': use_tag_labels
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=config_sha)

            # evaluate model against fresh dataset
            fresh_evaluation_run = get_or_run("evaluate_fresh", {
                'fresh_ds_path': fresh_dataset_dir,
                'checkpoint_path': checkpoint_file,
                'net_type': net_type,
                'min_n_anchor_samples': min_n_anchor_samples,
                'max_n_anchor_samples': max_n_anchor_samples,
                'n_query_samples': fresh_n_queries,
                'n_evaluations': n_evaluations
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=fresh_dataset_config_sha)

            # get model evaluation results path
            fresh_results_path = parse.unquote(parse.urlparse(os.path.join(fresh_evaluation_run.info.artifact_uri,
                                                                           "fresh_prediction_results")).path)

            # set model evaluation results filename
            fresh_results_file = os.path.join(fresh_results_path, "fresh_prediction_results.json")

            # compute (and plot) all family prediction results (on fresh dataset)
            all_tagging_results_run = get_or_run("compute_all_run_fresh_results", {
                'results_file': fresh_results_file
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=fresh_dataset_config_sha)

        # get overall parameters set (without duplicates and sorted)
        params_set = [str(p) for p in common_training_params.values()]
        params_set.extend([str(p) for p in common_evaluation_params.values() if str(p) not in params_set])
        params_set.sort()

        # instantiate hash
        h = Hash()
        # for each param in the parameters set, update the hash value
        for param in params_set:
            h.update(str(param))

        # create temp dir name using the value from the hash object.
        # -> This is done in order to have a different (but predictable) run_to_filename at each different run.
        # This in turn lets mlflow know when it actually needs to run 'per_tag_plot_runs'.
        tempdir = os.path.join(base_dir, 'tmp_{}'.format(h.get_b64()))
        os.makedirs(tempdir, exist_ok=True)

        # set run-to-filename file path
        run_to_filename = os.path.join(tempdir, "results.json")

        # create and open the results.json file in write mode
        with open(run_to_filename, "w") as output_file:
            # save results_files dictionary as a json file
            json.dump(results_files, output_file)

        # log run-to-filename
        mlflow.log_artifact(run_to_filename, "run_to_filename")

        # if there is more than 1 run, compute also per-tag mean results
        if runs > 1:
            # plot all roc distributions
            per_tag_plot_runs = get_or_run("plot_all_roc_distributions", {
                'run_to_filename_json': run_to_filename,
                'use_malicious_labels': use_malicious_labels,
                'use_tag_labels': use_tag_labels
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=config_sha)

        # remove temporary directory and run_to_filename file
        os.remove(run_to_filename)
        os.rmdir(tempdir)
Example #13
# TODO: use **kwargs to reduce params

if __name__ == "__main__":
    problem = ModelContClassifier()
    problem = ProblemWraper(problem)

    with open(problem.HPARAMS_PATH, "r") as hfile:
        hparams = json.load(hfile)
    mlflow.set_tracking_uri(hparams['meta']["mlflow_uri"])
    mlflow.set_experiment(hparams['meta']["name"])

    with mlflow.start_run(run_name=hparams['meta']["method"] + "-" +
                          hparams["meta"]["optimizer"]) as run:
        mlflow.log_dict(hparams, artifact_file="hparams/hparams.json")
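        # The empty text artifact below presumably just "touches" the output/
        # artifact directory so that its URI can be resolved right after and
        # handed to the continuation runs as their output directory.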
        mlflow.log_text("", artifact_file="output/_touch.txt")
        artifact_uri = mlflow.get_artifact_uri("output/")
        hparams["meta"]["output_dir"] = artifact_uri
        print(f"URI: {artifact_uri}")
        start_time = datetime.now()

        if hparams["n_perturbs"] > 1:
            for perturb in range(hparams["n_perturbs"]):
                print(f"Running perturb {perturb}")
                continuation = ContinuationCreator(
                    problem=problem, hparams=hparams,
                    key=perturb).get_continuation_method()
                continuation.run()
        else:
            continuation = ContinuationCreator(
                problem=problem, hparams=hparams).get_continuation_method()
Example #14
# if running in a notebook, uncomment these 2 lines
# import sys
# sys.argv = ['']

parser = argparse.ArgumentParser()
parser.add_argument("--nyc_taxi_dataset", default='AZURE_ML_INPUT0')
args = parser.parse_args()
dataset = args.nyc_taxi_dataset
print(f"dataset location: {dataset}")
os.system(f"find {dataset}")

# give other nodes some time to connect before we show the cluster setup
time.sleep(3)
c = Client("localhost:8786")
print(c)
mlflow.log_text(str(c), "dask_cluster")

# read in the data from the provided file dataset (which is mounted at the same
# location on all nodes of the job)
df = dd.read_csv(f'{dataset}/nyctaxi/*.csv',
                 parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

# as an alternative, the below would be using abfs
# but without making use of the dataset that was provided
# instead it is pulling the data from a hard-code path on the
# default_datastore
#
# from azureml.core import Run
# run = Run.get_context()
# ws = run.experiment.workspace
#
Example #15
        mlflow.sklearn.log_model(lr, "model")
        
        if is_test:
            import json

            # Create some files to preserve as artifacts
            features = "rooms, zipcode, median_price, school_rating, transport"
            data = {"state": "TX", "Available": 25, "Type": "Detached"}

            # Create couple of artifact files under the directory "data"
            os.makedirs("data", exist_ok=True)
            with open("data/data.json", 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)
            with open("data/features.txt", 'w') as f:
                f.write(features)
            
            mlflow.log_artifacts("data", artifact_path="states")
            
            mlflow.log_artifact(wine_path)
            
            mlflow.log_params({"alpha":alpha, "l1_ratio":l1_ratio})
            mlflow.log_metrics({"rmse":rmse, "r2":r2, "mae":mae})
            
            mlflow.log_text("text test", "testtext.txt")
            
    if is_test:
        model_uri = "runs:/{}/model".format(run.info.run_id)
        mv = mlflow.register_model(model_uri, "ElasticNetRegressionModel")
        print("Name: {}".format(mv.name))
        print("Version: {}".format(mv.version))
# blob_sas_token = r""

# # # Allow SPARK to read from Blob remotely
# wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (blob_container_name, blob_account_name, blob_relative_path)
# spark.conf.set(
#   'fs.azure.sas.%s.%s.blob.core.windows.net' % (blob_container_name, blob_account_name),
#   blob_sas_token)
# print('Remote blob path: ' + wasbs_path)

# SPARK read parquet, note that it won't load any data yet by now
# df = spark.read.parquet(wasbs_path)
df = spark.read.parquet(dataset)
# df.show()

print(df.head())
mlflow.log_text(str(df.head()), "df.head")

# create a list of columns & dtypes the df must have
must_haves = {
    "vendorID": "string",
    "pickupDatetime": "datetime",
    "dropoffDatetime": "datetime",
    "passengerCount": "int",
    "tripDistance": "double",
    "startLon": "double",
    "startLat": "double",
    "rateCode": "int",
    "paymentType": "int",
    "endLon": "double",
    "endLat": "double",
    "fareAmount": "double",
Example #17
import mlflow

if __name__ == '__main__':

    with mlflow.start_run():

        # Log HTML text
        mlflow.log_text("<a href=https://mlflow.org>MLflow Website</a>",
                        "mlflow.html")
Example #18
def sorel20m_download(
    destination_dir
):  # path to the destination folder where the dataset elements will be saved
    """ Download SOREL20M dataset elements from the s3 bucket and save them in the specified destination directory.

    Args:
        destination_dir: Path to the destination folder where the dataset elements will be saved
    """

    # get absolute path if the one provided is relative
    if not os.path.isabs(destination_dir):
        destination_dir = os.path.abspath(destination_dir)

    # create destination dir if it does not already exist
    os.makedirs(destination_dir, exist_ok=True)

    # start mlflow run
    with mlflow.start_run():
        # get dataset base absolute path
        dataset_base_path = os.path.dirname(
            os.path.join(destination_dir, needed_objects['meta']))

        # log dataset base path
        mlflow.log_text(dataset_base_path,
                        artifact_file="dataset_base_path.txt")

        # check if all the needed files were already downloaded, if yes return
        if check_files(destination_dir=destination_dir):
            logger.info("Found already downloaded dataset..")
            return

        # set SOREL20M bucket name
        bucket_name = "sorel-20m"

        # instantiate bucket file downloader setting the destination dir and bucket name
        downloader = BucketFileDownloader(destination_dir, bucket_name)

        # select just the objects not already present from needed objects
        objects_to_download = {
            key: obj
            for key, obj in needed_objects.items()
            if not os.path.exists(os.path.join(destination_dir, obj))
        }

        # for all objects to download
        for i, (key, obj) in enumerate(objects_to_download.items()):
            if key == 'missing':
                # download shas_missing_ember_features.json
                r = requests.get(missing_url, allow_redirects=True)

                # compute filename
                filename = os.path.join(destination_dir, obj)

                # save downloaded file
                with open(filename, 'wb') as f:
                    f.write(r.content)

            else:
                # download object (and save it in destination_dir)
                downloader(obj)

            logger.info("{}/{} done.".format(i + 1, len(objects_to_download)))
def contrastive_learning_only(base_dir,  # base tool path
                              use_cache=1,  # whether to skip already executed runs (in cache) or not (1/0)
                              ignore_git=0):  # whether to ignore git version or not (1/0)
    """ Contrastive Learning Only (with no Transfer Learning) workflow.

    Args:
        base_dir: Base tool path
        use_cache: Whether to skip already executed runs (in cache) or not (1/0) (default: 1)
        ignore_git: Whether to ignore git version or not (1/0) (default: 0)
    """

    # get some needed variables from config file
    runs = int(config['general']['runs'])
    net_type = 'mtje'

    training_n_samples = int(config['sorel20mDataset']['training_n_samples'])
    validation_n_samples = int(config['sorel20mDataset']['validation_n_samples'])
    test_n_samples = int(config['sorel20mDataset']['test_n_samples'])

    c_l_epochs = int(config['contrastiveLearning']['epochs'])
    c_l_train_split_proportion = int(config['contrastiveLearning']['train_split_proportion'])
    c_l_valid_split_proportion = int(config['contrastiveLearning']['valid_split_proportion'])
    c_l_test_split_proportion = int(config['contrastiveLearning']['test_split_proportion'])
    c_l_batch_size = int(config['contrastiveLearning']['batch_size'])
    c_l_rank_size = int(config['contrastiveLearning']['rank_size'])
    c_l_knn_k_min = int(config['contrastiveLearning']['knn_k_min'])
    c_l_knn_k_max = int(config['contrastiveLearning']['knn_k_max'])

    # initialize Hash object
    ch = Hash()
    # update hash with the content of the config file (for the current net type)
    ch.update(json.dumps(dict(config.items(net_type))))
    # update hash with the content of the config file (for the freshDataset)
    ch.update(json.dumps(dict(config.items('freshDataset'))))
    # get config file sha256 digest
    fresh_eval_config_sha = ch.get_b64()

    # update hash with the content of the config file (for the contrastiveLearning section)
    ch.update(json.dumps(dict(config.items('contrastiveLearning'))))
    # get config file sha256 digest
    contr_learn_config_sha = ch.get_b64()

    # Note: The entrypoint names are defined in MLproject. The artifact directories
    # are documented by each step's .py file.

    # start mlflow run
    with mlflow.start_run() as active_run:
        # get code git commit version
        git_commit = active_run.data.tags.get(mlflow_tags.MLFLOW_GIT_COMMIT)

        # log config file
        mlflow.log_text(json.dumps({s: dict(config.items(s)) for s in config.sections()}), 'config.txt')

        # set dataset destination dir
        dataset_dir = os.path.join(base_dir, 'dataset')
        # set fresh dataset base path (directory containing .dat files)
        fresh_dataset_dir = os.path.join(dataset_dir, 'fresh_dataset')

        # if the fresh dataset is not present, generate it
        if not fresh_check_files(fresh_dataset_dir):
            logger.info("Fresh dataset not found.")

            # generate fresh dataset
            build_fresh_dataset_run = run("build_fresh_dataset", {
                'dataset_dest_dir': fresh_dataset_dir
            }, config_sha=fresh_eval_config_sha)

        c_l_results_files = {}

        # for each training run
        for training_run_id in range(runs):
            logger.info("initiating training run n. {}".format(str(training_run_id)))

            # create family classifier from previously trained network and train it on fresh dataset
            c_l_train_run = get_or_run("train_contrastive_net", {
                'fresh_ds_path': fresh_dataset_dir,
                'epochs': c_l_epochs,
                'training_run': training_run_id,
                'train_split_proportion': c_l_train_split_proportion,
                'valid_split_proportion': c_l_valid_split_proportion,
                'test_split_proportion': c_l_test_split_proportion,
                'batch_size': c_l_batch_size
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=contr_learn_config_sha)

            # get model checkpoints path
            c_l_checkpoint_path = parse.unquote(parse.urlparse(os.path.join(c_l_train_run.info.artifact_uri,
                                                                            "model_checkpoints")).path)

            # set model checkpoint filename
            c_l_checkpoint_file = os.path.join(c_l_checkpoint_path, "epoch_{}.pt".format(c_l_epochs))

            # evaluate model against fresh dataset
            c_l_eval_run = get_or_run("evaluate_contrastive_net", {
                'fresh_ds_path': fresh_dataset_dir,
                'checkpoint_path': c_l_checkpoint_file,
                'training_run': training_run_id,
                'train_split_proportion': c_l_train_split_proportion,
                'valid_split_proportion': c_l_valid_split_proportion,
                'test_split_proportion': c_l_test_split_proportion,
                'batch_size': c_l_batch_size,
                'rank_size': c_l_rank_size,
                'knn_k_min': c_l_knn_k_min,
                'knn_k_max': c_l_knn_k_max
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=contr_learn_config_sha)

            # get model evaluation results path
            c_l_results_path = parse.unquote(parse.urlparse(os.path.join(c_l_eval_run.info.artifact_uri,
                                                                         "contrastive_learning_results")).path)

            # set model evaluation results filename
            c_l_results_file = os.path.join(c_l_results_path, "results.csv")

            # compute (and plot) all contrastive model results
            c_l_compute_results_run = get_or_run("compute_contrastive_learning_results", {
                'results_file': c_l_results_file,
                'fresh_ds_path': fresh_dataset_dir,
                'knn_k_min': c_l_knn_k_min,
                'knn_k_max': c_l_knn_k_max
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=contr_learn_config_sha)

            # get model evaluation results path
            c_l_scores_dir_path = parse.unquote(parse.urlparse(os.path.join(c_l_compute_results_run.info.artifact_uri,
                                                                            "contrastive_learning_scores")).path)

            # add dir path to c_l_results_files dictionary (used for plotting mean score trends)
            c_l_results_files["run_id_" + str(training_run_id)] = c_l_scores_dir_path

        # create the contrastive learning temp dir name using the value of contr_learn_config_sha (the sha of some
        # parts of the config file). -> This is done in order to have a different (but predictable) run_to_filename
        # for each set of runs with different parameters. This lets mlflow know when it actually needs to run
        # 'per_tag_plot_runs'. If, on the other hand, a simple tempfile.TemporaryDirectory() were used, mlflow would
        # run 'per_tag_plot_runs' every time, even if a previous run was available (because the parameter
        # 'run_to_filename_json' would be different).
        c_l_tempdir = os.path.join(base_dir, 'tmp_{}'.format(contr_learn_config_sha))
        # create temp dir
        os.makedirs(c_l_tempdir, exist_ok=True)

        # set run-to-filename file path
        c_l_run_to_filename = os.path.join(c_l_tempdir, "c_l_results.json")

        # create and open the c_l_results.json file in write mode
        with open(c_l_run_to_filename, "w") as output_file:
            # save c_l_results_files dictionary as a json file
            json.dump(c_l_results_files, output_file)

        mlflow.log_artifact(c_l_run_to_filename, "run_to_filename")

        # if there is more than 1 run, compute also the model mean scores trends
        if runs > 1:
            # plot all model mean scores trends
            plot_all_scores_trends = get_or_run("plot_all_scores_trends", {
                'run_to_filename_json': c_l_run_to_filename,
                'knn_k_min': c_l_knn_k_min,
                'knn_k_max': c_l_knn_k_max
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=contr_learn_config_sha)

        # remove contrastive learning temp files and temporary directory
        os.remove(c_l_run_to_filename)
        # os.remove(fresh_run_to_filename)
        os.rmdir(c_l_tempdir)
def family_classifier_only(base_dir,  # base tool path
                           use_cache=1,  # whether to skip already executed runs (in cache) or not (1/0)
                           ignore_git=0):  # whether to ignore git version or not (1/0)
    """ Family Classifier Only (with no Trasnfer Learning) workflow.

    Args:
        base_dir: Base tool path
        use_cache: Whether to skip already executed runs (in cache) or not (1/0) (default: 1)
        ignore_git: Whether to ignore git version or not (1/0) (default: 0)
    """

    # get some needed variables from config file
    runs = int(config['general']['runs'])
    net_type = 'mtje'

    training_n_samples = int(config['sorel20mDataset']['training_n_samples'])
    validation_n_samples = int(config['sorel20mDataset']['validation_n_samples'])
    test_n_samples = int(config['sorel20mDataset']['test_n_samples'])

    f_c_epochs = int(config['familyClassifier']['epochs'])
    f_c_train_split_proportion = int(config['familyClassifier']['train_split_proportion'])
    f_c_valid_split_proportion = int(config['familyClassifier']['valid_split_proportion'])
    f_c_test_split_proportion = int(config['familyClassifier']['test_split_proportion'])
    f_c_batch_size = int(config['familyClassifier']['batch_size'])

    # initialize Hash object
    ch = Hash()
    # update hash with the content of the config file (for the current net type)
    ch.update(json.dumps(dict(config.items(net_type))))
    # update hash with the content of the config file (for the freshDataset)
    ch.update(json.dumps(dict(config.items('freshDataset'))))
    # get config file sha256 digest
    fresh_eval_config_sha = ch.get_b64()

    # update hash with the content of the config file (for the familyClassifier section)
    ch.update(json.dumps(dict(config.items('familyClassifier'))))
    # get config file sha256 digest
    family_class_config_sha = ch.get_b64()

    # instantiate key-n_samples dict
    n_samples_dict = {'train': training_n_samples,
                      'validation': validation_n_samples,
                      'test': test_n_samples}

    # Note: The entrypoint names are defined in MLproject. The artifact directories
    # are documented by each step's .py file.

    # start mlflow run
    with mlflow.start_run() as active_run:
        # get code git commit version
        git_commit = active_run.data.tags.get(mlflow_tags.MLFLOW_GIT_COMMIT)

        # log config file
        mlflow.log_text(json.dumps({s: dict(config.items(s)) for s in config.sections()}), 'config.txt')

        # set dataset destination dir
        dataset_dir = os.path.join(base_dir, 'dataset')
        # set fresh dataset base path (directory containing .dat files)
        fresh_dataset_dir = os.path.join(dataset_dir, 'fresh_dataset')

        # if the fresh dataset is not present, generate it
        if not fresh_check_files(fresh_dataset_dir):
            logger.info("Fresh dataset not found.")

            # generate fresh dataset
            build_fresh_dataset_run = run("build_fresh_dataset", {
                'dataset_dest_dir': fresh_dataset_dir
            }, config_sha=fresh_eval_config_sha)

        # for each training run
        for training_run_id in range(runs):
            logger.info("initiating training run n. {}".format(str(training_run_id)))

            # create family classifier from previously trained network and train it on fresh dataset
            f_c_train_run = get_or_run("train_family_classifier", {
                'fresh_ds_path': fresh_dataset_dir,
                'epochs': f_c_epochs,
                'training_run': training_run_id,
                'train_split_proportion': f_c_train_split_proportion,
                'valid_split_proportion': f_c_valid_split_proportion,
                'test_split_proportion': f_c_test_split_proportion,
                'batch_size': f_c_batch_size
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=family_class_config_sha)

            # get model checkpoints path
            f_c_checkpoint_path = parse.unquote(parse.urlparse(os.path.join(f_c_train_run.info.artifact_uri,
                                                                            "model_checkpoints")).path)

            # set model checkpoint filename
            f_c_checkpoint_file = os.path.join(f_c_checkpoint_path, "epoch_{}.pt".format(f_c_epochs))

            # evaluate model against fresh dataset
            f_c_eval_run = get_or_run("evaluate_family_classifier", {
                'fresh_ds_path': fresh_dataset_dir,
                'checkpoint_path': f_c_checkpoint_file,
                'training_run': training_run_id,
                'train_split_proportion': f_c_train_split_proportion,
                'valid_split_proportion': f_c_valid_split_proportion,
                'test_split_proportion': f_c_test_split_proportion,
                'batch_size': f_c_batch_size
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=family_class_config_sha)

            # get model evaluation results path
            f_c_results_path = parse.unquote(parse.urlparse(os.path.join(f_c_eval_run.info.artifact_uri,
                                                                         "family_class_results")).path)

            # set model evaluation results filename
            f_c_results_file = os.path.join(f_c_results_path, "results.csv")

            # compute (and plot) all tagging results
            f_c_compute_results_run = get_or_run("compute_all_family_class_results", {
                'results_file': f_c_results_file,
                'fresh_ds_path': fresh_dataset_dir
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=family_class_config_sha)
def workflow(base_dir,  # base tool path
             use_cache=1,  # whether to skip already executed runs (in cache) or not (1/0)
             ignore_git=0):  # whether to ignore git version or not (1/0)
    """ Automatic Malware Signature Generation MLflow workflow.

    Args:
        base_dir: Base tool path
        use_cache: Whether to skip already executed runs (in cache) or not (1/0) (default: 1)
        ignore_git: Whether to ignore git version or not (1/0) (default: 0)
    """

    # get some needed variables from config file
    runs = int(config['general']['runs'])
    workers = int(config['general']['workers'])

    batch_size = int(config['mtje']['batch_size'])
    epochs = int(config['mtje']['epochs'])
    use_malicious_labels = int(config['mtje']['use_malicious_labels'])
    use_count_labels = int(config['mtje']['use_count_labels'])
    gen_type = config['mtje']['gen_type']
    similarity_measure = config['mtje']['similarity_measure'].lower()
    net_type = 'mtje'

    training_n_samples = int(config['sorel20mDataset']['training_n_samples'])
    validation_n_samples = int(config['sorel20mDataset']['validation_n_samples'])
    test_n_samples = int(config['sorel20mDataset']['test_n_samples'])

    min_n_anchor_samples = int(config['freshDataset']['min_n_anchor_samples'])
    max_n_anchor_samples = int(config['freshDataset']['max_n_anchor_samples'])
    fresh_n_queries = int(config['freshDataset']['n_queries'])
    n_evaluations = int(config['freshDataset']['n_evaluations'])

    f_c_epochs = int(config['familyClassifier']['epochs'])
    f_c_train_split_proportion = int(config['familyClassifier']['train_split_proportion'])
    f_c_valid_split_proportion = int(config['familyClassifier']['valid_split_proportion'])
    f_c_test_split_proportion = int(config['familyClassifier']['test_split_proportion'])
    f_c_batch_size = int(config['familyClassifier']['batch_size'])

    c_l_epochs = int(config['contrastiveLearning']['epochs'])
    c_l_train_split_proportion = int(config['contrastiveLearning']['train_split_proportion'])
    c_l_valid_split_proportion = int(config['contrastiveLearning']['valid_split_proportion'])
    c_l_test_split_proportion = int(config['contrastiveLearning']['test_split_proportion'])
    c_l_batch_size = int(config['contrastiveLearning']['batch_size'])
    c_l_rank_size = int(config['contrastiveLearning']['rank_size'])
    c_l_knn_k_min = int(config['contrastiveLearning']['knn_k_min'])
    c_l_knn_k_max = int(config['contrastiveLearning']['knn_k_max'])

    # initialize Hash object
    ch = Hash()

    # update hash with the content of the config file (for the sorel20mDataset section)
    ch.update(json.dumps(dict(config.items('sorel20mDataset'))))
    # get config file sha256 digest
    dataset_config_sha = ch.get_b64()

    # update hash with the content of the config file (for the current net type)
    ch.update(json.dumps(dict(config.items(net_type))))
    # get config file sha256 digest
    config_sha = ch.get_b64()

    # update hash with the content of the config file (for the freshDataset)
    ch.update(json.dumps(dict(config.items('freshDataset'))))
    # get config file sha256 digest
    fresh_dataset_config_sha = ch.get_b64()

    # create a copy of the current Hash object
    ch_copy = ch.copy()

    # update hash with the content of the config file (for the familyClassifier section)
    ch.update(json.dumps(dict(config.items('familyClassifier'))))
    # get config file sha256 digest
    family_class_config_sha = ch.get_b64()

    # update hash with the content of the config file (for the contrastiveLearning section)
    ch_copy.update(json.dumps(dict(config.items('contrastiveLearning'))))
    # get config file sha256 digest
    contr_learn_config_sha = ch_copy.get_b64()

    # instantiate key-n_samples dict
    n_samples_dict = {'train': training_n_samples,
                      'validation': validation_n_samples,
                      'test': test_n_samples}

    # Note: The entrypoint names are defined in MLproject. The artifact directories
    # are documented by each step's .py file.

    # start mlflow run
    with mlflow.start_run() as active_run:
        # get code git commit version
        git_commit = active_run.data.tags.get(mlflow_tags.MLFLOW_GIT_COMMIT)

        # log config file
        mlflow.log_text(json.dumps({s: dict(config.items(s)) for s in config.sections()}), 'config.txt')

        # set dataset destination dir
        dataset_dir = os.path.join(base_dir, 'dataset')
        # set dataset base path (directory containing 'meta.db')
        dataset_base_path = os.path.join(dataset_dir, '09-DEC-2020', 'processed-data')
        # set pre-processed dataset base path (directory containing .dat files)
        pre_processed_dataset_dir = os.path.join(dataset_dir, '09-DEC-2020', 'pre-processed_dataset')
        # set fresh dataset base path (directory containing .dat files)
        fresh_dataset_dir = os.path.join(dataset_dir, 'fresh_dataset')

        # if pre-processed dataset files for this run parameters are not present, generate them
        if not preproc_check_files(destination_dir=pre_processed_dataset_dir,
                                   n_samples_dict=n_samples_dict):
            logger.info("Pre-processed dataset not found.")

            # if the original Sorel20M dataset is not present, download it
            if not download_check_files(dataset_dir):
                logger.info("Dataset not found.")

                # run dataset downloader
                download_dataset_run = run("download_dataset", {
                    'destination_dir': dataset_dir
                }, config_sha=dataset_config_sha)

            # pre-process dataset
            preprocess_dataset_run = run("preprocess_dataset", {
                'ds_path': dataset_base_path,
                'destination_dir': pre_processed_dataset_dir,
                'training_n_samples': training_n_samples,
                'validation_n_samples': validation_n_samples,
                'test_n_samples': test_n_samples,
                'batch_size': batch_size,
                'remove_missing_features': str(os.path.join(dataset_base_path, "shas_missing_ember_features.json"))
            }, config_sha=dataset_config_sha)

        # if the fresh dataset is not present, generate it
        if not fresh_check_files(fresh_dataset_dir):
            logger.info("Fresh dataset not found.")

            # generate fresh dataset
            build_fresh_dataset_run = run("build_fresh_dataset", {
                'dataset_dest_dir': fresh_dataset_dir
            }, config_sha=fresh_dataset_config_sha)

        # initialize results files dicts
        results_files = {}
        c_l_results_files = {}

        # instantiate common (between consecutive training runs) training parameters
        common_training_params = {
            'ds_path': pre_processed_dataset_dir,
            'net_type': net_type if similarity_measure == 'dot' else net_type + '_{}'.format(similarity_measure),
            'gen_type': gen_type,
            'batch_size': batch_size,
            'epochs': epochs,
            'training_n_samples': training_n_samples,
            'validation_n_samples': validation_n_samples,
            'use_malicious_labels': use_malicious_labels,
            'use_count_labels': use_count_labels,
            'workers': workers
        }

        # instantiate common (between consecutive training runs) evaluation parameters
        common_evaluation_params = {
            'ds_path': pre_processed_dataset_dir,
            'net_type': net_type if similarity_measure == 'dot' else net_type + '_{}'.format(similarity_measure),
            'gen_type': gen_type,
            'batch_size': batch_size,
            'test_n_samples': test_n_samples,
            'evaluate_malware': use_malicious_labels,
            'evaluate_count': use_count_labels
        }

        # for each training run
        for training_run_id in range(runs):
            logger.info("initiating training run n. {}".format(str(training_run_id)))

            # -- Model Training and Evaluation Steps -------------------------------------------------------------------
            # set training parameters (copy the common dict so run-specific keys do not leak into it)
            training_params = common_training_params.copy()
            training_params.update({'training_run': training_run_id})

            # train network (get or run) on Sorel20M dataset
            training_run = get_or_run("train_network",
                                      training_params,
                                      git_commit,
                                      ignore_git=bool(ignore_git),
                                      use_cache=bool(use_cache),
                                      resume=True,
                                      config_sha=config_sha)

            # get model checkpoints path
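            # (the artifact_uri may be a 'file://...' URI: urlparse().path drops the scheme and unquote() decodes
            # any percent-encoding, leaving a plain local filesystem path)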
            checkpoint_path = parse.unquote(parse.urlparse(os.path.join(training_run.info.artifact_uri,
                                                                        "model_checkpoints")).path)

            # set model checkpoint filename
            checkpoint_file = os.path.join(checkpoint_path, "epoch_{}.pt".format(epochs))

            # set evaluation parameters (again on a copy of the common dict)
            evaluation_params = common_evaluation_params.copy()
            evaluation_params.update({'checkpoint_file': checkpoint_file})

            # evaluate model against Sorel20M dataset
            evaluation_run = get_or_run("evaluate_network",
                                        evaluation_params,
                                        git_commit,
                                        ignore_git=bool(ignore_git),
                                        use_cache=bool(use_cache),
                                        config_sha=config_sha)

            # get model evaluation results path
            results_path = parse.unquote(parse.urlparse(os.path.join(evaluation_run.info.artifact_uri,
                                                                     "model_results")).path)

            # set model evaluation results filename
            results_file = os.path.join(results_path, "results.csv")

            # add file path to results_files dictionary (used for plotting mean results)
            results_files["run_id_" + str(training_run_id)] = results_file

            # compute (and plot) all tagging results
            all_tagging_results_run = get_or_run("compute_all_run_results", {
                'results_file': results_file,
                'use_malicious_labels': use_malicious_labels,
                'use_tag_labels': 1
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=config_sha)
            # ----------------------------------------------------------------------------------------------------------

            # -- Model Evaluation using Fresh Dataset Steps ------------------------------------------------------------
            # evaluate model against fresh dataset
            fresh_evaluation_run = get_or_run("evaluate_fresh", {
                'fresh_ds_path': fresh_dataset_dir,
                'checkpoint_path': checkpoint_file,
                'net_type': net_type if similarity_measure == 'dot' else net_type + '_{}'.format(similarity_measure),
                'min_n_anchor_samples': min_n_anchor_samples,
                'max_n_anchor_samples': max_n_anchor_samples,
                'n_query_samples': fresh_n_queries,
                'n_evaluations': n_evaluations
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=fresh_dataset_config_sha)

            # get model evaluation results path
            fresh_results_path = parse.unquote(parse.urlparse(os.path.join(fresh_evaluation_run.info.artifact_uri,
                                                                           "fresh_prediction_results")).path)

            # set model evaluation results filename
            fresh_results_file = os.path.join(fresh_results_path, "fresh_prediction_results.json")

            # compute (and plot) all family prediction results (on fresh dataset)
            all_fresh_results_run = get_or_run("compute_all_run_fresh_results", {
                'results_file': fresh_results_file
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=fresh_dataset_config_sha)
            # ----------------------------------------------------------------------------------------------------------

            # -- Family Classifier Steps -------------------------------------------------------------------------------
            # create family classifier from previously trained network and train it on fresh dataset
            f_c_train_run = get_or_run("train_family_classifier", {
                'fresh_ds_path': fresh_dataset_dir,
                'checkpoint_path': checkpoint_file,
                'epochs': f_c_epochs,
                'training_run': training_run_id,
                'train_split_proportion': f_c_train_split_proportion,
                'valid_split_proportion': f_c_valid_split_proportion,
                'test_split_proportion': f_c_test_split_proportion,
                'batch_size': f_c_batch_size
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=family_class_config_sha)

            # get model checkpoints path
            f_c_checkpoint_path = parse.unquote(parse.urlparse(os.path.join(f_c_train_run.info.artifact_uri,
                                                                            "model_checkpoints")).path)

            # set model checkpoint filename
            f_c_checkpoint_file = os.path.join(f_c_checkpoint_path, "epoch_{}.pt".format(f_c_epochs))

            # evaluate model against fresh dataset
            f_c_eval_run = get_or_run("evaluate_family_classifier", {
                'fresh_ds_path': fresh_dataset_dir,
                'checkpoint_path': f_c_checkpoint_file,
                'training_run': training_run_id,
                'train_split_proportion': f_c_train_split_proportion,
                'valid_split_proportion': f_c_valid_split_proportion,
                'test_split_proportion': f_c_test_split_proportion,
                'batch_size': f_c_batch_size
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=family_class_config_sha)

            # get model evaluation results path
            f_c_results_path = parse.unquote(parse.urlparse(os.path.join(f_c_eval_run.info.artifact_uri,
                                                                         "family_class_results")).path)

            # set model evaluation results filename
            f_c_results_file = os.path.join(f_c_results_path, "results.csv")

            # compute (and plot) all family classification results
            f_c_compute_results_run = get_or_run("compute_all_family_class_results", {
                'results_file': f_c_results_file,
                'fresh_ds_path': fresh_dataset_dir
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=family_class_config_sha)
            # ----------------------------------------------------------------------------------------------------------

            # -- Contrastive Learning Steps ----------------------------------------------------------------------------
            # create contrastive learning model from the previously trained network and train it on the fresh dataset
            c_l_train_run = get_or_run("train_contrastive_model", {
                'fresh_ds_path': fresh_dataset_dir,
                'checkpoint_path': checkpoint_file,
                'epochs': c_l_epochs,
                'training_run': training_run_id,
                'train_split_proportion': c_l_train_split_proportion,
                'valid_split_proportion': c_l_valid_split_proportion,
                'test_split_proportion': c_l_test_split_proportion,
                'batch_size': c_l_batch_size
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=contr_learn_config_sha)

            # get model checkpoints path
            c_l_checkpoint_path = parse.unquote(parse.urlparse(os.path.join(c_l_train_run.info.artifact_uri,
                                                                            "model_checkpoints")).path)

            # set model checkpoint filename
            c_l_checkpoint_file = os.path.join(c_l_checkpoint_path, "epoch_{}.pt".format(c_l_epochs))

            # evaluate model against fresh dataset
            c_l_eval_run = get_or_run("evaluate_contrastive_model", {
                'fresh_ds_path': fresh_dataset_dir,
                'checkpoint_path': c_l_checkpoint_file,
                'training_run': training_run_id,
                'train_split_proportion': c_l_train_split_proportion,
                'valid_split_proportion': c_l_valid_split_proportion,
                'test_split_proportion': c_l_test_split_proportion,
                'batch_size': c_l_batch_size,
                'rank_size': c_l_rank_size,
                'knn_k_min': c_l_knn_k_min,
                'knn_k_max': c_l_knn_k_max
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=contr_learn_config_sha)

            # get model evaluation results path
            c_l_results_path = parse.unquote(parse.urlparse(os.path.join(c_l_eval_run.info.artifact_uri,
                                                                         "contrastive_learning_results")).path)

            # set model evaluation results filename
            c_l_results_file = os.path.join(c_l_results_path, "results.csv")

            # compute (and plot) all contrastive learning results
            c_l_compute_results_run = get_or_run("compute_contrastive_learning_results", {
                'results_file': c_l_results_file,
                'fresh_ds_path': fresh_dataset_dir,
                'knn_k_min': c_l_knn_k_min,
                'knn_k_max': c_l_knn_k_max
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=contr_learn_config_sha)

            # get contrastive learning scores directory path (artifact of the compute-results run)
            c_l_scores_dir_path = parse.unquote(parse.urlparse(os.path.join(c_l_compute_results_run.info.artifact_uri,
                                                                            "contrastive_learning_scores")).path)

            # add dir path to c_l_results_files dictionary (used for plotting mean score trends)
            c_l_results_files["run_id_" + str(training_run_id)] = c_l_scores_dir_path
            # ----------------------------------------------------------------------------------------------------------

        # create temp dir name using the value of config_sha (sha of some parts of the config file).
        # -> This is done in order to have a different (but predictable) run_to_filename at each set of runs with
        # different parameters. This lets mlflow know when it actually needs to run 'per_tag_plot_runs': if a plain
        # tempfile.TemporaryDirectory() were used instead, mlflow would run 'per_tag_plot_runs' every time, even when
        # a previous run was available, because the 'run_to_filename_json' parameter would differ on every invocation
        # (see the cache-lookup sketch after this function).
        tempdir = os.path.join(base_dir, 'tmp_{}'.format(config_sha))
        # create temp dir
        os.makedirs(tempdir, exist_ok=True)

        # same reasoning as above for contrastive learning: derive the temp dir name from contr_learn_config_sha so
        # that the 'run_to_filename_json' parameter stays predictable and mlflow can re-use a cached
        # 'plot_all_contrastive_scores_trends' run across identical sets of runs.
        c_l_tempdir = os.path.join(base_dir, 'tmp_{}'.format(contr_learn_config_sha))
        # create temp dir
        os.makedirs(c_l_tempdir, exist_ok=True)

        # set run-to-filename file path
        run_to_filename = os.path.join(tempdir, "results.json")

        # create and open the results.json file in write mode
        with open(run_to_filename, "w") as output_file:
            # save results_files dictionary as a json file
            json.dump(results_files, output_file)

        mlflow.log_artifact(run_to_filename, "run_to_filename")

        # set run-to-filename file path
        c_l_run_to_filename = os.path.join(c_l_tempdir, "c_l_results.json")

        # create and open the c_l_results.json file in write mode
        with open(c_l_run_to_filename, "w") as output_file:
            # save c_l_results_files dictionary as a json file
            json.dump(c_l_results_files, output_file)

        mlflow.log_artifact(c_l_run_to_filename, "run_to_filename")

        # if there is more than one run, also compute per-tag mean results and mean score trends across runs
        if runs > 1:
            # plot all roc distributions
            per_tag_plot_runs = get_or_run("plot_all_roc_distributions", {
                'run_to_filename_json': run_to_filename,
                'use_malicious_labels': use_malicious_labels,
                'use_tag_labels': 1
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=config_sha)

            # plot all model mean scores trends
            plot_all_scores_trends = get_or_run("plot_all_contrastive_scores_trends", {
                'run_to_filename_json': c_l_run_to_filename,
                'knn_k_min': c_l_knn_k_min,
                'knn_k_max': c_l_knn_k_max
            }, git_commit, ignore_git=bool(ignore_git), use_cache=bool(use_cache), config_sha=contr_learn_config_sha)

        # remove temp files and temporary directory
        os.remove(run_to_filename)
        os.rmdir(tempdir)

        # remove contrastive learning temp files and temporary directory
        os.remove(c_l_run_to_filename)
        os.rmdir(c_l_tempdir)
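# Why the deterministic temp dir naming above matters: get_or_run (whose implementation is not shown here) is assumed
# to re-use a previously logged MLflow run only when every parameter value matches exactly, 'run_to_filename_json'
# included. A minimal, hypothetical sketch of such a cache lookup, using only the standard MLflow tracking API
# (the helper name and logic below are illustrative, not the project's actual code):
def _find_cached_run_sketch(entry_point, parameters, git_commit):
    """ Return a previously FINISHED run with matching entry point, parameters and git commit, or None. """
    import mlflow
    from mlflow.tracking import MlflowClient
    from mlflow.utils import mlflow_tags

    client = MlflowClient()
    experiment_id = mlflow.active_run().info.experiment_id

    for run in client.search_runs([experiment_id]):
        tags = run.data.tags
        # the entry point and (when required) the git commit must match
        if tags.get(mlflow_tags.MLFLOW_PROJECT_ENTRY_POINT) != entry_point:
            continue
        if git_commit is not None and tags.get(mlflow_tags.MLFLOW_GIT_COMMIT) != git_commit:
            continue
        # every requested parameter must match the logged one: a random tempfile path passed as
        # 'run_to_filename_json' would change on every invocation and therefore always miss the cache
        if run.info.status == "FINISHED" and \
                all(run.data.params.get(k) == str(v) for k, v in parameters.items()):
            return run
    return None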
def build_fresh_dataset(
        dataset_dest_dir):  # dir where to write the newly created dataset
    """ Build fresh dataset retrieving samples from Malware Bazaar given a list of  malware families stored in a
    configuration file.

    Args:
        dataset_dest_dir: Dir where to write the newly created dataset
    """

    # start mlflow run
    with mlflow.start_run():
        # log some params
        mlflow.log_param("amount_each", amount_each)
        mlflow.log_param("number_of_families", number_of_families)

        # create dataset_dest_dir if it does not already exist
        os.makedirs(dataset_dest_dir, exist_ok=True)
        raw_features_dest_file = os.path.join(dataset_dest_dir,
                                              'raw_features.json')
        sig_to_label_file = os.path.join(dataset_dest_dir, 'sig_to_label.json')

        # initialize the signature-to-label dictionary
        sig_to_label = {}

        # create temporary directory
        with tempfile.TemporaryDirectory() as tempdir:
            # set samples_dir and metadata_dir
            samples_dir = os.path.join(tempdir, "samples")
            metadata_dir = os.path.join(tempdir, "metadata")
            metadata_file_path = os.path.join(metadata_dir, 'metadata.json')

            # create directories
            os.makedirs(samples_dir, exist_ok=True)
            os.makedirs(metadata_dir, exist_ok=True)

            # write empty json object to file (in preparation for download)
            with open(metadata_file_path, "w") as json_:
                json.dump({}, json_)

            # get malware bazaar full data dump and retrieve from it all the sha256 hashes of samples available
            # for the families of interest
            available_samples_dict = api.get_full_data_dump(tempdir, families)

            i = 0
            # for each family in 'families'
            for fam in families:
                # if we already have 'amount_each' samples for 'number_of_families' families, stop
                if i >= number_of_families:
                    break

                logger.info("Considering now family '{}'. {}/{}".format(
                    fam, i + 1, number_of_families))

                # download 'amount_each' samples; if the download succeeds, update i, otherwise ignore the family
                # and move on
                if download_and_extract(
                        available_data=available_samples_dict,
                        family=fam,
                        label=i,
                        dest_dir=samples_dir,
                        metadata_file_path=metadata_file_path,
                        raw_features_dest_file=raw_features_dest_file,
                        amount=amount_each,
                        unzip=True):
                    sig_to_label[fam] = i
                    i += 1

            # if the number of successful family downloads is less than the required amount, exit
            if i < number_of_families:
                logger.error(
                    "It was not possible to get {} samples for {} different families.\n"
                    "Try adding more families in the config file.".format(
                        amount_each, number_of_families))
                sys.exit(1)

            # log used families
            mlflow.log_text(
                '\n'.join("{}: {}".format(sig, label)
                          for sig, label in sig_to_label.items()),
                "families.txt")

            # log metadata file
            mlflow.log_artifact(metadata_file_path, "metadata")

            # dump sig_to_label dictionary to file (see the loading sketch after this function)
            with open(sig_to_label_file, 'w') as outfile:
                json.dump(sig_to_label, outfile)

        # create list of files containing features (there is only one in this case)
        raw_features_paths = [raw_features_dest_file]
        # create features and labels vectors from raw features
        create_vectorized_features(dataset_dest_dir=dataset_dest_dir,
                                   raw_features_paths=raw_features_paths)
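# Illustrative helper (not part of the original pipeline): the 'sig_to_label.json' file written above maps each
# malware family signature to the integer label used when vectorizing the dataset, so downstream evaluation code
# can translate predicted label indices back into family names.
def load_label_to_sig(dataset_dest_dir):
    """ Return the inverse {label: family} mapping from a freshly built dataset directory. """
    import json
    import os

    with open(os.path.join(dataset_dest_dir, 'sig_to_label.json'), 'r') as f:
        sig_to_label = json.load(f)

    # invert the {family: label} mapping produced by build_fresh_dataset
    return {label: sig for sig, label in sig_to_label.items()}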
def download_and_extract(
    available_data,  # dict with the list of sha256 hashes of all available samples for each family
    family,  # family to retrieve metadata of
    label,  # numerical label to use for the current family
    dest_dir,  # destination directory where to save files
    metadata_file_path,  # file where to save samples metadata
    raw_features_dest_file,  # name of the file where to save the downloaded files' raw features
    amount=10,  # amount of samples to download and extract features from
    unzip=False):  # whether to unzip downloaded file or not
    """ Download 'amount' malware samples (and relative metadata) associated with the provided tag/signature
    from Malware Bazaar.

    Args:
        available_data: Dictionary containing the list of sha256 hashes of all available files for each family
        family: Family to retrieve metadata of
        label: Numerical label to use for the current family
        dest_dir: Destination directory where to save files
        metadata_file_path: File where to save samples metadata
        raw_features_dest_file: Name of the file where to save the downloaded files' raw features
        amount: Amount of samples to download and extract features from (default: 10)
        unzip: Whether to unzip downloaded file or not (default: False)
    Returns:
        True if it managed to download exactly 'amount' samples for the current family, False otherwise.
    """

    logger.info(
        "Retrieving samples metadata for family '{}'...".format(family))

    # get the list of sha256 hashes of all available files for the current selected family
    available_samples_shas = [
        sample['sha256_hash'] for sample in available_data[family]
    ]

    # if the number of available files is less than the required amount, log a warning and return False
    if len(available_samples_shas) < amount:
        logger.warning(
            "Found only {} PE malware samples. Ignoring family '{}'..".format(
                len(available_samples_shas), family))
        return False

    # initialize list that will contain the names of the downloaded files
    files_downloaded = []

    i = 0
    # open metadata file
    with open(metadata_file_path, 'r+') as metadata_file:
        # load existing data into a dict
        metadata = json.load(metadata_file)

        # lazily yield download arguments for the available shas; downloads stop once 'amount' samples have been
        # successfully processed (see the pattern sketch after this function)
        argument_iterator = ((sha, dest_dir, unzip)
                             for sha in available_samples_shas)

        # prepare progress bar
        with tqdm(total=amount) as pbar:
            pbar.set_description(
                "Downloading samples and extracting features for family '{}'".
                format(family))

            # instantiate a thread pool with a number of threads equal to twice 'cores'
            with ThreadPool(2 * cores) as pool:
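                # (sample downloads are I/O-bound, so a thread pool larger than the number of CPU cores is fine
                #  here despite the GIL; the pool size is essentially a download-throughput trade-off)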

                # launch parallel downloading processes
                for malware_info, downloaded_names in pool.imap_unordered(
                        retrieve_malware_sample, argument_iterator):
                    # if we downloaded 'amount' malware samples for this family, break
                    if i >= amount:
                        break

                    # if the downloaded malware sample name is None -> the file could not be found on Malware Bazaar,
                    # ignore it; otherwise try extracting the raw features from the file; if the extraction process is
                    # not successful -> ignore file
                    if downloaded_names is not None and extract_raw_features(
                            os.path.join(dest_dir, downloaded_names[0]),
                            raw_features_dest_file, label):
                        # set data to write to file
                        new_data = {malware_info['sha256_hash']: malware_info}
                        # join new_data with metadata
                        metadata.update(new_data)

                        # append malware sample name to global file name list
                        files_downloaded.append(downloaded_names[0])

                        # update i
                        i += 1

                        # update tqdm progress bar
                        pbar.update(1)

                # terminate thread pool and join
                pool.terminate()
                pool.join()

        # if the amount of malware samples downloaded for this family is less than required, return false
        if i < amount:
            return False

        # set metadata file current position at offset 0
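        # (entries are only ever added to 'metadata', so the rewritten JSON is never shorter than the original
        #  file content and no truncate() call is needed before dumping in place)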
        metadata_file.seek(0)
        # convert back to json and save to file
        json.dump(metadata, metadata_file)

    # we managed to download 'amount' samples for the current family: log the downloaded file names as text
    # and then return True
    mlflow.log_text(
        '\n'.join(files_downloaded),
        os.path.join("downloaded_samples",
                     "downloaded_{}_samples.txt".format(family)))

    return True
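# The download loop above follows a common pattern: a lazy argument generator feeds ThreadPool.imap_unordered,
# the consumer counts successful results and breaks out as soon as 'amount' of them have been collected, and the
# pool is then terminated so the remaining queued downloads are abandoned. Stand-alone sketch of that pattern with
# a dummy pool size of 4 (illustrative only; the original code sizes the pool as 2 * cores):
def _first_n_successes_sketch(candidates, n, worker):
    """ Run 'worker' over 'candidates' in parallel and return the first n truthy results. """
    from multiprocessing.pool import ThreadPool

    results = []
    # lazy generator: arguments are produced on demand, the full work list is never materialized
    argument_iterator = (candidate for candidate in candidates)

    with ThreadPool(4) as pool:
        for result in pool.imap_unordered(worker, argument_iterator):
            if result:
                results.append(result)
            # stop consuming as soon as n results have succeeded, even if more work is queued
            if len(results) >= n:
                break
        # terminate outstanding workers and wait for them to exit (mirrors the loop above)
        pool.terminate()
        pool.join()

    return results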