Example #1
def runExperiment(runname, mlepConfig, experiment_name, expstatuslog,
                  earlystop):

    # set up mlflow access
    # mlflow.set_tracking_uri -- not needed, defaults to mlruns
    # mlflow.create_experiment -- need experiment name. Should I programmatically create one? or go by timestamp
    if expstatuslog:
        sys.stdout = open(LOG_FILE, "w")
    else:
        sys.stdout = dumbwrite()

    mlflow.set_tracking_uri("mysql://*****:*****@127.0.0.1:3306/mlflow_runs")
    mlflow.start_run(run_name=runname)

    # Log relevant details
    for _key in mlepConfig["config"]:
        # skip drift_metrics -- it may not be loggable as a flat param
        if _key != "drift_metrics":
            mlflow.log_param(_key, mlepConfig["config"][_key])
    mlflow.log_param("experiment_name", experiment_name)

    internalTimer = 0
    streamData = StreamLocal.StreamLocal(
        data_source="data/2014_to_dec2018.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    augmentation = BatchedLocal.BatchedLocal(
        data_source='data/collectedIrrelevant.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    augmentation.load_by_class()

    trainingData = BatchedLocal.BatchedLocal(
        data_source='data/initialTrainingData.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()

    # Now we have the data
    MLEPLearner = MLEPServer.MLEPLearningServer(config_dict=mlepConfig,
                                                safe_mode=False)

    # Perform initial training
    MLEPLearner.initialTrain(traindata=trainingData)
    io_utils.std_flush("Completed training at", time_utils.readable_time())
    MLEPLearner.addAugmentation(augmentation)
    io_utils.std_flush("Added augmentation at", time_utils.readable_time())

    totalCounter = 0.0
    mistakes = []
    _earlystopcond = False

    while streamData.next() and not _earlystopcond:
        if internalTimer < streamData.getObject().getValue("timestamp"):
            internalTimer = streamData.getObject().getValue("timestamp")
            MLEPLearner.updateTime(internalTimer)

        classification = MLEPLearner.classify(streamData.getObject())

        totalCounter += 1.0
        if classification != streamData.getLabel():
            mistakes.append(1.0)
        else:
            mistakes.append(0.0)
        if totalCounter % 1000 == 0 and totalCounter > 0.0:
            io_utils.std_flush("Completed", int(totalCounter),
                               " samples, with running error (past 100) of",
                               sum(mistakes[-100:]) / 100.0)
        if earlystop and totalCounter == earlystop:
            _earlystopcond = True
        if totalCounter % 100 == 0 and totalCounter > 0.0:
            running_error = sum(mistakes[-100:]) / 100.0
            mlflow.log_metric("running_err" + str(int(totalCounter / 100)),
                              running_error)

    MLEPLearner.shutdown()

    io_utils.std_flush(
        "\n-----------------------------\nCOMPLETED\n-----------------------------\n"
    )

    mlflow.log_param("total_samples", totalCounter)
    if expstatuslog:
        mlflow.log_artifact(LOG_FILE)
    mlflow.log_param("run_complete", True)
    mlflow.end_run()

    if expstatuslog:
        sys.stdout.close()
    sys.stdout = sys.__stdout__
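Because runExperiment lets exceptions propagate to the caller (Example #14 calls mlflow.end_run() in its except block), an alternative is to guarantee cleanup inside the function itself with try/finally. A minimal sketch, assuming the same mlflow API; run_body is a hypothetical stand-in for the experiment loop:

import sys
import mlflow

def run_with_cleanup(run_name, run_body):
    mlflow.start_run(run_name=run_name)
    try:
        run_body()  # the streaming/classification loop would go here
    finally:
        mlflow.end_run()             # run is closed even if the body raises
        sys.stdout = sys.__stdout__  # stdout redirection is always undone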
Example #2
 def __exit__(self, exc_type, exc_value, traceback):
     mlflow.end_run()
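This __exit__ is normally paired with an __enter__ that opens the run. A minimal sketch of the full context manager; the class name and constructor argument are assumptions:

import mlflow

class MlflowRun:
    def __init__(self, run_name=None):
        self.run_name = run_name  # hypothetical constructor argument

    def __enter__(self):
        mlflow.start_run(run_name=self.run_name)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Called on both success and error, so the run is always closed.
        mlflow.end_run()

Used as `with MlflowRun("my-run"): ...`, the run ends even if the body raises.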
Example #3
def main(params: dict):
    import mlflow
    logger = get_logger()
    print("start params={}".format(params))
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle").head(30_000_000)
    df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)

    df = df[["user_id", "content_id", "content_type_id", "part", "user_answer", "answered_correctly"]]

    train_idx = []
    val_idx = []
    np.random.seed(0)
    for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"):
        if np.random.random() < 0.1:
            # all val
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.9)
            train_idx.extend(w_df[:train_num].index.tolist())
            val_idx.extend(w_df[train_num:].index.tolist())

    df["is_val"] = 0
    df["is_val"].loc[val_idx] = 1
    w_df = df[df["is_val"] == 0]
    w_df["group"] = (w_df.groupby("user_id")["user_id"].transform("count") - w_df.groupby("user_id").cumcount()) // params["max_seq"]
    w_df["user_id"] = w_df["user_id"].astype(str) + "_" + w_df["group"].astype(str)
    ff_for_transformer = FeatureFactoryForTransformer(column_config={("content_id", "content_type_id"): {"type": "category"},
                                                                     "user_answer": {"type": "category"},
                                                                     "part": {"type": "category"}},
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    group = ff_for_transformer.all_predict(w_df)

    n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")])
    print(group)
    dataset_train = SAKTDataset(group,
                                n_skill=n_skill,
                                max_seq=params["max_seq"])

    ff_for_transformer = FeatureFactoryForTransformer(column_config={("content_id", "content_type_id"): {"type": "category"},
                                                                     "user_answer": {"type": "category"},
                                                                     "part": {"type": "category"}},
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
    dataset_val = SAKTDataset(group,
                              is_test=True,
                              n_skill=n_skill,
                              max_seq=params["max_seq"])

    dataloader_train = DataLoader(dataset_train, batch_size=64, shuffle=True, num_workers=1)
    dataloader_val = DataLoader(dataset_val, batch_size=64, shuffle=False, num_workers=1)

    model = SAKTModel(n_skill, embed_dim=params["embed_dim"], max_seq=params["max_seq"])
    optimizer = torch.optim.Adam(model.parameters(), lr=params["lr"])
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val, optimizer, criterion, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".format(epoch, loss, auc, auc_val))

    preds = []
    labels = []
    model.eval()  # disable dropout for deterministic predictions
    with torch.no_grad():  # no gradients needed at inference time
        for d in tqdm(dataloader_val):
            x = d[0].to(device).long()
            target_id = d[1].to(device).long()
            part = d[2].to(device).long()
            label = d[3].to(device).long()

            output, atten_weight = model(x, target_id, part)

            preds.extend(torch.sigmoid(output[:, -1]).view(-1).data.cpu().numpy().tolist())
            labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    df_oof = pd.DataFrame()
    df_oof["row_id"] = df.loc[val_idx].index
    df_oof["predict"] = preds
    df_oof["target"] = df.loc[val_idx]["answered_correctly"].values

    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)

    df_oof2 = pd.read_csv("../output/ex_172/20201202080625/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_transformer = roc_auc_score(df_oof2["target"].values, df_oof2["predict"].values)
    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("single transformer: {:.4f}".format(auc_transformer))
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))

    if not is_debug:
        mlflow.start_run(experiment_id=10,
                         run_name=os.path.basename(__file__))

        mlflow.log_param("count_row", len(df))

        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.log_metric("auc_lgbm", auc_lgbm)
        mlflow.log_metric("auc_ensemble", max_auc)
        mlflow.log_metric("ensemble_nn_ratio", max_nn_ratio)
        mlflow.end_run()
    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)
    if is_make_feature_factory:
        ff_for_transformer = FeatureFactoryForTransformer(column_config={("content_id", "content_type_id"): {"type": "category"},
                                                                          "part": {"type": "category"}},
                                                          dict_path="../feature_engineering/",
                                                          sequence_length=params["max_seq"],
                                                          logger=logger)
        df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)
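The ratio sweep earlier in this example is a plain grid search over the blend weight between the LightGBM and transformer predictions. Factored out, it looks like the helper below; the function name is ours, the logic mirrors the loop above:

import numpy as np
from sklearn.metrics import roc_auc_score

def best_blend_ratio(target, pred_lgbm, pred_nn, step=0.05):
    # Returns (best_auc, best_nn_ratio) for pred_lgbm*(1-r) + pred_nn*r.
    best_auc, best_ratio = 0.0, 0.0
    for r in np.arange(0, 1 + step, step):
        auc = roc_auc_score(target, pred_lgbm * (1 - r) + pred_nn * r)
        if auc > best_auc:
            best_auc, best_ratio = auc, r
    return best_auc, best_ratio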
Example #4
def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    logger = get_logger()
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/split10/train_0.pickle"
    ).sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df[
        "prior_question_had_explanation"].fillna(-1)
    column_config = {
        ("content_id", "content_type_id"): {
            "type": "category"
        },
        "user_answer": {
            "type": "category"
        },
        "part": {
            "type": "category"
        },
        "prior_question_elapsed_time_bin300": {
            "type": "category"
        },
        "duration_previous_content_bin300": {
            "type": "category"
        },
        "prior_question_had_explanation": {
            "type": "category"
        },
        "rating_diff_content_user_id": {
            "type": "numeric"
        }
    }

    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent()
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"][
            "UserContentRateEncoder"] = UserContentRateEncoder(
                rate_func="elo", column="user_id")
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="train_0",
            load_feature=not is_debug,
            save_feature=not is_debug)

        print("all_predict")
        df = feature_factory_manager.all_predict(df)
        df = df[[
            "user_id", "content_id", "content_type_id", "part", "user_answer",
            "answered_correctly", "prior_question_elapsed_time_bin300",
            "duration_previous_content_bin300",
            "prior_question_had_explanation", "rating_diff_content_user_id"
        ]]
        print(df.head(10))

        print("data preprocess")

        train_idx = []
        val_idx = []
        np.random.seed(0)
        for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"):
            if np.random.random() < 0.01:
                # all val
                val_idx.extend(w_df.index.tolist())
            else:
                train_num = int(len(w_df) * 0.95)
                train_idx.extend(w_df[:train_num].index.tolist())
                val_idx.extend(w_df[train_num:].index.tolist())
    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    ff_for_transformer.make_dict(df=df)
    n_skill = len(ff_for_transformer.embbed_dict[("content_id",
                                                  "content_type_id")])
    if not load_pickle or is_debug:
        df["is_val"] = 0
        df["is_val"].loc[val_idx] = 1
        w_df = df[df["is_val"] == 0]
        w_df["group"] = (
            w_df.groupby("user_id")["user_id"].transform("count") -
            w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(
            str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)

        dataset_train = SAKTDataset(group,
                                    n_skill=n_skill,
                                    max_seq=params["max_seq"])

        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model075", exist_ok=True)
    if not is_debug and not load_pickle:
        with open(f"../input/feature_engineering/model075/train.pickle",
                  "wb") as f:
            pickle.dump(dataset_train, f)
        with open(f"../input/feature_engineering/model075/val.pickle",
                  "wb") as f:
            pickle.dump(dataset_val, f)

    if not is_debug and load_pickle:
        with open(f"../input/feature_engineering/model075/train.pickle",
                  "rb") as f:
            dataset_train = pickle.load(f)
        with open(f"../input/feature_engineering/model075/val.pickle",
                  "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")
    dataloader_train = DataLoader(dataset_train,
                                  batch_size=params["batch_size"],
                                  shuffle=True,
                                  num_workers=1)
    dataloader_val = DataLoader(dataset_val,
                                batch_size=params["batch_size"],
                                shuffle=False,
                                num_workers=1)

    model = SAKTModel(n_skill,
                      embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"],
                      dropout=dropout)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=params["lr"],
        weight_decay=0.01,
    )
    num_train_optimization_steps = int(len(dataloader_train) * epochs)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=params["num_warmup_steps"],
        num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train,
                                              dataloader_val, optimizer,
                                              criterion, scheduler, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".
              format(epoch, loss, auc, auc_val))

    preds = []
    labels = []
    model.eval()  # disable dropout for deterministic predictions
    with torch.no_grad():  # no gradients needed at inference time
        for item in tqdm(dataloader_val):
            x = item["x"].to(device).long()
            target_id = item["target_id"].to(device).long()
            part = item["part"].to(device).long()
            label = item["label"].to(device).float()
            elapsed_time = item["elapsed_time"].to(device).long()
            duration_previous_content = item["duration_previous_content"].to(
                device).long()
            prior_question_had_explanation = item["prior_q"].to(device).long()
            user_answer = item["user_answer"].to(device).long()
            rate_diff = item["rate_diff"].to(device).float()

            output = model(x, target_id, part, elapsed_time,
                           duration_previous_content,
                           prior_question_had_explanation, user_answer,
                           rate_diff)

            preds.extend(torch.sigmoid(
                output[:, -1]).view(-1).data.cpu().numpy().tolist())
            labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))
    df_oof = pd.DataFrame()
    # df_oof["row_id"] = df.loc[val_idx].index
    print(len(dataloader_val))
    print(len(preds))
    df_oof["predict"] = preds
    df_oof["target"] = labels

    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))

        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.end_run()
    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)
    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="all",
            load_feature=not is_debug,
            save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        df = pd.read_pickle(
            "../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(
                f"{output_dir}/feature_factory_manager_for_transformer.pickle",
                "wb") as f:
            pickle.dump(ff_for_transformer, f)
Example #5
def start_run():
    mlflow.start_run()
    yield
    mlflow.end_run()
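As written, calling this generator does nothing until it is iterated, so it is presumably used with a decorator that was stripped from the snippet, e.g. contextlib.contextmanager (a pytest fixture is equally plausible). A minimal sketch under that assumption, with try/finally so the run also closes on error:

import contextlib
import mlflow

@contextlib.contextmanager
def start_run():
    mlflow.start_run()
    try:
        yield             # hand control to the with-block
    finally:
        mlflow.end_run()  # always close the run

with start_run():
    mlflow.log_param("lr", 0.01)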
Example #6
 def after_pipeline_run(self) -> None:
     """Hook implementation to end the MLflow run
     after the Kedro pipeline finishes.
     """
     mlflow.end_run()
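This is a Kedro hook implementation; the matching before_pipeline_run that starts the run is not shown. A minimal sketch of how such a hook class usually looks (the class name is an assumption; hook_impl is Kedro's real decorator):

import mlflow
from kedro.framework.hooks import hook_impl

class MlflowHooks:
    @hook_impl
    def before_pipeline_run(self) -> None:
        # Open the run before the pipeline starts.
        mlflow.start_run()

    @hook_impl
    def after_pipeline_run(self) -> None:
        # End the MLflow run after the Kedro pipeline finishes.
        mlflow.end_run()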
Example #7
 def on_train_end(self, args, state, control, **kwargs):
     if self._initialized and state.is_world_process_zero:
         if self._log_artifacts:
             logger.info("Logging artifacts. This may take time.")
             mlflow.log_artifacts(args.output_dir)
         mlflow.end_run()
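This on_train_end matches the shape of the transformers Trainer callback API. A hedged sketch of a custom callback wired the same way; the class name and the on_train_begin body are assumptions, not the library's own MLflowCallback:

import mlflow
from transformers import TrainerCallback

class SimpleMlflowCallback(TrainerCallback):
    def __init__(self, log_artifacts=False):
        self._initialized = False
        self._log_artifacts = log_artifacts

    def on_train_begin(self, args, state, control, **kwargs):
        # Only the main process talks to the tracking server.
        if state.is_world_process_zero:
            mlflow.start_run()
            self._initialized = True

    def on_train_end(self, args, state, control, **kwargs):
        if self._initialized and state.is_world_process_zero:
            if self._log_artifacts:
                mlflow.log_artifacts(args.output_dir)
            mlflow.end_run()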
Example #8
    def train(self, config: ConfigurationNode = None):
        """
        Take a configuration node and train the model from it.
        :param config:
        :return:
        """
        if config is None:
            config = self.config
        # Create writable timestamp for easier record keeping
        timestamp = datetime.now().isoformat(sep="T", timespec="auto")
        name_timestamp = timestamp.replace(":", "_")

        # Start the mlflow run:
        mlflow.start_run(run_name=name_timestamp)

        # Check for a valid output path, then read the data paths from the config
        assert config.OUTPUT_PATH != ''
        path_output = config.OUTPUT_PATH  # output folder
        path_train = config.DATASET.TRAIN_DATA_PATH  # training data folder
        path_val = config.DATASET.VAL_DATA_PATH  # validation data folder

        # Make output dir and its parents if not exist.
        if not os.path.exists(path_output):
            os.makedirs(path_output)

        # Make result folders if they do not exist.
        self.results_dir = (Path(path_output) / name_timestamp)
        if not os.path.exists(self.results_dir):
            os.makedirs(self.results_dir)

        # Make backup folders if they do not exist.
        self.backup_dir = os.path.join(self.results_dir, 'model_backups')
        if not os.path.exists(self.backup_dir):
            os.makedirs(self.backup_dir)

        writer_tensorboard = SummaryWriter(log_dir=Path(self.results_dir /
                                                        "logs_tensorflow"))

        # Now that the config has been fully merged, dump a copy of it for traceability.
        config.dump(stream=open(
            os.path.join(self.results_dir, f'config{name_timestamp}.yaml'),
            'w'))

        # file path to store the state of the model.
        state_fpath = os.path.join(self.results_dir,
                                   f'model{name_timestamp}.pt')

        # Performance trace: per-epoch metrics are appended to perf_trace and pickled to perf_path.
        perf_path = os.path.join(self.results_dir, f'trace{name_timestamp}.p')
        perf_trace = []

        # Load data, create the data loader objects from them.
        data_train = pickle.load(open(path_train, 'rb'))
        data_val = pickle.load(open(path_val, 'rb'))
        self.loader_train = build_data_loader(data_train, config.DATASET, True)
        self.loader_val = build_data_loader(data_val, config.DATASET, False)

        # Build the model from the config node
        self.model = build_model(config.MODEL)

        # Enable parallel multi-GPU mode if the config specifies it.
        if config.MODEL.PARALLEL:
            print("Utilized parallel processing")
            self.model = torch.nn.DataParallel(self.model)

        current_epoch = 0

        # For resuming training (i.e. load checkpoint)
        if config.RESUME_PATH != "":
            checkpoint = torch.load(config.RESUME_PATH, map_location='cpu')
            current_epoch = checkpoint['epoch']
            self.model.load_state_dict(checkpoint["model_state"])
        _ = self.model.cuda()

        # SOLVER EVALUATOR
        cfg_solver = config.MODEL.SOLVER

        # Build the optimizer using the solver portion of the configuration.
        optimizer = build_optimizer(self.model, cfg_solver)

        # Build the evaluator using the solver portion of the configuration.
        evaluator = build_evaluator(cfg_solver)

        evaluator.float().cuda()
        total_epochs = cfg_solver.TOTAL_EPOCHS

        # Main training epoch loop starts here.
        for epoch in range(current_epoch, total_epochs):

            # Train a single epoch
            self.train_epoch(epoch, evaluator, optimizer, perf_path,
                             perf_trace, state_fpath, writer_tensorboard)

        mlflow.end_run()
Example #9
    def run_train_cv(self) -> None:
        """Run training and evaluation with cross-validation.

        Along with training and evaluation, this saves each fold's model and
        logs the scores.
        """
        # mlflow
        mlflow.set_experiment(self.exp_name)
        mlflow.start_run(run_name=self.run_name)
        logger.info(f'{self.run_name} - start training cv')

        scores = []
        va_idxes = []
        preds = []

        # Adversarial validation
        if self.advanced and 'adversarial_validation' in self.advanced:
            X_train = self.X_train
            X_test = self.X_test
            X_train['target'] = 0
            X_test['target'] = 1
            X_train = pd.concat([X_train, X_test],
                                sort=False).reset_index(drop=True)
            y_train = X_train['target']
            X_train.drop('target', axis=1, inplace=True)
            X_test.drop('target', axis=1, inplace=True)
            self.X_train = X_train
            self.y_train = y_train

        # Train on each fold
        for i_fold in range(self.cv.n_splits):
            # Train this fold
            logger.info(f'{self.run_name} fold {i_fold} - start training')
            model, va_idx, va_pred, score = self.train_fold(i_fold)
            logger.info(
                f'{self.run_name} fold {i_fold} - end training - score {score}'
            )

            # Save the model
            model.save_model()

            # Keep the results
            va_idxes.append(va_idx)
            scores.append(score)
            preds.append(va_pred)

        # Aggregate the results across folds
        va_idxes = np.concatenate(va_idxes)
        order = np.argsort(va_idxes)
        preds = np.concatenate(preds, axis=0)
        preds = preds[order]

        if self.evaluation_metric == 'log_loss':
            cv_score = log_loss(self.y_train, preds, eps=1e-15, normalize=True)
        elif self.evaluation_metric == 'mean_absolute_error':
            cv_score = mean_absolute_error(self.y_train, preds)
        elif self.evaluation_metric == 'rmse':
            cv_score = np.sqrt(mean_squared_error(self.y_train, preds))
        elif self.evaluation_metric == 'auc':
            cv_score = roc_auc_score(self.y_train, preds)
        elif self.evaluation_metric == 'prauc':
            cv_score = average_precision_score(self.y_train, preds)
        else:
            raise ValueError(f'unknown evaluation_metric: {self.evaluation_metric}')

        logger.info(f'{self.run_name} - end training cv - score {cv_score}')

        # Save the predictions
        Data.dump(preds, f'../output/pred/{self.run_name}-train.pkl')

        # mlflow
        self.run_id = mlflow.active_run().info.run_id
        log_param('model_name', str(self.model_cls).split('.')[-1][:-2])
        log_param('fe_name', self.fe_name)
        log_param('train_params', self.params)
        log_param('cv_strategy', str(self.cv))
        log_param('evaluation_metric', self.evaluation_metric)
        log_metric('cv_score', cv_score)
        log_param(
            'fold_scores',
            dict(
                zip([f'fold_{i}' for i in range(len(scores))],
                    [round(s, 4) for s in scores])))
        log_param('cols_definition', self.cols_definition)
        log_param('description', self.description)
        mlflow.end_run()
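The metric selection chain above can also be written as a lookup table, which makes the supported-metric set explicit. A small equivalent sketch, mirroring the calls used in this example (variable names stand in for the method's attributes):

import numpy as np
from sklearn.metrics import (average_precision_score, log_loss,
                             mean_absolute_error, mean_squared_error,
                             roc_auc_score)

METRICS = {
    'log_loss': lambda y, p: log_loss(y, p, eps=1e-15, normalize=True),
    'mean_absolute_error': mean_absolute_error,
    'rmse': lambda y, p: np.sqrt(mean_squared_error(y, p)),
    'auc': roc_auc_score,
    'prauc': average_precision_score,
}

cv_score = METRICS[evaluation_metric](y_train, preds)  # KeyError on an unknown metric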
Example #10
def main(args: DictConfig):

    # Non-strict access to fields
    OmegaConf.set_struct(args, False)
    args.exp.pop('rfi')

    # Adding default estimator params
    default_names, _, _, default_values, _, _, _ = \
        inspect.getfullargspec(instantiate(args.estimator, context_size=0).__class__.__init__)
    if default_values is not None:
        args.estimator['defaults'] = {
            n: str(v)
            for (n, v) in zip(
                default_names[len(default_names) -
                              len(default_values):], default_values)
        }
    logger.info(OmegaConf.to_yaml(args, resolve=True))

    # Data-generating DAG
    data_path = hydra.utils.to_absolute_path(
        f'{ROOT_PATH}/{args.data.relative_path}')
    exp_name = args.data.relative_path.split('/')[-1]
    adjacency_matrix = np.load(
        f'{data_path}/DAG{args.data.sample_ind}.npy').astype(int)
    if exp_name == 'sachs_2005':
        var_names = np.load(f'{data_path}/sachs-header.npy')
    else:
        var_names = [f'x{i}' for i in range(len(adjacency_matrix))]
    dag = DirectedAcyclicGraph(adjacency_matrix, var_names)

    # Experiment tracking
    exp_name = f'sage/{exp_name}'
    mlflow.set_tracking_uri(args.exp.mlflow_uri)
    mlflow.set_experiment(exp_name)

    # Checking if run exist
    if check_existing_hash(args, exp_name):
        logger.info('Skipping existing run.')
        return
    else:
        logger.info('No runs found - performing one.')

    # Loading Train-test data
    data = np.load(f'{data_path}/data{args.data.sample_ind}.npy')
    if args.data.standard_normalize:
        if 'normalise_params' in args.data:
            standard_normalizer = StandardScaler(**args.data.normalise_params)
        else:
            standard_normalizer = StandardScaler()
        data = standard_normalizer.fit_transform(data)
    data_train, data_test = train_test_split(data,
                                             test_size=args.data.test_ratio,
                                             random_state=args.data.split_seed)
    train_df = pd.DataFrame(data_train, columns=dag.var_names)
    test_df = pd.DataFrame(data_test, columns=dag.var_names)

    mlflow.start_run()
    mlflow.log_params(flatten_dict(args))
    mlflow.log_param('data_generator/dag/n', len(var_names))
    mlflow.log_param('data_generator/dag/m', int(adjacency_matrix.sum()))
    mlflow.log_param('data/n_train', len(train_df))
    mlflow.log_param('data/n_test', len(test_df))

    # Saving artifacts
    train_df.to_csv(
        hydra.utils.to_absolute_path(f'{mlflow.get_artifact_uri()}/train.csv'),
        index=False)
    test_df.to_csv(
        hydra.utils.to_absolute_path(f'{mlflow.get_artifact_uri()}/test.csv'),
        index=False)
    dag.plot_dag()
    plt.savefig(
        hydra.utils.to_absolute_path(f'{mlflow.get_artifact_uri()}/dag.png'))

    mlflow.log_param('features_sequence', str(list(dag.var_names)))

    for var_ind, target_var in enumerate(dag.var_names):

        var_results = {}

        # Considering all the variables for input
        input_vars = [var for var in dag.var_names if var != target_var]
        y_train, X_train = train_df.loc[:,
                                        target_var], train_df.loc[:,
                                                                  input_vars]
        y_test, X_test = test_df.loc[:, target_var], test_df.loc[:, input_vars]

        # Initialising risks
        risks = {}
        for risk in args.predictors.risks:
            risks[risk] = getattr(importlib.import_module('sklearn.metrics'),
                                  risk)

        # Fitting predictive model
        models = {}
        for pred_model in args.predictors.pred_models:
            logger.info(
                f'Fitting {pred_model._target_} for target = {target_var} and inputs {input_vars}'
            )
            model = instantiate(pred_model)
            model.fit(X_train.values, y_train.values)
            y_pred = model.predict(X_test.values)
            models[pred_model._target_] = model
            for risk, risk_func in risks.items():
                var_results[f'test_{risk}_{pred_model._target_}'] = risk_func(
                    y_test.values, y_pred)

        # =================== Global SAGE ===================
        logger.info(f'Analysing the importance of features: {input_vars}')

        sampler = instantiate(args.estimator.sampler,
                              X_train=X_train,
                              fit_method=args.estimator.fit_method,
                              fit_params=args.estimator.fit_params)

        log_lik = []
        sage_explainer = explainer.Explainer(None,
                                             input_vars,
                                             X_train,
                                             sampler=sampler,
                                             loss=None)
        # Generating the same orderings across all the models and losses
        np.random.seed(args.exp.sage.orderings_seed)
        fixed_orderings = [
            np.random.permutation(input_vars)
            for _ in range(args.exp.sage.nr_orderings)
        ]

        for model_name, model in models.items():
            for risk, risk_func in risks.items():
                sage_explainer.model = model.predict
                explanation, test_log_lik = sage_explainer.sage(
                    X_test,
                    y_test,
                    loss=risk_func,
                    fixed_orderings=fixed_orderings,
                    nr_runs=args.exp.sage.nr_runs,
                    return_test_log_lik=True,
                    nr_resample_marginalize=args.exp.sage.nr_resample_marginalize)
                log_lik.extend(test_log_lik)
                fi = explanation.fi_vals().mean()

                for fsoi, input_var in enumerate(input_vars):
                    var_results[
                        f'sage/mean_{risk}_{model_name}_{input_var}'] = fi[
                            input_var]

        var_results['sage/mean_log_lik'] = np.mean(log_lik)
        var_results['sage/num_fitted_estimators'] = len(log_lik)

        mlflow.log_metrics(var_results, step=var_ind)

    mlflow.end_run()
Example #11
                  tensorboard=tensorboard,
                  valid_graph_path=valid_graph_path,
                  valid_html_auto_open=valid_html_auto_open,
                  using_mlflow=using_mlflow,

                  # plot the validation dataset
                  decode_number=decode_number,
                  multiperclass=multiperclass,
                  nms_thresh=nms_thresh,
                  nms_topk=nms_topk,
                  iou_thresh=iou_thresh,
                  except_class_thresh=except_class_thresh,
                  plot_class_thresh=plot_class_thresh)

        if using_mlflow:
            ml.end_run()  # `ml` is presumably mlflow imported under an alias
    else:
        test.run(mean=image_mean,
                 std=image_std,
                 load_name=load_name, load_period=load_period, GPU_COUNT=GPU_COUNT,
                 test_weight_path=test_weight_path,
                 test_dataset_path=test_dataset_path, num_workers=num_workers,
                 test_save_path=test_save_path,
                 test_graph_path=test_graph_path,
                 test_html_auto_open=test_html_auto_open,
                 foreground_iou_thresh=foreground_iou_thresh,
                 background_iou_thresh=background_iou_thresh,
                 show_flag=show_flag,
                 save_flag=save_flag,
                 # plot the test dataset
                 decode_number=decode_number,
Example #12
def train_nn_cv(df: pd.DataFrame,
                model,
                params: dict,
                output_dir: str,
                model_id: int,
                exp_name: str,
                drop_user_id: bool,
                experiment_id: int=0,
                is_debug: bool=False):

    if not is_debug:
        mlflow.start_run(experiment_id=experiment_id, run_name=exp_name)

        mlflow.log_param("model_id", model_id)
        mlflow.log_param("count_row", len(df))
        mlflow.log_param("count_column", len(df.columns))

        for key, value in params.items():
            mlflow.log_param(key, value)

    if drop_user_id:
        features = [x for x in df.columns if x not in ["answered_correctly", "user_id"]]
    else:
        features = [x for x in df.columns if x not in ["answered_correctly"]]
    df_imp = pd.DataFrame()
    df_imp["feature"] = features

    train_idx = []
    val_idx = []
    np.random.seed(0)
    for _, w_df in df.groupby("user_id"):
        if np.random.random() < 0.01:
            # all val
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.95)
            train_idx.extend(w_df[:train_num].index.tolist())
            val_idx.extend(w_df[train_num:].index.tolist())

    if is_debug:
        epochs = 3
    else:
        epochs = 1000

    model.fit(df[features].iloc[train_idx].values, df["answered_correctly"].iloc[train_idx].values.reshape(-1, 1),
              batch_size=2**17,
              epochs=epochs,
              verbose=True,
              validation_data=(df[features].iloc[val_idx].values,
                               df["answered_correctly"].iloc[val_idx].values.reshape(-1, 1)),
              callbacks=[EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=-1, mode='auto'),
                         ReduceLROnPlateau(monitor='val_loss', factor=0.25, patience=3, verbose=-1,
                                           mode='auto', epsilon=0.0001, cooldown=0, min_lr=0),
                         ModelCheckpoint(filepath=f"{output_dir}/best_nn_{model_id}.weight", monitor='val_loss', verbose=-1,
                                         save_best_only=True, mode='auto')])
    model = load_model(f"{output_dir}/best_nn_{model_id}.weight")
    pd.DataFrame(features, columns=["feature"]).to_csv(f"{output_dir}/nn_use_feature.csv", index=False)

    y_train = model.predict(df.iloc[train_idx][features])
    y_oof = model.predict(df.iloc[val_idx][features])

    auc_train = roc_auc_score(df.iloc[train_idx]["answered_correctly"].values.flatten(), y_train.flatten())
    auc_val = roc_auc_score(df.iloc[val_idx]["answered_correctly"].values.flatten(), y_oof.flatten())
    print(f"auc_train: {auc_train}, auc_val: {auc_val}")
    if not is_debug:
        mlflow.log_metric("auc_train", auc_train)
        mlflow.log_metric("auc_val", auc_val)
        mlflow.end_run()

    df_oof = pd.DataFrame()
    df_oof["row_id"] = df.iloc[val_idx].index
    df_oof["predict"] = y_oof
    df_oof["target"] = df.iloc[val_idx]["answered_correctly"].values

    df_oof.to_csv(f"{output_dir}/oof_{model_id}_nn.csv", index=False)
Example #13
def train_model_mlflow(model: tf.keras.models.Model,
                       train_gen: DataGenerator,
                       validation_gen: DataGenerator,
                       epochs=10,
                       steps=4000,
                       mlflow_server='http://0.0.0.0:8643',
                       checkpoints_path="checkpoints/run3"):
    # Configure output_dir
    output_dir = tempfile.mkdtemp()

    if mlflow_server:
        # Tracking URI
        if not mlflow_server.startswith("http"):
            mlflow_tracking_uri = 'http://' + mlflow_server + ':5000'
        else:
            mlflow_tracking_uri = mlflow_server
        # Set the Tracking URI
        mlflow.set_tracking_uri(mlflow_tracking_uri)
        print("MLflow Tracking URI: %s" % mlflow_tracking_uri)
    else:
        print("MLflow Tracking URI: %s" % "local directory 'mlruns'")

    # mlflow.tensorflow.autolog()
    # mlflow.keras.autolog()
    mlflow.set_experiment("/face-age-emotion-gender-detector")

    with mlflow.start_run():
        model_dir = "models/" + str(mlflow.active_run().info.run_uuid)
        # mlflow.log_artifacts("checkpoints/")
        mlflow.log_param('Epochs', str(epochs))
        mlflow.log_param('Steps', str(steps))
        # model.summary() prints to stdout and returns None; capture the text via print_fn
        summary_lines = []
        model.summary(print_fn=summary_lines.append)
        mlflow.log_param('model', "\n".join(summary_lines))
        mlflow.keras.log_model(model, 'models')
        tf.saved_model.save(model, model_dir)
        mlflow.log_artifact('./' + model_dir + "/saved_model.pb")
        mlflow.log_artifacts('./' + model_dir, artifact_path='models')

        optimizer = tf.keras.optimizers.Adadelta()
        global_steps = 0
        validation_steps = 0
        for epoch in range(epochs):
            for step in range(steps):
                global_steps += 1
                image_data, target_y_e, target_y_g, target_y_a = next(
                    train_gen.get_data())
                with tf.GradientTape() as tape:
                    pred_y_e, pred_y_g, pred_y_a = model(image_data)
                    l_e, l_g, l_a = compute_loss(pred_y_e, pred_y_g, pred_y_a,
                                                 target_y_e, target_y_g,
                                                 target_y_a)
                    total_loss = l_e + l_g + l_a
                    gradients = tape.gradient(total_loss,
                                              model.trainable_variables)
                    optimizer.apply_gradients(
                        zip(gradients, model.trainable_variables))
                    print("=> epoch %d  step %d  train_loss: %.6f" %
                          (epoch + 1, step + 1, total_loss.numpy()))

                    mlflow.log_metric("train/total_loss",
                                      total_loss.numpy(),
                                      step=global_steps)
                    mlflow.log_metric("train/emotion_loss",
                                      l_e.numpy(),
                                      step=global_steps)
                    mlflow.log_metric("train/gender_loss",
                                      l_g.numpy(),
                                      step=global_steps)
                    mlflow.log_metric("train/age_loss", l_a, step=global_steps)

                # validation step
                if step % 500 == 0:
                    validation_steps += 1
                    image_data, target_y_e, target_y_g, target_y_a = next(
                        validation_gen.get_data())
                    pred_y_e, pred_y_g, pred_y_a = model(image_data)
                    l_e, l_g, l_a = compute_loss(pred_y_e, pred_y_g, pred_y_a,
                                                 target_y_e, target_y_g,
                                                 target_y_a)
                    total_valid_loss = l_e + l_g + l_a
                    mlflow.log_metric("valid_loss",
                                      total_valid_loss.numpy(),
                                      step=validation_steps)

            mk_dir("checkpoints/")
            p_loss = int(round(total_loss.numpy(), 2) * 100)
            print(f"EGA_epoch_{epoch}_score_{p_loss}")
            # model.save(f"EGA_epoch_{epoch}_score_{p_loss}")
            # model.save_weights(f"EGA_epoch_{epoch}_score_{p_loss}.h5")

            mlflow.keras.save_model(model,
                                    "checkpoints/" + str(int(time.time())))

        mlflow.log_artifacts("checkpoints/")
        mlflow.end_run()
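Note that mlflow.start_run() used as a context manager already ends the run when the with-block exits, so the explicit end_run() above is redundant (though harmless). The minimal form:

import mlflow

with mlflow.start_run():
    mlflow.log_param('Epochs', 10)
# the run is ended automatically here; no mlflow.end_run() needed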
Example #14
def main(runname, expstatslog, mlflowlog, earlystop):
    if not mlflowlog:
        # Swap the real mlflow module for a no-op stand-in
        global mlflow
        mlflow = dumbflow()
    if expstatslog:
        exp_status_write = open(EXP_STATUS, "a")
    else:
        exp_status_write = sys.stdout

    exp_status_write.write("\n\n\n\n")
    exp_status_write.write("--------------------------")
    exp_status_write.write("  BEGINNING NEW EXECUTION (" + runname + ") AT " +
                           str(time_utils.readable_time("%Y-%m-%d %H:%M:%S")))
    exp_status_write.write("  ------------------------" + "\n\n")
    # We are tracking drift adaptivity
    # namely labeled drift detection

    # Set up explicit drift detection params
    explicit_drift_param_grid = {
        "allow_explicit_drift": [(True, "ExpDr")],
        "explicit_drift_class": [("LabeledDriftDetector", "LDD")],
        "explicit_drift_mode": [("PageHinkley", "PageHinkley"),
                                ("ADWIN", "ADWIN"), ("EDDM", "EDDM"),
                                ("DDM", "DDM")],
        "explicit_update_mode": [("all", "A"), ("errors", "E")],
        "allow_unlabeled_drift": [(False, "")],
        "allow_update_schedule": [(False, "")],
        "weight_method": [("unweighted", "U"), ("performance", "P")],
        "select_method": [("recent", "RR"), ("recent-new", "RN"),
                          ("recent-updates", "RU")],
        "filter_method": [("no-filter", "F"), ("top-k", "T"),
                          ("nearest", "N")],
        "kval": [(5, "5"), (10, "10")]
    }
    explicit_drift_params = ParameterGrid(explicit_drift_param_grid)

    for param_set in explicit_drift_params:
        # This is an experiment
        if param_set["explicit_update_mode"][0] == "all":
            continue
        # Load up configuration file
        mlepConfig = io_utils.load_json('./MLEPServer.json')

        # Update config file and generate an experiment name
        experiment_name = ''
        for _param in param_set:
            if param_set[_param][1] != "":
                experiment_name += param_set[_param][1] + '-'
            mlepConfig["config"][_param] = param_set[_param][0]
        experiment_name = experiment_name[:-1]

        # Now we have the experimental config we can use for running an experiment;
        # generate an experiment name
        exp_status_write.write("--STATUS-- " + experiment_name + "   ")
        exp_status_write.flush()
        try:
            runExperiment(runname, mlepConfig, experiment_name, expstatslog,
                          earlystop)
            exp_status_write.write("SUCCESS\n")
        except Exception as e:
            exp_status_write.write("FAILED\n")
            exp_status_write.write(traceback.format_exc())
            exp_status_write.write(str(e))
            exp_status_write.write("\n")
            exp_status_write.flush()
            mlflow.end_run()
        exp_status_write.flush()

    exp_status_write.write("\n\n")
    exp_status_write.write("--------------------------")
    exp_status_write.write("  FINISHED EXECUTION OF (" + runname + ") AT " +
                           str(time_utils.readable_time("%Y-%m-%d %H:%M:%S")))
    exp_status_write.write("  ------------------------" + "\n\n")
    exp_status_write.close()
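dumbflow (and dumbwrite in Example #1) are not shown anywhere in these snippets; presumably they are no-op stand-ins so the same code runs with MLflow logging disabled. A hedged sketch of what such stubs typically look like; the names come from the source, the bodies are assumptions:

class dumbflow:
    # No-op stand-in for the mlflow module when logging is disabled (assumption).
    def __getattr__(self, name):
        # Any mlflow.* call becomes a function that accepts anything and does nothing.
        return lambda *args, **kwargs: None

class dumbwrite:
    # No-op stdout sink used when experiment logging is off (assumption).
    def write(self, text):
        return 0
    def flush(self):
        pass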
Example #15
def train_model(args, base_line=True):
    '''
    Train model function
    '''
    graph_label_loss = 'Baseline Model: Training and Validation Loss'
    graph_label_acc = 'Baseline Model: Training and Validation Accuracy'
    graph_image_loss_png = os.path.join(image_dir, 'baseline_loss.png')
    graph_image_acc_png = os.path.join(image_dir, 'baseline_accuracy.png')

    if not base_line:
        graph_label_loss = 'Experimental: Training and Validation Loss'
        graph_label_acc = 'Experimental Model: Training and Validation Accuracy'
        graph_image_loss_png = os.path.join(image_dir, 'experimental_loss.png')
        graph_image_acc_png = os.path.join(image_dir,
                                           'experimental_accuracy.png')

    image_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(
        validation_split=validation_split)

    train_generator = image_data_generator.flow_from_directory(
        TRAIN_DATA_DIR,
        target_size=(TRAIN_IMAGE_SIZE, TRAIN_IMAGE_SIZE),
        batch_size=TRAIN_BATCH_SIZE,
        class_mode='categorical',
        subset='training')

    validation_generator = image_data_generator.flow_from_directory(
        TRAIN_DATA_DIR,
        target_size=(TRAIN_IMAGE_SIZE, TRAIN_IMAGE_SIZE),
        batch_size=TRAIN_BATCH_SIZE,
        class_mode='categorical',
        subset='validation')

    # Create the model
    model = Sequential()

    model.add(
        Conv2D(args.filters,
               kernel_size=args.kernel_size,
               activation='relu',
               padding='same',
               input_shape=(img_width, img_height, img_num_channels)))
    model.add(Flatten())
    model.add(Dense(args.output, activation='softmax'))

    # Compile the model
    model.compile(loss=args.loss,
                  optimizer=args.optimizer,
                  metrics=['accuracy'])

    history = model.fit_generator(train_generator,
                                  epochs=args.epochs,
                                  validation_data=validation_generator)

    model.summary()

    print_metrics(history)
    figure_loss = plot_loss_graph(history, graph_label_loss)
    figure_loss.savefig(graph_image_loss_png)
    figure_acc = plot_accuracy_graph(history, graph_label_acc)
    figure_acc.savefig(graph_image_acc_png)
    # print('==================================================')
    # predictions = model.predict(TEST_DATA_DIR)
    # print(predictions)
    # print('==================================================')

    #mlflow.set_experiment(args.experiment_name)
    with mlflow.start_run():
        # print out current run_uuid
        run_uuid = mlflow.active_run().info.run_uuid
        print("MLflow Run ID: %s" % run_uuid)

        # mlflow.create_experiment("Training CNN Model", artifact_location=None)

        # log parameters
        mlflow.log_param("Filters", args.filters)
        mlflow.log_param("Kernel Size", args.kernel_size)
        mlflow.log_param("Output", args.output)
        mlflow.log_param("Epochs", args.epochs)
        mlflow.log_param("Loss", args.loss)
        mlflow.log_param("Optimize", args.optimizer)

        # calculate metrics
        binary_loss = get_binary_loss(history)
        binary_acc = get_binary_acc(history)
        validation_loss = get_validation_loss(history)
        validation_acc = get_validation_acc(history)

        # log metrics
        mlflow.log_metric("binary_loss", binary_loss)
        mlflow.log_metric("binary_acc", binary_acc)
        mlflow.log_metric("validation_loss", validation_loss)
        mlflow.log_metric("validation_acc", validation_acc)

        # log artifacts
        mlflow.log_artifacts(image_dir, "images")

        # log model
        mlflow.keras.log_model(model, "models")

        # save model locally
        pathdir = "../data/out/keras_models/" + run_uuid
        # keras_save_model(model, pathdir)

        # Write out TensorFlow events as a run artifact
        print("Uploading TensorFlow events as a run artifact.")
        mlflow.log_artifacts(output_dir, artifact_path="events")
        mlflow.end_run()
Example #16
 def reset_mlflow(self):
     mlflow.end_run()
Example #17
 def close(self, mlflow=False):
     for prefix, writer in self.writers.items():
         writer.flush()
         writer.close()
     if mlflow:
         module_mlflow.end_run()
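Here the mlflow parameter is a boolean flag that shadows the module name, which is presumably why the module is imported under an alias (module_mlflow). A minimal sketch of the pattern; the class name and the writers layout are assumptions:

import mlflow as module_mlflow  # aliased so a local `mlflow` flag cannot shadow it

class WriterGroup:
    def __init__(self, writers):
        self.writers = writers  # mapping of prefix -> SummaryWriter-like objects

    def close(self, mlflow=False):
        for prefix, writer in self.writers.items():
            writer.flush()
            writer.close()
        if mlflow:
            module_mlflow.end_run()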
Example #18
def main():
    mlflow.start_run(run_name=NAME)

    if "X_train.pkl" not in os.listdir():
        print("procesando los datos")
        X, y, encoder = preprocess_data("TOTAL_TRAIN.csv", process_cat=False)
        print(X.shape)

        with open(f"label_encoder_{NAME}.pkl", "wb") as f:
            pickle.dump(encoder, f)
        print(
            f"##################### The shape of X is {X.shape} #######################"
        )
        y = y.astype("int")
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.15,
                                                            random_state=15,
                                                            stratify=y)
        with open("X_train.pkl", "wb") as f:
            pickle.dump(X_train, f)
        with open("X_test.pkl", "wb") as f:
            pickle.dump(X_test, f)
        with open("y_train.pkl", "wb") as f:
            pickle.dump(y_train, f)
        with open("y_test.pkl", "wb") as f:
            pickle.dump(y_test, f)

        print(X_train.shape)

    else:
        with open("X_train.pkl", "rb") as f:
            X_train = pickle.load(f)
        with open("X_test.pkl", "rb") as f:
            X_test = pickle.load(f)
        with open("y_train.pkl", "rb") as f:
            y_train = pickle.load(f)
        with open("y_test.pkl", "rb") as f:
            y_test = pickle.load(f)
        with open(f"label_encoder_XGB1704.pkl", "rb") as f:
            encoder = pickle.load(f)
        print("######### ajustando cat encoder ############")

    cols_cat = ["ruido", "CODIGO_POSTAL", "ZONA_METROPOLITANA", "CALIDAD_AIRE"]
    cols_float = [col for col in X_train.columns if col not in cols_cat]
    X_train[cols_float] = X_train[cols_float].astype("float")
    X_test[cols_float] = X_test[cols_float].astype("float")

    labs_names = [c for c in encoder.classes_]

    model = LGBMClassifier(
        class_weight="balanced",
        objective="multiclass",  # LightGBM's name; "multiclass:softmax" is XGBoost syntax
        n_jobs=-1,
        random_state=100,
        silent=True,
    )

    if MODE != "INDIVIDUAL":
        params = {
            "reg_alpha": (1e-3, 5.0, "log-uniform"),
            "reg_lambda": (1e-2, 50.0, "log-uniform"),
            "n_estimators": (600, 4500),
            "learning_rate": (5e-3, 1.0, "log-uniform"),
            "num_leaves": (20, 80),
            "boosting_type": ["gbdt", "goss"],
            "colsample_bytree": (0.1, 1.0, "uniform"),
            "subsample": (0.1, 1.0, "uniform"),
            "min_child_samples": (1, 25),
            "min_child_weight": (1e-6, 0.1, "log-uniform"),
        }

        print(params)

        cb = CatBoostEncoder(cols=cols_cat)
        X_train = cb.fit_transform(X_train, y_train)
        X_test = cb.transform(X_test)
        fit_params = {
            ### fit params ###
            "eval_set": [(X_test, y_test)],
            "eval_metric": lgb_f1_score,
            "early_stopping_rounds": 300,
        }

        pipeline = Pipeline(steps=[("clas_encoder",
                                    CatBoostEncoder(
                                        cols=cols_cat)), ("model", model)])

        best_model = BayesSearchCV(
            model,
            params,
            n_iter=N_ITER,
            n_points=1,
            cv=cv,
            scoring=f2_scorer,
            random_state=100,
            optimizer_kwargs={"n_initial_points": 10},
            fit_params=fit_params,
        )

    def on_step(optim_result):
        score = best_model.best_score_
        results = best_model.cv_results_
        try:
            results_df = pd.DataFrame(results)
            results_df.to_csv(f"results_{NAME}.csv", header=True, index=False)
            print(
                f"############ {results_df.shape[0]} trials so far #################"
            )
            print(f"CV results so far: {results_df}")
        except:
            print("Unable to convert cv results to pandas dataframe")
        mlflow.log_metric("best_score", score)
        with open(f"./best_{NAME}_params.pkl", "wb") as f:
            pickle.dump(best_model.best_params_, f)

        print("best score: %s" % score)
        if score >= 0.98:
            print("Interrupting!")
            return True

    print("ajustando modelo")
    if MODE != "INDIVIDUAL":
        print(X_train.dtypes)
        best_model.fit(X_train, y_train, callback=[on_step])
        with open(f"./best_{NAME}_model.pkl", "wb") as f:
            pickle.dump(best_model, f)
        preds = best_model.predict(X_test)
    else:
        if NAME not in os.listdir():
            os.mkdir(NAME)

        cat_encoder = CatBoostEncoder(cols=cols_cat)
        X_train = cat_encoder.fit_transform(X_train, y_train)
        X_test = cat_encoder.transform(X_test)
        best_model = BalancedBaggingClassifier(
            base_estimator=HistGradientBoostingClassifier(
                max_iter=3000,
                random_state=42,
                learning_rate=0.1,
                max_leaf_nodes=54,
                min_samples_leaf=2,
                scoring=f2_scorer,
                validation_fraction=0.1,
                n_iter_no_change=50,
            ),
            n_estimators=5,
            random_state=42,
            n_jobs=-1,
            max_features=0.7,
            sampling_strategy={5: int(dict(Counter(y_train))[5] * 0.11)},
        )
        best_model.fit(X_train, y_train)
        preds = best_model.predict(X_test)
        print(
            f'F1 SCORE IS {f1_score(y_test, preds, average="macro")}, precision is {precision_score(y_test, preds, average="macro")}, recall is {recall_score(y_test, preds, average="macro")}, accuracy is {accuracy_score(y_test, preds)}'
        )
        print(
            f"F2 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=2)}"
        )
        print(
            f"F05 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=0.5)}"
        )
        cm = confusion_matrix(y_test, preds)
        grafico_conf_matrix = print_confusion_matrix(cm,
                                                     class_names=labs_names)
        grafico_conf_matrix.savefig(f"{NAME}/norm_NO_PIPELINE")

        with open(f"best_model_{NAME}.pkl", "wb") as f:
            pickle.dump(best_model, f)

    print("loggeando movidas")
    mlflow.log_metrics(
        metrics={
            "f1": f1_score(y_test, preds, average="macro"),
            "precision": precision_score(y_test, preds, average="macro"),
            "recall": recall_score(y_test, preds, average="macro"),
            "accuracy": accuracy_score(y_test, preds),
            "f05": fbeta_score(y_test, preds, beta=0.5, average="macro"),
            "f2": fbeta_score(y_test, preds, beta=2, average="macro"),
        })
    if MODE != "INDIVIDUAL":
        best_params = best_model.best_params_
        for param in best_params.keys():
            mlflow.log_param(param, best_params[param])
    cm = confusion_matrix(y_test, preds)
    grafico_conf_matrix = print_confusion_matrix(cm, class_names=labs_names)
    grafico_conf_matrix.savefig(NAME)
    grafico_norm = print_confusion_matrix(cm,
                                          class_names=labs_names,
                                          normalize=False)
    grafico_norm.savefig(f"{NAME}_no_norm")
    mlflow.end_run()
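
The on_step callback above relies on skopt's stopping contract: BayesSearchCV invokes every callback after each optimization step, and a callback that returns True interrupts the search. A minimal, self-contained sketch of that pattern (the dataset, search space, and 0.98 threshold here are illustrative assumptions, not taken from the code above):

import mlflow
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from skopt import BayesSearchCV

X, y = load_iris(return_X_y=True)
search = BayesSearchCV(
    LogisticRegression(max_iter=500),
    {"C": (1e-3, 1e3, "log-uniform")},
    n_iter=20,
    cv=3,
)

def stop_when_good_enough(optim_result):
    # Log the incumbent CV score on every step; returning True stops the search.
    mlflow.log_metric("best_score", search.best_score_)
    return search.best_score_ >= 0.98

with mlflow.start_run():
    search.fit(X, y, callback=[stop_when_good_enough])
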
Exemple #19
0
def main_worker(gpu, ngpus_per_node, config):
    config['gpu'] = gpu

    # suppress printing if not master process
    if config['multiprocessing_distributed'] and config['gpu'] != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if config['gpu'] is not None:
        print("Use GPU: {} for training".format(config['gpu']))

    if config['distributed']:
        if config['dist_url'] == "env://" and config['rank'] == -1:
            config['rank'] = int(os.environ["RANK"])
        if config['multiprocessing_distributed']:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            config['rank'] = config['rank'] * ngpus_per_node + gpu

        dist.init_process_group(backend=config['dist_backend'],
                                init_method=config['dist_url'],
                                world_size=config['world_size'],
                                rank=config['rank'])

    print("=> creating model '{}'".format(config['arch']))

    #hardcoding the resnet50 for the time being
    model = builder.MoCo(resnet50, config['moco_dim'], config['moco_k'],
                         config['moco_m'], config['moco_t'], config['mlp'])

    if config['distributed']:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if config['gpu'] is not None:
            torch.cuda.set_device(config['gpu'])
            model.cuda(config['gpu'])
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            config['batch_size'] = int(config['batch_size'] / ngpus_per_node)
            config['workers'] = int(
                (config['workers'] + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[config['gpu']])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif config['gpu'] is not None:
        torch.cuda.set_device(config['gpu'])
        model = model.cuda(config['gpu'])
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    criterion = nn.CrossEntropyLoss().cuda(config['gpu'])

    optimizer = torch.optim.SGD(model.parameters(),
                                config['lr'],
                                momentum=config['momentum'],
                                weight_decay=config['weight_decay'])

    #set the start_epoch, overwritten if resuming
    config['start_epoch'] = 0

    # optionally resume from a checkpoint
    if config['resume']:
        if os.path.isfile(config['resume']):
            print("=> loading checkpoint '{}'".format(config['resume']))
            if config['gpu'] is None:
                checkpoint = torch.load(config['resume'])
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(config['gpu'])
                checkpoint = torch.load(config['resume'], map_location=loc)
            config['start_epoch'] = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                config['resume'], checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(config['resume']))

    cudnn.benchmark = True

    #get the mean and standard deviation pixels from config
    #and wrap them in lists for tf.Normalize to work
    norms = config['norms']
    mean_pixel = norms['mean']
    std_pixel = norms['std']
    normalize = tf.Normalize(mean=[mean_pixel], std=[std_pixel])

    #for now, these augmentations are hardcoded. torchvision
    #isn't as easy to work with as albumentations
    augmentation = tf.Compose([
        tf.Grayscale(3),
        tf.RandomApply([tf.RandomRotation(180)], p=0.5),
        tf.RandomResizedCrop(224, scale=(0.2, 1.)),
        tf.ColorJitter(0.4, 0.4, 0.4, 0.1),
        tf.RandomApply([GaussianBlur([.1, 2.])], p=0.5),
        tf.Grayscale(1),
        tf.RandomHorizontalFlip(),
        tf.RandomVerticalFlip(),
        tf.ToTensor(),
        GaussNoise(p=0.5), normalize
    ])

    train_dataset = EMData(config['data_file'], augmentation)

    if config['distributed']:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=config['batch_size'],
                                               shuffle=(train_sampler is None),
                                               num_workers=config['workers'],
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True)

    #log parameters, if needed:
    if config['logging'] and (config['multiprocessing_distributed']
                              and config['rank'] % ngpus_per_node == 0):

        #end any old runs
        mlflow.end_run()
        mlflow.set_experiment(config['experiment_name'])
        mlflow.log_artifact(config['config_file'])

        #we don't want to add everything in the config
        #to mlflow parameters, we'll just add the most
        #likely to change parameters
        mlflow.log_param('data_file', config['data_file'])
        mlflow.log_param('architecture', config['arch'])
        mlflow.log_param('epochs', config['epochs'])
        mlflow.log_param('batch_size', config['batch_size'])
        mlflow.log_param('learning_rate', config['lr'])
        mlflow.log_param('moco_dim', config['moco_dim'])
        mlflow.log_param('moco_k', config['moco_k'])
        mlflow.log_param('moco_m', config['moco_m'])
        mlflow.log_param('moco_t', config['moco_t'])

    for epoch in range(config['start_epoch'], config['epochs']):
        if config['distributed']:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, config)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, config)

        #only save checkpoints from the main process
        if not config['multiprocessing_distributed'] or (
                config['multiprocessing_distributed']
                and config['rank'] % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': config['arch'],
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'norms': [mean_pixel, std_pixel],
                },
                is_best=False,
                filename=os.path.join(config['model_dir'], 'current.pth.tar'))

            #save checkpoint every save_freq epochs
            if (epoch + 1) % config['save_freq'] == 0:
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': config['arch'],
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'norms': [mean_pixel, std_pixel],
                    },
                    is_best=False,
                    filename=os.path.join(
                        config['model_dir'],
                        'checkpoint_{:04d}.pth.tar'.format(epoch + 1)))
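
Exemple #19 wraps every MLflow call in a rank check so that, under multiprocessing distributed training, only the main process writes to the tracking server, and it calls mlflow.end_run() first to clear any run left over from a previous launch. A stripped-down sketch of that guard, assuming torch.distributed has already been initialized (the config keys are illustrative):

import mlflow
import torch.distributed as dist

def log_params_if_main_process(config):
    # Only rank 0 should talk to the tracking server.
    rank = dist.get_rank() if dist.is_initialized() else 0
    if rank != 0:
        return
    mlflow.end_run()  # end any stale run before starting to log
    mlflow.set_experiment(config["experiment_name"])
    mlflow.log_param("batch_size", config["batch_size"])
    mlflow.log_param("learning_rate", config["lr"])
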
Exemple #20
0
    def close(self):
        import mlflow

        mlflow.end_run()
Exemple #21
0
 def __del__(self):
     # if the previous run is not terminated correctly, the fluent API will
     # not let you start a new run before the previous one is killed
     if mlflow.active_run() is not None:
         mlflow.end_run(status="KILLED")
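
The guard above matters because MLflow's fluent API keeps a single active run per process and refuses to start a new one while it exists; note that mlflow.active_run must be called, since the function object itself is always truthy. A minimal sketch of the same dangling-run cleanup in isolation:

import mlflow

def ensure_no_active_run():
    # Mark any dangling run as KILLED so a fresh run can be started.
    if mlflow.active_run() is not None:
        mlflow.end_run(status="KILLED")

ensure_no_active_run()
mlflow.start_run()  # now guaranteed not to fail because of a stale run
mlflow.end_run()
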
Exemple #22
0
def train(
        main_options: MainOptions,
        train_options: TrainOptions
) -> None:
    assert train_options.dim == 2 or train_options.dim == 3, \
        "Only dim == 2 or dim == 3 is supported at the moment " \
        "for data loading and observation / transition. " \
        "See torchvision.datasets.ImageFolder"

    output_dir = train_options.output_dir

    model_dir = "models"
    if not exists(join(output_dir, model_dir)):
        mkdir(join(output_dir, model_dir))
    if exists(join(output_dir, model_dir)) \
            and not isdir(join(output_dir, model_dir)):
        raise Exception(f"\"{join(output_dir, model_dir)}\""
                        f"is not a directory.")

    exp_name = "MARLClassification"
    mlflow.set_experiment(exp_name)

    mlflow.start_run(run_name=f"train_{main_options.run_id}")

    mlflow.log_param("output_dir", output_dir)
    mlflow.log_param("model_dir", join(output_dir, model_dir))

    img_pipeline = tr.Compose([
        tr.ToTensor(),
        custom_tr.NormalNorm()
    ])

    if train_options.ft_extr_str.startswith("resisc"):
        dataset_constructor = RESISC45Dataset
    elif train_options.ft_extr_str.startswith("mnist"):
        dataset_constructor = MNISTDataset
    else:
        dataset_constructor = KneeMRIDataset

    nn_models = ModelsWrapper(
        train_options.ft_extr_str,
        train_options.window_size,
        train_options.hidden_size_belief,
        train_options.hidden_size_action,
        train_options.hidden_size_msg,
        train_options.hidden_size_state,
        train_options.dim,
        train_options.action,
        train_options.nb_class,
        train_options.hidden_size_linear_belief,
        train_options.hidden_size_linear_action
    )

    dataset = dataset_constructor(img_pipeline)

    marl_m = MultiAgent(
        main_options.nb_agent,
        nn_models,
        train_options.hidden_size_belief,
        train_options.hidden_size_action,
        train_options.window_size,
        train_options.hidden_size_msg,
        train_options.action,
        obs_generic,
        trans_generic
    )

    mlflow.log_params({
        "ft_extractor": train_options.ft_extr_str,
        "window_size": train_options.window_size,
        "hidden_size_belief": train_options.hidden_size_belief,
        "hidden_size_action": train_options.hidden_size_action,
        "hidden_size_msg": train_options.hidden_size_msg,
        "hidden_size_state": train_options.hidden_size_state,
        "dim": train_options.dim,
        "action": train_options.action,
        "nb_class": train_options.nb_class,
        "hidden_size_linear_belief":
            train_options.hidden_size_linear_belief,
        "hidden_size_linear_action":
            train_options.hidden_size_linear_action,
        "nb_agent": main_options.nb_agent,
        "frozen_modules": train_options.frozen_modules,
        "epsilon": train_options.epsilon,
        "epsilon_decay": train_options.epsilon_decay,
        "nb_epoch": train_options.nb_epoch,
        "learning_rate": train_options.learning_rate,
        "img_size": train_options.img_size,
        "retry_number": train_options.retry_number,
        "step": main_options.step,
        "batch_size": train_options.batch_size
    })

    with open(join(output_dir, "class_to_idx.json"), "w") as json_f:
        json.dump(dataset.class_to_idx, json_f)
    mlflow.log_artifact(join(output_dir, "class_to_idx.json"))

    cuda = main_options.cuda
    device_str = "cpu"

    # Pass pytorch stuff to GPU
    # for agents hidden tensors (belief etc.)
    if cuda:
        nn_models.cuda()
        marl_m.cuda()
        device_str = "cuda"

    mlflow.log_param("device", device_str)

    module_to_train = ModelsWrapper.module_list \
        .difference(train_options.frozen_modules)

    # for RL agent models parameters
    optim = th.optim.Adam(
        nn_models.get_params(list(module_to_train)),
        lr=train_options.learning_rate
    )

    idx = th.randperm(len(dataset))
    idx_train = idx[:int(0.85 * idx.size(0))]
    idx_test = idx[int(0.85 * idx.size(0)):]

    train_dataset = Subset(dataset, idx_train)
    test_dataset = Subset(dataset, idx_test)

    train_dataloader = DataLoader(
        train_dataset, batch_size=train_options.batch_size,
        shuffle=True, num_workers=3, drop_last=False
    )

    test_dataloader = DataLoader(
        test_dataset, batch_size=train_options.batch_size,
        shuffle=True, num_workers=3, drop_last=False
    )

    epsilon = train_options.epsilon

    curr_step = 0

    for e in range(train_options.nb_epoch):
        nn_models.train()

        sum_loss = 0.
        i = 0

        conf_meter = ConfusionMeter(train_options.nb_class)

        tqdm_bar = tqdm(train_dataloader)
        for x_train, y_train in tqdm_bar:
            x_train, y_train = x_train.to(th.device(device_str)), \
                               y_train.to(th.device(device_str))

            # pred = [Nr, Ns, Nb, Nc]
            # prob = [Nr, Ns, Nb]
            retry_pred, retry_prob = episode_retry(
                marl_m, x_train, epsilon,
                main_options.step,
                train_options.retry_number,
                train_options.nb_class, device_str
            )

            # Class one hot encoding
            y_eye = th.eye(
                train_options.nb_class,
                device=th.device(device_str)
            )[y_train.unsqueeze(0)].unsqueeze(1).repeat(
                1, main_options.step, 1, 1)

            # Update confusion meter
            # mean between trials
            conf_meter.add(
                retry_pred.detach()[:, -1, :, :].mean(dim=0),
                y_train
            )

            # L2 Loss - Classification error / reward
            # reward = -error(y_true, y_step_pred).mean(class_dim)
            r = -th.pow(y_eye - retry_pred, 2.).mean(dim=-1)

            # Compute loss
            losses = retry_prob * r.detach() + r

            # Losses mean on images batch and trials
            # maximize(E[reward]) -> minimize(-E[reward])
            loss = -losses.mean()

            # Reset gradient
            optim.zero_grad()

            # Backward on compute graph
            loss.backward()

            # Update weights
            optim.step()

            # Update epoch loss sum
            sum_loss += loss.item()

            # Compute global score
            precs, recs = prec_rec(conf_meter)

            if curr_step % 100 == 0:
                mlflow.log_metrics(
                    {"loss": loss.item(),
                     "train_prec": precs.mean().item(),
                     "train_rec": recs.mean().item(),
                     "epsilon": epsilon},
                    step=curr_step
                )

            tqdm_bar.set_description(
                f"Epoch {e} - Train, "
                f"loss = {sum_loss / (i + 1):.4f}, "
                f"eps = {epsilon:.4f}, "
                f"train_prec = {precs.mean():.3f}, "
                f"train_rec = {recs.mean():.3f}"
            )

            epsilon *= train_options.epsilon_decay
            epsilon = max(epsilon, 0.)

            i += 1
            curr_step += 1

        sum_loss /= len(train_dataloader)

        save_conf_matrix(conf_meter, e, output_dir, "train")

        mlflow.log_artifact(
            join(output_dir, f"confusion_matrix_epoch_{e}_train.png")
        )

        nn_models.eval()
        conf_meter.reset()

        with th.no_grad():
            tqdm_bar = tqdm(test_dataloader)
            for x_test, y_test in tqdm_bar:
                x_test, y_test = x_test.to(th.device(device_str)), \
                                 y_test.to(th.device(device_str))

                preds, _ = episode(marl_m, x_test, 0., main_options.step)

                conf_meter.add(preds.detach(), y_test)

                # Compute score
                precs, recs = prec_rec(conf_meter)

                tqdm_bar.set_description(
                    f"Epoch {e} - Eval, "
                    f"eval_prec = {precs.mean():.4f}, "
                    f"eval_rec = {recs.mean():.4f}"
                )

        # Compute score
        precs, recs = prec_rec(conf_meter)

        save_conf_matrix(conf_meter, e, output_dir, "eval")

        mlflow.log_metrics(
            {"eval_prec": precs.mean().item(),
             "eval_rec": recs.mean().item()},
            step=curr_step
        )

        nn_models.json_args(
            join(output_dir,
                 model_dir,
                 f"marl_epoch_{e}.json")
        )
        th.save(
            nn_models.state_dict(),
            join(output_dir, model_dir,
                 f"nn_models_epoch_{e}.pt")
        )

        mlflow.log_artifact(
            join(output_dir,
                 model_dir,
                 f"marl_epoch_{e}.json")
        )
        mlflow.log_artifact(
            join(output_dir, model_dir,
                 f"nn_models_epoch_{e}.pt")
        )
        mlflow.log_artifact(
            join(output_dir,
                 f"confusion_matrix_epoch_{e}_eval.png")
        )

    empty_pipe = tr.Compose([
        tr.ToTensor()
    ])

    dataset_tmp = dataset_constructor(empty_pipe)

    test_dataloader_ori = Subset(dataset_tmp, idx_test)
    test_dataloader = Subset(dataset, idx_test)

    test_idx = randint(0, len(test_dataloader_ori) - 1)

    visualize_steps(
        marl_m, test_dataloader[test_idx][0],
        test_dataloader_ori[test_idx][0],
        main_options.step, train_options.window_size,
        output_dir, train_options.nb_class, device_str,
        dataset.class_to_idx
    )

    mlflow.end_run()
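
The train function above batches all hyperparameters into a single mlflow.log_params call and attaches the class-index mapping as a JSON artifact. The same two-call pattern in isolation (paths and values here are illustrative):

import json
import mlflow

with mlflow.start_run():
    # One call for many parameters instead of one log_param per key.
    mlflow.log_params({"nb_agent": 3, "nb_epoch": 10, "learning_rate": 1e-3})
    with open("class_to_idx.json", "w") as f:
        json.dump({"cat": 0, "dog": 1}, f)
    mlflow.log_artifact("class_to_idx.json")
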
Exemple #23
0
def main(model_config_module):
    model_config = importlib.import_module(model_config_module)

    logger.info(f"Loading data from {RAW_DATA_IN_PATH}")
    raw_dataframe = get_data(RAW_DATA_IN_PATH)

    logger.info(f"Splitting into {config.TRAIN_TEST_SPLIT_RATIO} train and {1-config.TRAIN_TEST_SPLIT_RATIO} test")
    raw_train, raw_test = train_test_split(raw_dataframe, config.TEST_SPLIT_DAYS)

    logger.info(f"Loading metadata from {META_DATA_IN_PATH}")
    meta_dataframe = get_data(META_DATA_IN_PATH)

    logger.info(f"Processing train dataset")
    processed_train_dataset = preprocess_train_data(raw_train, meta_dataframe)

    initialize_model = model_config.initialize_model
    grid = model_config.GRID

    #set experiment name
    logger.info(f"Starting MLFlow runs in experiment {config.EXPERIMENT_NAME}")
    mlflow.set_experiment(config.EXPERIMENT_NAME)

    logger.info(f"Train model with grid length of {len(grid)}")

    with mlflow.start_run(run_name=f"{model_config.RUN_NAME} grid search parent."):
        for params in grid:
            with mlflow.start_run(run_name=f'{model_config.RUN_NAME}: parameters: {params}', nested=True):
                logger.info(f"Train model with parameters: {params}.")
                mlflow.log_param("Parameters", params)

                init_model = initialize_model(params)
                model = train_model(init_model, processed_train_dataset)

                logger.info(f"Adding predictions")
                test_dataframe = add_prediction(test_dataset=raw_test,
                                                base_dataset=processed_train_dataset,
                                                meta_dataframe=meta_dataframe,
                                                model=model,
                                                predict_col_name=config.PREDICT)

                # Metrics
                logger.info(f"Logging metrics to MLFlow")
                metric = evaluate(test_dataframe) #Remember to change name from metric to test metric

                # MlFlow logs
                mlflow.log_metric("Root mean squared error", metric['root_mean_squared_error'])
                mlflow.log_metric("Mean squared error", metric['mean_squared_error'])
                mlflow.log_metric("Mean absolute error", metric['mean_absolute_error'])
                mlflow.log_metric("Mean absolute percentage error", metric['mean_absolute_percentage_error'])
                mlflow.log_metric("Absolute biggest deviation", metric['absolute_biggest_deviation'])

                # Plot
                logger.info(f"Logging timeserie graph to MLFlow")
                timeserie_plot(test_dataframe, config.DATE_COLUMN, PLOT_ACTUAL_VS_PREDICT_PLOT)

                # Log artifacts (output files)
                mlflow.log_artifact(str(PLOT_ACTUAL_VS_PREDICT_PLOT))

                logger.info(f"Saving model to {MODEL_PATH}")
                save_as_pickle(model, MODEL_PATH)

                logger.info(f"Saving test_dataframe to {TEST_DATAFRAME_PATH}")
                save_as_pickle(test_dataframe, TEST_DATAFRAME_PATH)

                mlflow.end_run()
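
The grid search above uses one parent run that encloses a child run per parameter set; passing nested=True makes MLflow attach the child to the currently active run instead of raising. A minimal sketch of the same layout (the grid and metric are placeholders):

import mlflow

with mlflow.start_run(run_name="grid search parent"):
    for c in (0.1, 1.0, 10.0):  # illustrative grid
        with mlflow.start_run(run_name=f"C={c}", nested=True):
            mlflow.log_param("C", c)
            mlflow.log_metric("rmse", 1.0 / c)  # placeholder metric

The explicit mlflow.end_run() at the end of the nested block above is redundant but harmless: the context manager already ends the child run on exit.
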
Exemple #24
0
 def end_run(cls):
     mlflow.end_run()
Exemple #25
0
def manual_run(request):
    if request.param:
        mlflow.start_run()
    yield
    mlflow.end_run()
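
manual_run above is a pytest fixture: everything before the yield is setup, everything after is teardown, so any test that requests it finishes with no active MLflow run. A hedged sketch of how such a fixture is declared and consumed (the parametrization is an assumption inferred from the request.param access):

import mlflow
import pytest

@pytest.fixture(params=[True, False])
def manual_run(request):
    if request.param:  # optionally enter the test with a run already open
        mlflow.start_run()
    yield
    mlflow.end_run()  # teardown: always leave a clean state

def test_no_dangling_run(manual_run):
    mlflow.end_run()  # safe whether or not the fixture opened a run
    assert mlflow.active_run() is None
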
Exemple #26
0
def main(params: dict,
         output_dir: str):
    import mlflow
    print("start params={}".format(params))
    model_id = "all"
    logger = get_logger()
    df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df["prior_question_had_explanation"].fillna(-1)
    df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
    column_config = {
        ("content_id", "content_type_id"): {"type": "category"},
        "user_answer": {"type": "leakage_feature"},
        "answered_correctly": {"type": "leakage_feature"},
        "part": {"type": "category"},
        "prior_question_elapsed_time_bin300": {"type": "category"},
        "duration_previous_content_bin300": {"type": "category"},
        "prior_question_had_explanation": {"type": "category"},
        "rating_diff_content_user_id": {"type": "numeric"},
        "task_container_id_bin300": {"type": "category"},
        "previous_answer_index_content_id": {"type": "category"},
        "previous_answer_content_id": {"type": "category"},
        "timediff-elapsedtime_bin500": {"type": "category"}
    }


    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent(is_partial_fit=True)
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"]["UserContentRateEncoder"] = UserContentRateEncoder(rate_func="elo",
                                                                                           column="user_id")
        feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id",
                                                                             column="content_id",
                                                                             is_debug=is_debug,
                                                                             model_id=model_id,
                                                                             n=300)
        feature_factory_dict["user_id"]["StudyTermEncoder2"] = StudyTermEncoder2(is_partial_fit=True)
        feature_factory_dict["user_id"][f"MeanAggregatorStudyTimebyUserId"] = MeanAggregator(column="user_id",
                                                                                             agg_column="study_time",
                                                                                             remove_now=False)

        feature_factory_dict["user_id"]["ElapsedTimeMeanByContentIdEncoder"] = ElapsedTimeMeanByContentIdEncoder()
        feature_factory_dict["post"] = {
            "DurationFeaturePostProcess": DurationFeaturePostProcess()
        }

        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id=model_id,
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)
        print("all_predict")
        df = feature_factory_manager.all_predict(df)
        def f(x):
            x = x // 1000
            if x < -100:
                return -100
            if x > 400:
                return 400
            return x
        df["task_container_id_bin300"] = [x if x < 300 else 300 for x in df["task_container_id"]]
        df["timediff-elapsedtime_bin500"] = [f(x) for x in df["timediff-elapsedtime"].values]
        df = df[["user_id", "content_id", "content_type_id", "part", "user_answer", "answered_correctly",
                 "prior_question_elapsed_time_bin300", "duration_previous_content_bin300",
                 "prior_question_had_explanation", "rating_diff_content_user_id", "task_container_id_bin300",
                 "previous_answer_index_content_id", "previous_answer_content_id", "row_id",
                 "timediff-elapsedtime_bin500"]]
        print(df.head(10))

        print("data preprocess")



    ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    ff_for_transformer.make_dict(df=df)
    n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")])

    if not load_pickle or is_debug:
        df_val_row = pd.read_feather("../input/riiid-test-answer-prediction/train_transformer_last2500k_only_row_id.feather")
        if is_debug:
            df_val_row = df_val_row.head(3000)
        df_val_row["is_val"] = 1

        df = pd.merge(df, df_val_row, how="left", on="row_id")
        df["is_val"] = df["is_val"].fillna(0)

        print(df["is_val"].value_counts())

        w_df = df[df["is_val"] == 0]
        w_df["group"] = (w_df.groupby("user_id")["user_id"].transform("count") - w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)

        dataset_train = SAKTDataset(group,
                                    n_skill=n_skill,
                                    max_seq=params["max_seq"])

        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model232", exist_ok=True)
    if not is_debug and not load_pickle:
        with open(f"../input/feature_engineering/model232/train.pickle", "wb") as f:
            pickle.dump(dataset_train, f)
        with open(f"../input/feature_engineering/model232/val.pickle", "wb") as f:
            pickle.dump(dataset_val, f)

    if not is_debug and load_pickle:
        with open(f"../input/feature_engineering/model232/train.pickle", "rb") as f:
            dataset_train = pickle.load(f)
        with open(f"../input/feature_engineering/model232/val.pickle", "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")
    dataloader_train = DataLoader(dataset_train, batch_size=params["batch_size"], shuffle=True, num_workers=1)
    dataloader_val = DataLoader(dataset_val, batch_size=params["batch_size"], shuffle=False, num_workers=1)

    model = SAKTModel(n_skill, embed_dim=params["embed_dim"], max_seq=params["max_seq"], dropout=dropout,
                      cont_emb=params["cont_emb"])

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=params["lr"],
                      weight_decay=0.01,
                      )
    num_train_optimization_steps = int(len(dataloader_train) * 20)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=params["num_warmup_steps"],
                                                num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val, optimizer, criterion, scheduler,
                                              epoch, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".format(epoch, loss, auc, auc_val))
        torch.save(model.state_dict(), f"{output_dir}/transformers_epoch{epoch}_auc{round(auc_val, 4)}.pth")

    preds = []
    labels = []
    with torch.no_grad():
        for item in tqdm(dataloader_val):
            label = item["label"].to(device).float()
            output = model(item, device)

            preds.extend(torch.nn.Sigmoid()(output[:, -1]).view(-1).data.cpu().numpy().tolist())
            labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))
    df_oof = pd.DataFrame()
    # df_oof["row_id"] = df.loc[val_idx].index
    print(len(dataloader_val))
    print(len(preds))
    df_oof["predict"] = preds
    df_oof["target"] = labels

    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    if not is_debug:
        mlflow.start_run(experiment_id=10,
                         run_name=os.path.basename(__file__))

        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.end_run()
    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    torch.cuda.empty_cache()
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)
    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent(is_partial_fit=True)
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id="all",
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                          dict_path="../feature_engineering/",
                                                          sequence_length=params["max_seq"],
                                                          logger=logger)
        df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)
Exemple #27
0
 def after_fit(self):
     mlflow.end_run()
Exemple #28
0
def prexisting_run_id(tracking_uri):
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.start_run()
    yield mlflow.active_run().info.run_id
    mlflow.end_run()
Exemple #29
0
#
# Code snippet for https://mlflow.org/docs/latest/python_api/mlflow.html#end_run
#
import warnings
import mlflow

if __name__ == "__main__":

    warnings.filterwarnings("ignore")
    print(mlflow.__version__)

    # Start run and get status
    mlflow.start_run()
    run = mlflow.active_run()
    print("run_id: {}; status: {}".format(run.info.run_id, run.info.status))

    # End the run and get status
    mlflow.end_run()
    run = mlflow.get_run(run.info.run_id)
    print("run_id: {}; status: {}".format(run.info.run_id, run.info.status))
    print("--")

    # Check for any active runs
    print("Active runs: {}".format(mlflow.active_run()))
Exemple #30
0
def main():
    """
    Получение тематик из текста и сохранение модели
    """
    # Выгрузка топ комменариев
    comments = gc.get_all_comments(**config['comments'])

    comments_clean = pt.get_clean_text(comments,
                                       stopwords.words(config['stopwords']))
    tfidf = TfidfVectorizer(**config['tf_model']).fit(comments_clean)

    # Vectorized comment matrix and the model
    X_matrix = pt.vectorize_text(comments_clean, tfidf)

    # Split the text into topics via clustering, choosing the best number of clusters
    cluster_labels = cl.get_clusters(X_matrix,
                                     random_state=SEED,
                                     **config['clustering'])

    # Train a linear model to recognize the formed topics
    X_train, X_test, y_train, y_test = train_test_split(X_matrix,
                                                        cluster_labels,
                                                        **config['cross_val'],
                                                        random_state=SEED)
    clf_lr = LogisticRegression(**config['model'])

    # MLflow tracking
    mlflow.set_tracking_uri("http://localhost:5000")
    mlflow.set_experiment(config['name_experiment'])
    with mlflow.start_run():
        clf_lr.fit(X_train, y_train)

        # Log the metrics (log_metric, not log_param, since these are numeric scores)
        y_pred = clf_lr.predict(X_test)
        mlflow.log_metric(
            'f1',
            cl.get_f1_score(y_test, y_pred, set(cluster_labels)))
        mlflow.log_metric('accuracy', accuracy_score(y_test, y_pred))
        mlflow.log_metric(
            'precision',
            cl.get_precision_score(y_test, y_pred, set(cluster_labels)))
        mlflow.sklearn.log_model(
            tfidf,
            artifact_path="vector",
            registered_model_name=f"{config['model_vec']}")
        mlflow.sklearn.log_model(clf_lr,
                                 artifact_path='model_lr',
                                 registered_model_name=f"{config['model_lr']}")
        mlflow.log_artifact(local_path='./train.py', artifact_path='code')
        mlflow.end_run()

    # Get the latest model versions and write them back to the config file
    client = MlflowClient()
    last_version_lr = get_version_model(config['model_lr'], client)
    last_version_vec = get_version_model(config['model_vec'], client)

    yaml_file = yaml.safe_load(open(config_path))
    yaml_file['predict']["version_lr"] = int(last_version_lr)
    yaml_file['predict']["version_vec"] = int(last_version_vec)

    with open(config_path, 'w') as fp:
        yaml.dump(yaml_file, fp, encoding='UTF-8', allow_unicode=True)
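
get_version_model is not shown in the example; a plausible implementation, assuming it simply asks the model registry for the newest version of a registered model (the body below is a guess, only the name and signature come from the call sites above):

from mlflow.tracking import MlflowClient

def get_version_model(model_name, client):
    # Hypothetical helper: return the highest version number registered
    # under model_name; the caller casts the result to int.
    versions = client.get_latest_versions(model_name)
    return max(int(v.version) for v in versions)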