def fit_and_predict(self, KPM, user_info=0, method="Naive"):
        """Score every (customer, product) pair of ``KPM`` with Bayes' rule.

        For a customer with purchase set x and a candidate product y the score
        is ``P(x | y) * P(y) / P(x)``. ``P(y)`` is read from
        ``self.get_prod_occ(KPM, n_Kunden)`` and ``P(x | y)`` is estimated on
        the rows of ``KPM`` whose entry for y equals 1.

        ``method`` selects how the probability of a purchase set is estimated
        on a matrix:

        * ``"Naive"``: product of the per-product occurrences.
        * ``"Approx"``: sum of the purchased-product columns divided by
          (number of purchased products * number of rows).
        * ``"Squared"``: per-row mean of those columns, squared, averaged
          over the rows.
        * ``"Empirical"``: share of rows whose per-row mean equals 1.

        :param KPM: 2-D array, one row per customer and one column per
            product; an entry equal to 1 marks a purchase.
        :param user_info: unused, kept for interface compatibility.
        :param method: "Naive", "Approx", "Squared" or "Empirical".
        :return: float array shaped like ``KPM`` holding the scores. Entries
            stay 0 when P(x) is 0 or when no row contains the product.
        :raises ValueError: if ``method`` is not a supported name.
        """
        if method not in ("Naive", "Approx", "Squared", "Empirical"):
            raise ValueError("unknown method: " + repr(method))

        def set_probability(matrix, occurrences, buy_list):
            # Estimate the probability of the purchase set `buy_list` on
            # `matrix` with the estimator selected by `method`.
            n_rows = len(matrix)
            if method == "Naive":
                probability = 1
                for index in buy_list:
                    probability *= occurrences[index]
                return probability
            bought_columns = matrix[:, buy_list]
            if method == "Approx":
                return np.sum(bought_columns,
                              axis=None) / (len(buy_list) * n_rows)
            row_share = np.sum(bought_columns, axis=1) / len(buy_list)
            if method == "Squared":
                return sum(row_share**2) / n_rows
            return len(np.argwhere(row_share == 1)) / n_rows

        n_Kunden, n_Produkte = np.shape(KPM)
        prod_occ = self.get_prod_occ(KPM, n_Kunden)
        # Explicit float buffer: zeros_like(KPM) alone inherits KPM's dtype
        # and would truncate the scores for an integer matrix.
        predictions = np.zeros_like(KPM, dtype=float)

        # A single loop over the customers. The former nested duplicate of
        # this loop left kunden_index at the last customer on every pass.
        for kunden_index in range(n_Kunden):
            if kunden_index == 0:
                load = loader(full=n_Kunden, message="predict")
            load.print_progress(kunden_index)

            kunden_vektor = KPM[kunden_index]
            kunden_buy_list = np.argwhere(kunden_vektor == 1)[:, 0]
            # P(x) depends only on the customer, so compute it once per row.
            P_x = set_probability(KPM, prod_occ, kunden_buy_list)
            if P_x == 0:
                continue  # scores for this customer stay 0

            for prod_index in range(n_Produkte):
                item_buy_list = np.argwhere(KPM[:, prod_index] == 1)[:, 0]
                if len(item_buy_list) == 0:
                    # Nobody bought the product: P(x | y) is undefined,
                    # keep the score at 0 instead of dividing by zero.
                    continue
                P_y = prod_occ[prod_index]
                reduced_KPM = KPM[item_buy_list]
                reduced_prod_occ = self.get_prod_occ(reduced_KPM,
                                                     len(reduced_KPM))
                P_x_if_y = set_probability(reduced_KPM, reduced_prod_occ,
                                           kunden_buy_list)
                predictions[kunden_index, prod_index] = P_x_if_y * P_y / P_x
        return predictions
    def predict(self, ds=0):
        """Predict with the stored model(s) and persist the result.

        * approach "multi": one model (title ``<dataset>_model``) predicts
          all columns at once from the KPM of the prediction set.
        * approach "binary": one model per product (title
          ``<dataset>_model_no_<index>``) predicts column ``index`` from the
          KPM with that column removed; column 1 of the model output is
          stored.

        If ``config["use_user_info"]`` is set, the "info" values of the
        prediction set are appended as extra feature columns.

        The prediction is saved to
        ``<dataset>/npy_files/<model_name>_predictions``, split into batches
        through ``ds.save_batches`` (the batch count is stored in
        ``config["n_pred_batches"]``), and the config is written to
        ``<dataset>/json_files/config.json``.

        :param ds: data server providing ``get_value`` and ``save_batches``.
        :raises ValueError: if ``config["approach"]`` or ``config["model"]``
            is not supported.
        """
        approach = self.config["approach"]
        model_kind = self.config["model"]
        # Fail early instead of running into an unbound `prediction` later
        # or silently saving an all-zero matrix.
        if approach not in ("multi", "binary"):
            raise ValueError("unknown approach: " + repr(approach))
        if model_kind not in ("DeepLearning", "NaiveBayes"):
            raise ValueError("unknown model: " + repr(model_kind))

        if approach == "multi":
            x = ds.get_value("KPM", self.config["pred_set"])

            if self.config["use_user_info"]:
                x = np.hstack((x, ds.get_value("info",
                                               self.config["pred_set"])))

            title = self.config["dataset"] + "_model"
            model = self.load_model(title)

            if model_kind == "DeepLearning":
                prediction = model.predict(x)
            else:
                prediction = model.predict_proba(x)

        else:
            KPM = ds.get_value("KPM", self.config["pred_set"])
            # Explicit float buffer: zeros_like(KPM) alone inherits KPM's
            # dtype and would truncate probabilities for an integer matrix.
            prediction = np.zeros_like(KPM, dtype=float)

            if self.config["use_user_info"]:
                KPM = np.hstack(
                    (KPM, ds.get_value("info", self.config["pred_set"])))

            for index in range(self.config["n_Produkte"]):
                if self.config["show_progress"]:
                    if index == 0:
                        load = loader(self.config["n_Produkte"], "predict")
                    load.print_progress(index)

                # The model for product `index` was trained without that
                # product's own column.
                x = np.delete(KPM, index, axis=1)
                title = self.config["dataset"] + "_model_no_" + str(index)
                model = self.load_model(title)

                if model_kind == "DeepLearning":
                    prediction[:, index] = model.predict(x)[:, 1]
                else:
                    prediction[:, index] = model.predict_proba(x)[:, 1]

        title = self.config["model_name"] + "_predictions"
        np.save(self.config["dataset"] + "/npy_files/" + title, prediction)
        self.config["n_pred_batches"] = ds.save_batches(
            data=[prediction],
            names=["prediction"],
            batch_size=self.config["pred_batch_size"])

        with open(self.config["dataset"] + "/json_files/config.json",
                  "w") as fp:
            json.dump(self.config, fp, indent=5)
Beispiel #3
0
    def save_batches(self, data=[], names=[], batch_size=100):
        """
        Split every sequence in ``data`` into consecutive batches and save
        each batch as its own .npy file.

        Batch ``i`` of ``data[k]`` is written to
        self.config["dataset"] + "/batches/" + names[k] + "_batch_no_" + str(i) + ".npy"

        :param data: list/tuple of equally long sequences (never mutated, so
               the shared list default is harmless)
        :param names: one file-name stem per entry of ``data``
        :param batch_size: maximum number of items per batch
        :return: number of batches written per sequence (0 for empty input)
        :raises ValueError: if the sequences differ in length or there are
                fewer names than sequences
        """
        N = len(data)
        if N == 0:
            return 0
        if len(names) < N:
            raise ValueError(
                "save_batches needs one name per data entry, got " +
                str(len(names)) + " names for " + str(N) + " entries")

        len_check = len(data[0])
        if any(len(dat) != len_check for dat in data[1:]):
            # Raise instead of sys.exit(0): exiting with status 0 would
            # report success to the calling process.
            raise ValueError(
                "shapes dont fit for method save_batches. data should be a tuple of iterables of the same length "
            )

        # Ceiling division. round(x + 0.5) rounds halves to even and so
        # produced an extra, empty batch whenever len_check was an odd
        # multiple of batch_size.
        n_batches = -(-len_check // batch_size)
        for batch_index in range(n_batches):

            if self.config["show_progress"]:
                if batch_index == 0:
                    load = loader(n_batches, "save_batches")
                load.print_progress(batch_index)

            # Slicing clamps at the end, so the last (possibly shorter)
            # batch needs no special case.
            start = batch_index * batch_size
            stop = start + batch_size
            for name, dat in zip(names, data):
                np.save(
                    self.config["dataset"] + "/batches/" + name +
                    "_batch_no_" + str(batch_index) + ".npy",
                    dat[start:stop])

        return n_batches
def export_as_csv_in_tableau_format(data_server, config={}):
    """Export the saved prediction batches in long format.

    For every batch ``i`` in ``range(config["n_pred_batches"])`` the files
    ``prediction_batch_no_i.npy``, ``KPM_batch_no_i.npy`` and
    ``indexes_batch_no_i.npy`` are loaded from ``<dataset>/batches/`` and
    turned into one row per (client, content) pair with the columns
    ``client``, ``content``, ``propability`` and ``already_bought``.

    Depending on the config the rows are written to
    ``Tableau_exports/<model_name>_predictions.csv`` and/or inserted into
    the recommendations table, prefixed with a run id derived from the
    current time. The table is dropped (if present) and recreated on every
    call.

    All batches go into the same CSV file: the first batch creates it, later
    batches are appended with a continuing ``Row_index``. Previously every
    batch overwrote the file, so only the last batch survived.

    :param data_server: unused.
    :param config: run configuration; ``config["pred_batch_size"]`` is
        lowered in place when a non-final batch is shorter than configured.
    """
    # NOTE(review): host and credentials are hard-coded in source; they
    # should come from configuration or the environment. Left unchanged
    # here so the connection behaviour stays the same.
    db_config = {
        "database": {
            "host": "192.168.178.14",
            "user": "******",
            "passwd": "$moothOperat0r",
            "database": "dsaas"
        },
        "schema_name":
        "dbt_recommender_system_predictions",
        "table_name":
        "tb_b_recommendations_" + config["model_name"],
        "key_list":
        ['run_id', 'client', 'content', 'propability', 'already_bought'],
        "dtype_list": ["int8", "int", "int", "float", "int"],
        "primary_key":
        None,
        "auto_increment":
        None
    }
    run_id = to_integer(datetime.datetime.now())
    if table_exists(db_config) == 1:
        delete_table(db_config)
    create_table(db_config)

    title = config["model_name"] + "_predictions"
    rows_exported = 0  # offset that keeps Row_index unique across batches

    for batch_index in range(config["n_pred_batches"]):

        if config["show_progress"] and config["n_pred_batches"] != 1:
            if batch_index == 0:
                load = loader(config["n_pred_batches"], "export")
            load.print_progress(batch_index)

        batch_suffix = "_batch_no_" + str(batch_index) + ".npy"
        batch_dir = config["dataset"] + "/batches/"
        predictions = np.load(batch_dir + "prediction" + batch_suffix)
        KPM = np.load(batch_dir + "KPM" + batch_suffix)
        indexes = np.load(batch_dir + "indexes" + batch_suffix)

        is_last_batch = batch_index == config["n_pred_batches"] - 1
        if len(predictions) < config["pred_batch_size"] and not is_last_batch:
            print("""
                    prediction batch size ("pred_batch_size" = %d) is too high.
                    change value in config file.
                    For now it has been set to the length of the prediction.
                    """ % config["pred_batch_size"])
            config["pred_batch_size"] = len(predictions)

        n_Kunden, n_Produkte = predictions.shape
        use_client_ids = config["split"] == "clients"
        columns = {
            "client": [],
            "content": [],
            "propability": [],
            "already_bought": []
        }
        for k in range(n_Kunden):

            if config["show_progress"] and config["n_pred_batches"] == 1:
                if k == 0:
                    load = loader(
                        n_Kunden, "export batch: " + str(batch_index + 1) +
                        " von" + str(config["n_pred_batches"]))
                load.print_progress(k)

            # NOTE(review): outside the "clients" split the client id is the
            # row position inside the current batch, so it restarts at 0
            # for every batch - confirm this is intended.
            client = indexes[k] if use_client_ids else k
            columns["client"].extend([client] * n_Produkte)
            columns["content"].extend(range(n_Produkte))
            columns["propability"].extend(predictions[k])
            columns["already_bought"].extend(KPM[k, :n_Produkte])

        df = pd.DataFrame(columns)
        df.index = df.index + rows_exported
        rows_exported += len(df)

        if config["save_result_as_csv"]:
            first_batch = batch_index == 0
            df.to_csv("Tableau_exports/" + title + ".csv",
                      index_label="Row_index",
                      sep=";",
                      mode="w" if first_batch else "a",
                      header=first_batch)
        if config["save_result_to_db"]:
            values = np.hstack((np.ones((len(df), 1)) * run_id, df.values))
            insert_to_table(db_config, values, config["show_progress"])
    def predict(self, test_KPM, method):
        """Score every (test customer, product) pair with Bayes' rule.

        For a test customer with purchase set x and a candidate product y
        the score is ``P(x | y) * P(y) / P(x)``. ``P(y)`` is read from
        ``self.prod_occ``, ``P(x)`` is estimated on ``self.train_KPM``, and
        ``P(x | y)`` on the rows of ``self.train_KPM`` whose entry for y
        equals 1.

        ``method`` selects how the probability of a purchase set is
        estimated on a matrix:

        * ``"Naive"``: product of the per-product occurrences.
        * ``"Approx"``: sum of the purchased-product columns divided by
          (number of purchased products * number of rows).
        * ``"Squared"``: per-row mean of those columns, squared, averaged
          over the rows.
        * ``"Empirical"``: share of rows whose per-row mean equals 1.

        :param test_KPM: 2-D array, one row per customer and one column per
            product; an entry equal to 1 marks a purchase.
        :param method: "Naive", "Approx", "Squared" or "Empirical".
        :return: float array shaped like ``test_KPM`` holding the scores.
            Entries stay 0 when P(x) is 0 or no training row contains the
            product.
        :raises ValueError: if ``method`` is not a supported name.
        """
        if method not in ("Naive", "Approx", "Squared", "Empirical"):
            raise ValueError("unknown method: " + repr(method))

        def purchase_set_probability(matrix, occurrences, buy_list, n_rows):
            # One estimator for both P(x) (full training matrix) and
            # P(x | y) (training rows that contain product y).
            if method == "Naive":
                probability = 1
                for index in buy_list:
                    probability *= occurrences[index]
                return probability
            bought_columns = matrix[:, buy_list]
            if method == "Approx":
                return np.sum(bought_columns,
                              axis=None) / (len(buy_list) * n_rows)
            row_share = np.sum(bought_columns, axis=1) / len(buy_list)
            if method == "Squared":
                return sum(row_share**2) / n_rows
            return len(np.argwhere(row_share == 1)) / n_rows

        # Explicit float buffer: zeros_like(test_KPM) alone inherits the
        # input dtype and would truncate scores for an integer matrix.
        predictions = np.zeros_like(test_KPM, dtype=float)
        n_test_Kunden, n_Produkte = test_KPM.shape
        for kunden_index in range(n_test_Kunden):
            if kunden_index == 0:
                load = loader(full=n_test_Kunden, message="predict")
            load.print_progress(kunden_index)

            kunden_vektor = test_KPM[kunden_index]
            kunden_buy_list = np.argwhere(kunden_vektor == 1)[:, 0]
            # P(x) does not depend on the product: compute it once per
            # customer instead of once per (customer, product) pair.
            P_x = purchase_set_probability(self.train_KPM, self.prod_occ,
                                           kunden_buy_list, self.n_Kunden)

            for prod_index in range(n_Produkte):
                P_y = self.prod_occ[prod_index]
                item_buy_list = np.argwhere(
                    self.train_KPM[:, prod_index] == 1)[:, 0]
                reduced_KPM = self.train_KPM[item_buy_list]
                n_reduced_Kunden = len(reduced_KPM)
                reduced_prod_occ = self.get_prod_occ(reduced_KPM,
                                                     n_reduced_Kunden)
                if n_reduced_Kunden == 0 or P_x == 0:
                    # No training customer bought the product, or P(x) is
                    # zero: the score stays at its initial value 0.
                    continue
                P_x_if_y = purchase_set_probability(reduced_KPM,
                                                    reduced_prod_occ,
                                                    kunden_buy_list,
                                                    n_reduced_Kunden)
                predictions[kunden_index, prod_index] = P_x_if_y * P_y / P_x

        return predictions
def eval(dataset,
         prediction_filename,
         split,
         set,
         threshold=0.5,
         top_n_test=True,
         top_n=20):
    """Print evaluation metrics for a saved prediction matrix.

    The ground truth is loaded from ``<dataset>/npy_files/``: for split
    "clients" the rows of ``full_KPM.npy`` selected by ``<set>_index.npy``,
    for split "orders" the file ``<set>_KPM.npy``. Entries equal to 1 count
    as purchases.

    :param dataset: dataset directory (path prefix).
    :param prediction_filename: file name inside ``<dataset>/npy_files/``.
    :param split: "clients" or "orders".
    :param set: name of the evaluated subset, used to build file names.
    :param threshold: scores above it are classified as purchases; a
        negative value selects ``1 / n_Produkte``.
    :param top_n_test: if true, also print the share of purchases that are
        among each client's ``top_n`` highest-scored products.
    :param top_n: size of the recommendation list for the top-n test.
    :raises ValueError: if ``split`` is not "clients" or "orders".
    """
    if split == "clients":
        indexes = np.load(dataset + "/npy_files/" + set + "_index.npy")
        KPM = np.load(dataset + "/npy_files/full_KPM.npy")[indexes] == 1
    elif split == "orders":
        KPM = np.load(dataset + "/npy_files/" + set + "_KPM.npy") == 1
    else:
        raise ValueError("unknown split: " + repr(split))

    n_Kunden, n_Produkte = KPM.shape

    print("Kunden", n_Kunden)
    print("Produkte", n_Produkte)
    print("Interaktionen", np.sum(np.sum(KPM)))

    if threshold < 0:
        threshold = 1 / n_Produkte

    predictions = np.load(dataset + "/npy_files/" + prediction_filename)
    classifications = predictions.flatten() > threshold

    if top_n_test:
        n_orders = np.sum(KPM, axis=None)
        n_hits = 0
        for client_index in range(n_Kunden):
            if client_index == 0:
                load = loader(len(predictions), "evaluation")
            load.print_progress(client_index)
            bought_items = np.argwhere(KPM[client_index] == 1)[:, 0]
            if len(bought_items) == 0:
                continue
            # Rank the client's products once (descending score, ties
            # broken by higher product index) instead of re-sorting for
            # every bought item.
            client_scores = predictions[client_index]
            ranked = sorted(zip(client_scores,
                                np.arange(len(client_scores))),
                            reverse=True)
            top_items = np.array(ranked)[:, 1][:top_n]
            n_hits += int(np.isin(bought_items, top_items).sum())
        score = n_hits / n_orders
        print(
            str(score * 100) + "%\t(", n_hits, "von", n_orders,
            ") \tder getätigten käufte sind in der top", top_n,
            "der Produktempfehlungen")

    KPM = KPM.flatten()
    print("Kpm flattened")

    flat_predictions = predictions.flatten()
    print(prediction_filename + ":")
    print("MSE", metrics.mean_squared_error(KPM, flat_predictions))
    print("neg_log_loss", metrics.log_loss(KPM, flat_predictions))
    print("Accuracy", metrics.accuracy_score(KPM, classifications))
    print("Precision", metrics.precision_score(KPM, classifications))
    print("Recall", metrics.recall_score(KPM, classifications))
    print("F1", metrics.f1_score(KPM, classifications))

    print("Confusion Matrix (tn,fp,fn,tp)")
    print(metrics.confusion_matrix(KPM, classifications))
Beispiel #7
0
    def get_training_data(self, train_set):
        """
        Generate the training data for the configured approach and save it
        in batches.

        * approach "multi": one sample per purchase. The target is the
          bought product and the features are the customer's KPM row with
          that product set to 0 (plus the user info, if enabled). One
          all-zero sample per product is placed in front, so every product
          appears as a target at least once.
        * approach "binary": for every product, the target is its KPM
          column and the features are the KPM without that column (plus the
          user info, if enabled). The batches are named
          "data_model_<n>" / "target_model_<n>".

        The number of batches is stored in ``config["n_train_batches"]``.

        :param train_set: name of the subset passed to ``self.get_value``
        :return: None
        """
        use_user_info = self.config["use_user_info"]
        if use_user_info:
            user_info = self.get_value("info", train_set)
            self.config["n_info_cols"] = len(user_info[0])

        KPM = self.get_value("KPM", train_set)
        n_k, _ = KPM.shape

        if self.config["approach"] == "multi":
            n_Produkte = self.config["n_Produkte"]
            # Only count the info columns when they are really appended
            # below; otherwise the padding rows would be wider than the
            # samples (or config["n_info_cols"] might not exist at all).
            row_width = n_Produkte
            if use_user_info:
                row_width += self.config["n_info_cols"]

            target = list(range(n_Produkte))
            data = [[0] * row_width for _ in range(n_Produkte)]

            for kunden_index in range(n_k):
                if self.config["show_progress"]:
                    if kunden_index == 0:
                        load = loader(n_k, "save_batches")
                    load.print_progress(kunden_index)

                for produkt_index in np.argwhere(KPM[kunden_index] > 0)[:, 0]:
                    target.append(produkt_index)
                    # Copy the row so that KPM itself is not modified.
                    var_Kunde = np.array(KPM[kunden_index])
                    var_Kunde[produkt_index] = 0
                    if use_user_info:
                        var_Kunde = np.hstack(
                            (var_Kunde, user_info[kunden_index]))
                    data.append(var_Kunde)

            self.config["n_train_batches"] = self.save_batches(
                data=(data, target),
                names=["data", "target"],
                batch_size=self.config["train_batch_size"])

        elif self.config["approach"] == "binary":
            for prod_n in range(self.config["n_Produkte"]):
                if self.config["show_progress"]:
                    if prod_n == 0:
                        load = loader(full=self.config["n_Produkte"],
                                      message="save_batches")
                    load.print_progress(prod_n)

                target = KPM[:, prod_n]
                data = np.delete(KPM, prod_n, axis=1)
                if use_user_info:
                    data = np.hstack((data, user_info))

                # Silence the progress output of save_batches while the
                # per-product progress is shown here; restore the setting
                # even if saving fails.
                show_progress = self.config["show_progress"]
                self.config["show_progress"] = False
                try:
                    self.config["n_train_batches"] = self.save_batches(
                        data=[data, target],
                        names=[
                            "data_model_" + str(prod_n),
                            "target_model_" + str(prod_n)
                        ],
                        batch_size=self.config["train_batch_size"])
                finally:
                    self.config["show_progress"] = show_progress
    def fit(self, names=["data", "target"], ds=0):
        """Train and save the model(s) for the configured approach.

        * approach "multi": a single model is trained batch by batch on the
          files ``<dataset>/batches/<names[0]>_batch_no_<i>.npy`` (features)
          and ``<names[1]>_batch_no_<i>.npy`` (targets) and saved under the
          title ``<dataset>_model``. A "DeepLearning" model is fitted on
          one-hot targets; any other model uses ``fit`` for a single batch
          and ``partial_fit`` otherwise.
        * approach "binary": one model per product is trained to predict
          that product's KPM column from the remaining columns (plus the
          user info, if enabled) and saved under the title
          ``<dataset>_model_no_<index>``.

        :param names: file-name stems of the feature and target batches
            (only read, so the shared list default is harmless).
        :param ds: data server providing ``get_value``; only used by the
            "binary" approach.
        """
        if self.config["approach"] == "multi":
            model = self.get_model()
            n_Produkte = self.config["n_Produkte"]
            classes = np.arange(n_Produkte)
            n_batches = self.config["n_train_batches"]
            if n_batches:
                batch_dir = self.config["dataset"] + "/batches/"
                for batch_index in range(n_batches):
                    if self.config["show_progress"]:
                        if batch_index == 0:
                            load = loader(n_batches, "train_batches")
                        load.print_progress(batch_index)

                    batch_suffix = "_batch_no_" + str(batch_index) + ".npy"
                    x = np.load(batch_dir + names[0] + batch_suffix)
                    t = np.load(batch_dir + names[1] + batch_suffix)

                    if self.config["model"] == "DeepLearning":
                        # One-hot encode the product index of every sample.
                        T = np.zeros((len(t), n_Produkte))
                        T[np.arange(len(t)), t] = 1
                        model.fit(
                            x,
                            T,
                            epochs=self.config["DeepLearning"]["n_epochs"],
                            verbose=True)
                    elif n_batches == 1:
                        model.fit(x, t)
                    else:
                        # Incremental training needs the full class list
                        # because one batch may not contain every product.
                        model.partial_fit(x, t, classes)

            title = self.config["dataset"] + "_model"
            self.save_model(title, model)

        elif self.config["approach"] == "binary":
            # The training matrix is the same for every product, so load
            # and extend it once instead of once per product.
            KPM = ds.get_value("KPM", self.config["fit_set"])
            if self.config["use_user_info"]:
                KPM = np.hstack(
                    (KPM, ds.get_value("info", self.config["fit_set"])))

            for index in range(self.config["n_Produkte"]):
                model = self.get_model()

                if self.config["show_progress"]:
                    if index == 0:
                        load = loader(self.config["n_Produkte"], "train")
                    load.print_progress(index)

                x = np.delete(KPM, index, axis=1)
                t = KPM[:, index]

                if self.config["model"] == "DeepLearning":
                    # Two-column one-hot target: column 0 = not bought,
                    # column 1 = bought.
                    T = np.zeros((len(t), 2))
                    T[np.arange(len(t)), t.astype(int)] = 1
                    model.fit(x,
                              T,
                              epochs=self.config["DeepLearning"]["n_epochs"],
                              verbose=True)
                elif self.config["model"] == "NaiveBayes":
                    model.fit(x, t)

                title = self.config["dataset"] + "_model_no_" + str(index)
                self.save_model(title, model)