def calc_client_prop(self, show_progress=False, user_info_koeff=0, squared=True, set_already_bough_prop=False):
    """Estimate a Bayes-style purchase propensity P(product | client basket) for
    every (client, product) pair and store it in ``self.client_prop``.

    For each client x and product y the score is P(x|y) * P(y) / P(x), where the
    basket likelihoods are averaged over all clients' rows of ``self.KPM``.

    Args:
        show_progress: print a progress bar via ``load.print_progress``.
        user_info_koeff: if non-zero, blend in a client-similarity weight built
            from ``get_info()`` (similarity = one-hot dot product / 5 features).
        squared: normalize co-purchase counts by ``len(basket)**2`` instead of
            ``len(basket)`` (matches the normalization used for P(x)).
        set_already_bough_prop: if truthy, products the client already bought get
            this fixed value instead of a computed probability.

    Side effects: sets ``self.client_prop`` (n_Kunden x n_Produkte float array).
    NOTE(review): a client with an empty basket divides by len(basket)**2 == 0
    (numpy emits a warning and produces nan/inf) — unchanged from the original.
    """
    client_prop = np.zeros((self.n_Kunden, self.n_Produkte))
    print("Calculate Propabilities:", "." * 100)
    if user_info_koeff != 0:
        user_info = get_info()
        # 5 is the number of one-hot encoded user features
        user_info_matrix = user_info.dot(user_info.T) / 5
    for kunden_index in range(self.n_Kunden):
        if show_progress:
            load.print_progress(kunden_index / self.n_Kunden, "Calculate Propabilities")
        kunden_vektor = self.KPM[kunden_index]
        kunden_buy_list = np.argwhere(kunden_vektor == 1)[:, 0]
        # P(x) depends only on the client's basket, not on the product —
        # hoisted out of the product loop (was recomputed n_Produkte times).
        P_x = sum(
            np.sum(self.KPM[:, kunden_buy_list], axis=1)
            / len(kunden_buy_list) ** 2) / self.n_Kunden
        # P(x|y) normalization differs with `squared`; P(x) always uses **2
        # (kept as-is from the original implementation).
        denom = len(kunden_buy_list) ** 2 if squared else len(kunden_buy_list)
        for produkt_index in range(self.n_Produkte):
            if produkt_index in kunden_buy_list and set_already_bough_prop:
                client_prop[kunden_index, produkt_index] = set_already_bough_prop
            else:
                P_y = self.occ[produkt_index]
                # clients who bought this product (computed once, reused below)
                buyers = np.argwhere(self.KPM[:, produkt_index] == 1)[:, 0]
                P_x_if_y = np.sum(self.KPM[buyers][:, kunden_buy_list], axis=1) / denom
                if user_info_koeff != 0:
                    # blend basket likelihood with client-similarity weighting
                    P_x_if_y = (
                        (1 - user_info_koeff) * P_x_if_y
                        + user_info_koeff * P_x_if_y * user_info_matrix[kunden_index, buyers])
                P_x_if_y = sum(P_x_if_y) / self.n_Kunden
                client_prop[kunden_index, produkt_index] = P_x_if_y * P_y / P_x
    self.client_prop = client_prop
def export_as_csv_in_tableau_format(self, pred_set, predictions):
    """Flatten a (clients x products) prediction matrix into a long-format CSV
    for Tableau, one row per (client, product) pair.

    Args:
        pred_set: key of the data split the predictions belong to (e.g. "test").
        predictions: 2D array of predicted probabilities, shape
            (n_clients, n_products).

    Side effects: writes ``Tableau_exports/<title>.csv`` (``;``-separated); the
    title encodes dataset, fit/pred sets, model type, approach, split and
    user-info flags. The directory must already exist.
    """
    n_Kunden, n_Produkte = predictions.shape
    # was named `dict`, shadowing the builtin — renamed to `records`
    # (column names, including the "propability" typo, are kept: they are
    # consumed downstream by Tableau workbooks)
    records = {
        "client": [],
        "content": [],
        "propability": [],
        "already_bought": []
    }
    for k in range(n_Kunden):
        if self.show_progress:
            load.print_progress(k / n_Kunden, "export")
        for p in range(n_Produkte):
            if self.split == "clients":
                # map row index back to the original client index of the split
                records["client"].append(
                    self.client_split_dict[pred_set]["indexes"][k])
                records["already_bought"].append(
                    self.client_split_dict[pred_set]["KPM"][k, p])
            else:
                records["client"].append(k)
                records["already_bought"].append(self.KPM_dict[pred_set][k, p])
            records["content"].append(p)
            records["propability"].append(predictions[k, p])
    title = self.dataset + "_predictions_" + "fit" + self.fit_set + "_pred" + pred_set + "_" + self.model_type + "_approach" + str(
        self.approach) + "_split" + self.split + "_info" + str(
            self.use_user_info) + self.info_string
    pd.DataFrame(records).to_csv("Tableau_exports/" + title + ".csv",
                                 index_label="Row_index",
                                 sep=";")
def train_batches(self, model, n_batches):
    """Incrementally fit *model* on batch files previously written by
    ``save_batches``.

    Args:
        model: an estimator supporting ``partial_fit`` (e.g. a sklearn NB model).
        n_batches: number of ``data_batch_no_<i>.npy`` / ``target_batch_no_<i>.npy``
            pairs under ``<dataset>/batches/``.

    Returns:
        The same model instance, fitted on all batches.
    """
    # class labels are loop-invariant — build them once
    classes = np.arange(self.n_Produkte)
    prefix = self.dataset + "/batches/"
    for batch_index in range(n_batches):
        if self.show_progress:
            load.print_progress(batch_index / n_batches, "train_batches")
        batch_data = np.load(prefix + "data_batch_no_" + str(batch_index) + ".npy")
        batch_target = np.load(prefix + "target_batch_no_" + str(batch_index) + ".npy")
        model.partial_fit(batch_data, batch_target, classes=classes)
    return model
def get_occ_matricies(self, show_progress=False):
    """Compute product purchase statistics from the client-product matrix.

    Returns a pair ``(if_occurence, occurence)`` and stores them on
    ``self.if_occ`` / ``self.occ``:
      * ``occurence[p]``  = P(p)      — share of clients who bought product p.
      * ``if_occurence[r, c]`` = P(r | c) — conditional purchase probability,
        0 when the conditioning product was never bought.
    """
    n_p, n_k = self.n_Produkte, self.n_Kunden
    cond = np.zeros((n_p, n_p))
    marginal = np.zeros(n_p)
    print("get_occ_matricies:", "." * 100)
    for row in range(n_p):
        marginal[row] = sum(self.KPM[:, row]) / n_k
        if show_progress:
            load.print_progress(row / n_p, "get_occ_matricies")
        # only the lower triangle is iterated; both (row,col) and (col,row)
        # are filled from the shared joint probability
        for col in range(row + 1):
            joint = self.KPM[:, row].dot(self.KPM[:, col]) / n_k
            # cond[row, col] = P(row | col); guard never-bought products
            cond[row, col] = joint / marginal[col] if marginal[col] != 0 else 0
            cond[col, row] = joint / marginal[row] if marginal[row] != 0 else 0
    self.if_occ = cond
    self.occ = marginal
    return cond, marginal
def predict_set_approach_1(self, set="test"):
    """Predict per-product purchase probabilities for every client of a split
    using the one-model-per-product list built by ``make_model_approach_1``.

    Args:
        set: split key ("test", "train", "full", ...). NOTE: shadows the builtin
            ``set`` — kept for caller compatibility.

    Returns:
        Float array of shape (n_clients, n_Produkte); column ``i`` holds
        ``predict_proba(...)[:, 1]`` of model ``i`` evaluated on the client
        vectors with product column ``i`` removed.
    """
    if self.split == "clients":
        kunden_vektor = self.client_split_dict[set]["KPM"]
    else:
        kunden_vektor = self.KPM_dict[set]
    if self.use_user_info:
        if self.split == "clients":
            kunden_vektor = np.hstack(
                (kunden_vektor, self.client_split_dict[set]["info"]))
        else:
            kunden_vektor = np.hstack((kunden_vektor, self.info_dict[set]))
    # BUG FIX: np.zeros_like(kunden_vektor) inherited the KPM's integer dtype,
    # silently truncating every probability to 0, and — with use_user_info —
    # also sized the output to include the appended user-info columns.
    prediction = np.zeros((kunden_vektor.shape[0], self.n_Produkte), dtype=float)
    for index in range(self.n_Produkte):
        if self.show_progress:
            load.print_progress(index / self.n_Produkte, "Prediction")
        # drop the target product's own column, matching how model `index`
        # was trained in make_model_approach_1
        prediction[:, index] = self.model_list[index].predict_proba(
            np.delete(kunden_vektor, index, axis=1))[:, 1]
    return prediction
def make_model_approach_1(self, set="full"):
    """Train one Naive-Bayes model per product: model ``i`` predicts column ``i``
    of the KPM from all other product columns (plus optional user info).

    Args:
        set: split key selecting the KPM (and user info) to train on.
            NOTE: shadows the builtin ``set`` — kept for caller compatibility.

    Raises:
        ValueError: if ``self.model_type`` is not one of the supported kinds.

    Side effects: sets ``self.model_list`` (length ``self.n_Produkte``).
    """
    if self.use_user_info:
        if self.split == "clients":
            user_info = self.client_split_dict[set]["info"]
        else:
            user_info = self.info_dict[set]
    if self.split == "clients":
        KPM = self.client_split_dict[set]["KPM"]
    else:
        KPM = self.KPM_dict[set]
    # dispatch table replaces the if/elif chain; an unknown model_type used to
    # leave `model` undefined and crash later with a NameError
    model_classes = {
        "multinomial": MultinomialNB,
        "bernoulli": BernoulliNB,
        "complement": ComplementNB,
        "gaussian": GaussianNB,
    }
    if self.model_type not in model_classes:
        raise ValueError("unknown model_type: " + str(self.model_type))
    self.model_list = []
    for prod_n in range(self.n_Produkte):
        if self.show_progress:
            load.print_progress(prod_n / self.n_Produkte, "make_model")
        target = KPM[:, prod_n]
        # features: every product column except the one being predicted
        data = np.delete(KPM, prod_n, axis=1)
        if self.use_user_info:
            data = np.hstack((data, user_info))
        model = model_classes[self.model_type]()
        model.fit(data, target)
        self.model_list.append(model)
def save_batches(self, data, target):
    """Split (data, target) into batches of ``self.batch_size`` and save each
    batch as a pair of ``.npy`` files under ``<dataset>/batches/``.

    Args:
        data: sliceable sequence/array of feature rows.
        target: sliceable sequence/array of labels, same length as ``data``.

    Returns:
        The number of batches written.
    """
    # BUG FIX: np.round(len/bs + 0.5) rounds half-to-even, so e.g. 6 items with
    # batch_size 2 produced round(3.5) == 4 batches (the last one empty).
    # Integer ceiling division gives the correct count for every length.
    n_batches = -(-len(data) // self.batch_size)
    for batch_index in range(n_batches):
        if self.show_progress:
            load.print_progress(batch_index / n_batches, "save_batches")
        start = batch_index * self.batch_size
        stop = start + self.batch_size
        # slicing past the end is safe in Python/NumPy, so the final batch
        # needs no special case
        batch_data = data[start:stop]
        batch_target = target[start:stop]
        np.save(
            self.dataset + "/batches/data_batch_no_" + str(batch_index) +
            ".npy", batch_data)
        np.save(
            self.dataset + "/batches/target_batch_no_" + str(batch_index) +
            ".npy", batch_target)
    return n_batches
def make_model_approach_2(self, set="full"):
    """Train a single multi-class Naive-Bayes model that predicts *which*
    product a client bought: for every 1-entry of the KPM one training sample
    is generated whose features are the client's row with that product masked
    out (plus optional user info) and whose label is the product index.

    Args:
        set: split key selecting the KPM (and user info) to train on.
            NOTE: shadows the builtin ``set`` — kept for caller compatibility.

    Raises:
        ValueError: if ``self.model_type`` is not one of the supported kinds.

    Side effects: sets ``self.model``; when ``self.batch_learning`` is on, the
    generated samples are written to disk via ``save_batches`` and the model is
    fitted incrementally via ``train_batches``.
    """
    if self.use_user_info:
        if self.split == "clients":
            user_info = self.client_split_dict[set]["info"]
        else:
            user_info = self.info_dict[set]
    if self.split == "clients":
        KPM = self.client_split_dict[set]["KPM"]
    else:
        KPM = self.KPM_dict[set]
    n_k, n_p = KPM.shape
    target = []
    data = []
    for kunden_index in range(n_k):
        if self.show_progress:
            load.print_progress(kunden_index / n_k, "prepare data")
        for produkt_index in np.argwhere(KPM[kunden_index] == 1)[:, 0]:
            target.append(produkt_index)
            # copy the row, then mask the label product out of the features
            var_Kunde = np.array(KPM[kunden_index])
            var_Kunde[produkt_index] = 0
            if self.use_user_info:
                var_Kunde = np.hstack((var_Kunde, user_info[kunden_index]))
            data.append(var_Kunde)
    print("target", len(target))
    print("data", len(data))
    # dispatch table replaces the if/elif chain; an unknown model_type used to
    # leave `model` undefined and crash later with a NameError
    model_classes = {
        "multinomial": MultinomialNB,
        "bernoulli": BernoulliNB,
        "complement": ComplementNB,
        "gaussian": GaussianNB,
    }
    if self.model_type not in model_classes:
        raise ValueError("unknown model_type: " + str(self.model_type))
    model = model_classes[self.model_type]()
    if self.batch_learning:
        n_batches = self.save_batches(data, target)
        print("############################", n_batches,
              " ########################################")
        print("now_train")
        model = self.train_batches(model, n_batches)
    else:
        model.fit(data, target)
    self.model = model