def train(self,
          training_data_t: pd.DataFrame,
          training_data_f: pd.DataFrame,
          user_id=None):
    """Fit one marginal per score type and one copula per non-main axis.

    For every entry of ``util.DEFAULT_SCORE_TYPE_LIST`` a ``marginal.Norm``
    is built from the matching column of ``training_data_t`` and passed to
    ``util.kl_divergence_between_population_and_users``. The score type with
    the largest returned value becomes ``self.main_axis``. Every other score
    type is then paired with the main axis: both columns are mapped through
    their own marginal's ``cdf`` and the resulting two-column matrix is used
    to build a ``copula.Copula`` stored in ``self.copula_dict[score_type]``.

    Side effects: resets and fills ``self.copula_dict``, ``self.marg_dict``
    and ``self.kl_dict``; sets ``self.main_axis``. ``self.kl_dict['sum']``
    ends up as the sum of all KL values minus the largest one.

    Args:
        training_data_t: Frame holding one column per score type.
        training_data_f: Not used by this implementation.
        user_id: Not used by this implementation.
    """
    self.copula_dict = {}
    self.marg_dict = {}
    self.kl_dict = {'sum': 0}

    max_kl_value = -1000
    for score_type in util.DEFAULT_SCORE_TYPE_LIST:
        marg = marginal.Norm(training_data_t[score_type])
        self.marg_dict[score_type] = marg
        kl = util.kl_divergence_between_population_and_users(
            marg, score_type)
        self.kl_dict['sum'] += kl
        self.kl_dict[score_type] = kl
        if kl > max_kl_value:
            max_kl_value = kl
            self.main_axis = score_type
    # The main axis is excluded from the aggregated KL value.
    self.kl_dict['sum'] -= max_kl_value

    # Reuse the marginal fitted above on the very same column instead of
    # fitting a second, identical one.
    main_marg = self.marg_dict[self.main_axis]
    main_cdf_list = [
        main_marg.cdf(x) for x in training_data_t[self.main_axis]
    ]

    for score_type in util.DEFAULT_SCORE_TYPE_LIST:
        if score_type == self.main_axis:
            continue
        # Each axis must be transformed with its OWN marginal. The loop
        # variable left over from the fitting loop above would always be the
        # marginal of the last score type, so look the right one up.
        sub_marg = self.marg_dict[score_type]
        sub_cdf_list = [sub_marg.cdf(x) for x in training_data_t[score_type]]
        # Rows are observations, columns are (main axis, current axis).
        cdf_matrix = np.matrix([main_cdf_list, sub_cdf_list]).T
        self.copula_dict[score_type] = copula.Copula(cdf_matrix, self.cop)
def create_weight_and_score_model_list(
        hotel_cluster: List[pd.DataFrame],
        marg_name: str,
        cop: str,
        score_type_list: List[str],
        marg_option=None,
) -> List[Tuple[float, dict]]:
    """Build a weighted list of per-cluster scoring models.

    Clusters with fewer than two rows are ignored. For each remaining
    cluster, one marginal model per score type is created through
    ``marginal.factory_marg`` and parameterised with ``set_param`` on that
    cluster's column; the column is then mapped through the model's ``cdf``.
    The per-score-type CDF columns form the matrix handed to
    ``copula.Copula``.

    Args:
        hotel_cluster: Clusters of items, one frame per cluster.
        marg_name: Forwarded to ``marginal.factory_marg`` as ``marg_name``.
        cop: Forwarded to ``copula.Copula`` as its second argument.
        score_type_list: Columns to model in every cluster.
        marg_option: Forwarded to ``marginal.factory_marg`` as
            ``marg_option``.

    Returns:
        One ``(weight, model)`` tuple per kept cluster. ``weight`` is the
        cluster's share of all kept rows; ``model`` maps each score type to
        its marginal model and the key ``'copula'`` to the fitted copula.
    """
    usable_clusters = [frame for frame in hotel_cluster if len(frame) > 1]
    n_items = sum(len(frame) for frame in usable_clusters)

    weighted_models = []
    for frame in usable_clusters:
        model_by_key = {}
        cdf_rows = []
        for score_type in score_type_list:
            observed = frame[score_type].values
            marg_model = marginal.factory_marg(marg_name=marg_name,
                                               marg_option=marg_option)
            marg_model.set_param(training_data=observed,
                                 score_type=score_type)
            cdf_rows.append([marg_model.cdf(value) for value in observed])
            model_by_key[score_type] = marg_model

        # Construct copula (rows: observations, columns: score types)
        model_by_key['copula'] = copula.Copula(np.matrix(cdf_rows).T, cop)

        weight = len(frame) / n_items
        weighted_models.append((weight, model_by_key))
    return weighted_models
def train(self,
          training_data_t: pd.DataFrame,
          training_data_f: pd.DataFrame,
          user_id):
    """Fit marginals and build a chain of two-dimensional Gumbel copulas.

    For every entry of ``util.DEFAULT_SCORE_TYPE_LIST`` a ``marginal.Norm``
    is built from the matching column of ``training_data_t`` and passed to
    ``util.kl_divergence_between_population_and_users``. Score types whose
    KL value exceeds 0.05 are kept and ordered by ascending KL value. If
    fewer than two survive, the full default list is used instead (in its
    original order), because a chain needs at least two axes.

    One ``copula.Copula`` of family ``'gumbel'`` with ``dim=2`` is created
    for every axis after the first; its parameter is
    ``1 + self.param_a * log1p(kl)`` for that axis' KL value.

    Side effects: sets ``self.marg_dict``, ``self.main_axis`` (score type
    with the largest KL value), ``self.score_type_list`` and
    ``self.nested_copula_list``.

    Args:
        training_data_t: Frame holding one column per score type.
        training_data_f: Not used by this implementation.
        user_id: Not used by this implementation.
    """
    # Local import: the file's import block is not guaranteed to provide it.
    import logging

    logger = logging.getLogger(__name__)

    marg_dict = {}
    kl_dict = {}
    # Start below any attainable value so main_axis is always (re)assigned,
    # even when no KL value is strictly positive.
    max_kl_value = float('-inf')
    for score_type in util.DEFAULT_SCORE_TYPE_LIST:
        marg = marginal.Norm(training_data_t[score_type])
        marg_dict[score_type] = marg
        kl = util.kl_divergence_between_population_and_users(
            marg, score_type)
        kl_dict[score_type] = kl
        if kl > max_kl_value:
            max_kl_value = kl
            self.main_axis = score_type
    self.marg_dict = marg_dict
    logger.debug('KL divergence per score type: %s', kl_dict)

    informative = {k: v for k, v in kl_dict.items() if v > 0.05}
    if len(informative) < 2:
        # Zero or one informative axis cannot form a pair; use every axis.
        # NOTE(review): this fallback list is not sorted by KL value, unlike
        # the filtered path - confirm that is intended.
        self.score_type_list = util.DEFAULT_SCORE_TYPE_LIST
    else:
        # sorted() is stable, so ties keep their insertion order.
        self.score_type_list = sorted(informative, key=informative.get)

    for score_type in self.score_type_list:
        logger.debug('%s KL=%s', score_type, kl_dict[score_type])

    # One copula per axis after the first, parameterised by that axis' KL.
    self.nested_copula_list = [
        copula.Copula(np.matrix([]),
                      'gumbel',
                      param=1 + self.param_a * np.log1p(kl_dict[score_type]),
                      dim=2)
        for score_type in self.score_type_list[1:]
    ]
def train(self,
          training_data_t: pd.DataFrame,
          training_data_f: pd.DataFrame,
          user_id=None):
    """Fit clustered pair models and three top-level copulas.

    Steps:

    1. For every entry of ``util.DEFAULT_SCORE_TYPE_LIST`` build a
       ``marginal.Norm`` from the matching column of ``training_data_t`` and
       evaluate ``util.kl_divergence_between_population_and_users``; the
       score type with the largest value becomes ``self.main_axis``.
    2. For every entry of ``self.score_type_list`` other than the main axis,
       cluster the training data on the (main axis, score type) pair with
       ``models.create_cluster`` and store the result of
       ``models.create_weight_and_scoring_model_list`` in
       ``self.copula_dict[score_type]``.
    3. For every training row and every such pair, accumulate the
       weight-scaled main-axis CDF, sub-axis CDF and copula CDF over all
       entries of that pair's model list.
    4. Unless ``self.indep_copulaed`` is truthy, fit ``self.top_copula1``,
       ``self.top_copula2`` and ``self.top_copula3`` on, respectively, the
       copula CDFs, copula CDF x sub CDF, and copula CDF x main CDF x sub
       CDF (rows: observations, columns: non-main score types).

    Side effects: resets and fills ``self.copula_dict``, ``self.marg_dict``
    and ``self.kl_dict`` (``'sum'`` holds the total KL minus the largest
    value); sets ``self.main_axis``.

    Args:
        training_data_t: Frame holding one column per score type.
        training_data_f: Not used by this implementation.
        user_id: Not used by this implementation.
    """
    # Local import: the file's import block is not guaranteed to provide it.
    import logging

    logger = logging.getLogger(__name__)

    self.copula_dict = {}
    self.marg_dict = {}
    self.kl_dict = {'sum': 0}

    max_kl_value = -1000
    for score_type in util.DEFAULT_SCORE_TYPE_LIST:
        marg = marginal.Norm(training_data_t[score_type])
        self.marg_dict[score_type] = marg
        kl = util.kl_divergence_between_population_and_users(
            marg, score_type)
        logger.debug('%s KL=%s', score_type, kl)
        self.kl_dict['sum'] += kl
        self.kl_dict[score_type] = kl
        if kl > max_kl_value:
            max_kl_value = kl
            self.main_axis = score_type
    logger.debug('main axis: %s', self.main_axis)
    # The main axis is excluded from the aggregated KL value.
    self.kl_dict['sum'] -= max_kl_value

    # Every axis that gets paired with the main axis, in configured order.
    sub_axes = [
        score_type for score_type in self.score_type_list
        if score_type != self.main_axis
    ]

    for score_type in sub_axes:
        target = [self.main_axis, score_type]
        clust = models.create_cluster(training_data_t, self.n_clusters,
                                      target)
        self.copula_dict[
            score_type] = models.create_weight_and_scoring_model_list(
                clust, self.marg, self.cop, target, [score_type], [])

    # One column of per-row values for each sub axis.
    cop_columns = [[] for _ in sub_axes]
    cop_sub_columns = [[] for _ in sub_axes]
    cop_main_sub_columns = [[] for _ in sub_axes]

    for _, row in training_data_t.iterrows():
        for column, score_type in enumerate(sub_axes):
            main_cdf = 0
            sub_cdf = 0
            cop_cdf = 0
            for weight_and_scoring_model in self.copula_dict[score_type]:
                weight = weight_and_scoring_model[0]
                score_model = weight_and_scoring_model[1]
                main_part = score_model[self.main_axis].cdf(
                    row[self.main_axis])
                sub_part = score_model[score_type].cdf(row[score_type])
                main_cdf += weight * main_part
                sub_cdf += weight * sub_part
                cop_cdf += score_model['copula'].cdf(
                    np.matrix([main_part, sub_part])) * weight
            cop_columns[column].append(cop_cdf)
            cop_sub_columns[column].append(cop_cdf * sub_cdf)
            cop_main_sub_columns[column].append(cop_cdf * main_cdf * sub_cdf)

    if not self.indep_copulaed:
        # The matrices are only needed when the top-level copulas are fit.
        self.top_copula1 = copula.Copula(
            np.matrix(cop_columns).T, self.cop)
        self.top_copula2 = copula.Copula(
            np.matrix(cop_sub_columns).T, self.cop)
        self.top_copula3 = copula.Copula(
            np.matrix(cop_main_sub_columns).T, self.cop)