def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True,
        feature_weighting="none", **similarity_args):
    """Compute and store the sparse similarity matrix derived from URM_train.

    The hyper-parameters ``topK`` and ``shrink`` are recorded on the instance.
    When ``feature_weighting`` is "BM25" or "TF-IDF", ``self.URM_train`` is
    cast to float32, re-weighted through its transpose and converted back to
    CSR; note that this overwrites ``self.URM_train`` in place. The
    similarity is then computed on ``self.URM_train.T`` and the result is
    stored, in CSR format, in ``self.W_sparse``.

    :param topK: forwarded to ``Compute_Similarity`` and stored on ``self``.
    :param shrink: forwarded to ``Compute_Similarity`` and stored on ``self``.
    :param similarity: similarity name forwarded to ``Compute_Similarity``.
    :param normalize: forwarded to ``Compute_Similarity``.
    :param feature_weighting: must be one of ``self.FEATURE_WEIGHTING_VALUES``.
    :param similarity_args: extra keyword arguments for ``Compute_Similarity``.
    :raises ValueError: if ``feature_weighting`` is not an accepted value.
    """
    self.topK = topK
    self.shrink = shrink

    accepted_weightings = self.FEATURE_WEIGHTING_VALUES
    if feature_weighting not in accepted_weightings:
        error_template = "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
        raise ValueError(error_template.format(accepted_weightings, feature_weighting))

    if feature_weighting in ("BM25", "TF-IDF"):
        # Both schemes share the same pipeline; only the weighting function differs.
        self.URM_train = self.URM_train.astype(np.float32)
        reweight = okapi_BM_25 if feature_weighting == "BM25" else TF_IDF
        self.URM_train = reweight(self.URM_train.T).T
        self.URM_train = check_matrix(self.URM_train, 'csr')

    similarity_engine = Compute_Similarity(
        self.URM_train.T,
        shrink=shrink,
        topK=topK,
        normalize=normalize,
        similarity=similarity,
        **similarity_args
    )

    self.W_sparse = similarity_engine.compute_similarity()
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True,
        feature_weighting="none", **similarity_args):
    """Compute the similarity matrix from the ICM and return it.

    ``topK`` and ``shrink`` are stored on the instance. When
    ``feature_weighting`` is "BM25" or "TF-IDF", ``self.ICM`` is cast to
    float32 and replaced by its re-weighted version (so calling ``fit`` again
    re-weights the already weighted matrix). The similarity is computed on
    ``self.ICM.T``.

    Depending on ``self.sparse_weights`` the result is stored either as
    ``self.W_sparse`` (as returned by ``compute_similarity``) or as
    ``self.W`` (densified with ``toarray()``).

    :param topK: forwarded to ``Compute_Similarity`` and stored on ``self``.
    :param shrink: forwarded to ``Compute_Similarity`` and stored on ``self``.
    :param similarity: similarity name forwarded to ``Compute_Similarity``.
    :param normalize: forwarded to ``Compute_Similarity``.
    :param feature_weighting: must be one of ``self.FEATURE_WEIGHTING_VALUES``.
    :param similarity_args: extra keyword arguments for ``Compute_Similarity``.
    :return: the matrix that was just computed: ``self.W_sparse`` in sparse
        mode, ``self.W`` in dense mode.
    :raises ValueError: if ``feature_weighting`` is not an accepted value.
    """
    self.topK = topK
    self.shrink = shrink

    if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

    if feature_weighting == "BM25":
        self.ICM = self.ICM.astype(np.float32)
        self.ICM = okapi_BM_25(self.ICM)
    elif feature_weighting == "TF-IDF":
        self.ICM = self.ICM.astype(np.float32)
        self.ICM = TF_IDF(self.ICM)

    similarity_object = Compute_Similarity(
        self.ICM.T,
        shrink=shrink,
        topK=topK,
        normalize=normalize,
        similarity=similarity,
        **similarity_args
    )

    if self.sparse_weights:
        self.W_sparse = similarity_object.compute_similarity()
        return self.W_sparse

    # Dense mode only populates self.W. The previous unconditional
    # `return self.W_sparse` would raise AttributeError here (or hand back a
    # stale matrix from an earlier sparse fit), so return what was computed.
    self.W = similarity_object.compute_similarity()
    self.W = self.W.toarray()
    return self.W
def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True,
        force_compute_sim=True, feature_weighting="none",
        feature_weighting_index=0, **similarity_args):
    """Compute the ICM-based similarity matrix, with an optional on-disk cache.

    The weighting scheme is chosen by ``feature_weighting_index`` as
    ``self.FEATURE_WEIGHTING_VALUES[feature_weighting_index]``; the
    ``feature_weighting`` argument is accepted for signature compatibility
    but its value is overridden by that lookup.

    When ``force_compute_sim`` is False, a pickle file under
    ``IntermediateComputations/ICB`` is looked up. Its name embeds
    ``len(self.URM_train.data)``, ``topK``, ``shrink`` and
    ``feature_weighting_index``. If the file exists and the stored ``topK``
    and ``shrink`` match, the cached matrix is assigned to ``self.W_sparse``
    and the method returns immediately. NOTE: ``similarity``, ``normalize``
    and ``similarity_args`` are not part of the cache key, so a cache entry
    produced with different values for those is reused as-is.

    Otherwise ``self.ICM`` is optionally re-weighted (BM25 / TF-IDF, in
    place on the instance), the similarity is computed on ``self.ICM.T`` and
    stored in ``self.W_sparse`` (and written to the cache) when
    ``self.sparse_weights`` is truthy, or densified into ``self.W``.

    :param topK: forwarded to ``Compute_Similarity`` and stored on ``self``.
    :param shrink: forwarded to ``Compute_Similarity`` and stored on ``self``.
    :param similarity: similarity name forwarded to ``Compute_Similarity``.
    :param normalize: forwarded to ``Compute_Similarity``.
    :param force_compute_sim: when True the cache is not consulted.
    :param feature_weighting: ignored, see above.
    :param feature_weighting_index: index into ``self.FEATURE_WEIGHTING_VALUES``.
    :param similarity_args: extra keyword arguments for ``Compute_Similarity``.
    :raises IndexError: if ``feature_weighting_index`` is out of range.
    """
    self.feature_weighting_index = feature_weighting_index
    # The scheme always comes from the list, so it is valid by construction;
    # a separate membership check would be unreachable.
    feature_weighting = self.FEATURE_WEIGHTING_VALUES[feature_weighting_index]

    self.topK = topK
    self.shrink = shrink

    cache_dir = os.path.join("IntermediateComputations", "ICB")

    def _cache_path():
        # Single source of truth for the cache file location (read and write).
        file_name = "tot={}_topK={}_shrink={}_featureweight={}.pkl".format(
            str(len(self.URM_train.data)),
            str(self.topK),
            str(self.shrink),
            str(self.feature_weighting_index))
        return os.path.join(cache_dir, file_name)

    if not force_compute_sim:
        cache_path = _cache_path()
        cached = None
        try:
            with open(cache_path, 'rb') as handle:
                # SECURITY NOTE: pickle.load executes arbitrary code from the
                # file; only use cache files produced by this program.
                cached = pickle.load(handle)
        except FileNotFoundError:
            # Report the path that was actually tried.
            print("File {} not found".format(cache_path))

        if cached is not None:
            (topK_cached, shrink_cached, W_sparse_cached) = cached
            if self.topK == topK_cached and self.shrink == shrink_cached:
                self.W_sparse = W_sparse_cached
                print("Saved CBF Similarity Matrix Used!")
                return

    if feature_weighting == "BM25":
        self.ICM = self.ICM.astype(np.float32)
        self.ICM = okapi_BM_25(self.ICM)
    elif feature_weighting == "TF-IDF":
        self.ICM = self.ICM.astype(np.float32)
        self.ICM = TF_IDF(self.ICM)

    similarity_object = Compute_Similarity(
        self.ICM.T,
        shrink=shrink,
        topK=topK,
        normalize=normalize,
        similarity=similarity,
        **similarity_args
    )

    if self.sparse_weights:
        self.W_sparse = similarity_object.compute_similarity()

        # Make sure the cache directory exists so the (expensive) result is
        # not lost to a FileNotFoundError on the first run.
        os.makedirs(cache_dir, exist_ok=True)
        with open(_cache_path(), 'wb') as handle:
            pickle.dump((self.topK, self.shrink, self.W_sparse),
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)
        print("CBF similarity matrix saved")
    else:
        self.W = similarity_object.compute_similarity()
        self.W = self.W.toarray()
def fit(self, validation_every_n=5, show_max_performance=False, logFile=None,
        precompute_common_features=True, learning_rate=0.01,
        positive_only_weights=True, init_type="zero",
        normalize_similarity=False, use_dropout=True, dropout_perc=0.3,
        l1_reg=0.0, l2_reg=0.0, epochs=50, topK=300, add_zeros_quota=0.0,
        sgd_mode='adagrad', gamma=0.9, beta_1=0.9, beta_2=0.999,
        stop_on_validation=False, lower_validatons_allowed=5,
        validation_metric="MAP", evaluator_object=None):
    """Train the feature weights with the Cython SGD solver and build W_sparse.

    Steps, in order: validate ``init_type``; import the compiled solver;
    copy the hyper-parameters onto the instance; generate the training data
    via ``generateTrainData_low_ram``; build the initial weight vector
    (length ``self.n_features``, float64); create the solver; run
    ``_train_with_early_stopping``; call ``compute_W_sparse``.

    ``init_type`` controls the initial weights: "random" draws from a normal
    distribution (mean 0.001, std 0.1), "zero" uses zeros, and "one", "BM25"
    and "TF-IDF" use ones. "BM25" and "TF-IDF" additionally cast
    ``self.ICM`` to float32 and replace it with its re-weighted version.

    ``self.validation_every_n`` is set to ``np.inf`` when the argument is
    None, but the argument itself (not the stored value) is what gets
    forwarded to ``_train_with_early_stopping``.

    :raises ValueError: if ``init_type`` is not in ``self.INIT_TYPE_VALUES``
        or is not handled by the initialisation branches.
    """
    recognised_init_types = self.INIT_TYPE_VALUES
    if init_type not in recognised_init_types:
        raise ValueError(
            "Value for 'init_type' not recognized. Acceptable values are {}, provided was '{}'"
            .format(recognised_init_types, init_type))

    # Import compiled module
    from FW_Similarity.Cython.CFW_D_Similarity_Cython_SGD import CFW_D_Similarity_Cython_SGD

    self.logFile = logFile
    self.validation_every_n = np.inf if validation_every_n is None else validation_every_n

    # Mirror the remaining hyper-parameters on the instance.
    for attribute_name, attribute_value in (
            ("evaluator_object", evaluator_object),
            ("show_max_performance", show_max_performance),
            ("positive_only_weights", positive_only_weights),
            ("normalize_similarity", normalize_similarity),
            ("learning_rate", learning_rate),
            ("add_zeros_quota", add_zeros_quota),
            ("l1_reg", l1_reg),
            ("l2_reg", l2_reg),
            ("epochs", epochs),
            ("topK", topK)):
        setattr(self, attribute_name, attribute_value)

    self.generateTrainData_low_ram()

    if init_type == "random":
        weights_initialization = np.random.normal(
            0.001, 0.1, self.n_features).astype(np.float64)
    elif init_type == "zero":
        weights_initialization = np.zeros(self.n_features, dtype=np.float64)
    elif init_type in ("one", "BM25", "TF-IDF"):
        weights_initialization = np.ones(self.n_features, dtype=np.float64)
        if init_type != "one":
            # The weighting schemes start from unit weights on a re-weighted ICM.
            self.ICM = self.ICM.astype(np.float32)
            icm_weighting = okapi_BM_25 if init_type == "BM25" else TF_IDF
            self.ICM = icm_weighting(self.ICM)
    else:
        raise ValueError(
            "CFW_D_Similarity_Cython: 'init_type' not recognized")

    # Instantiate fast Cython implementation
    solver_options = dict(
        precompute_common_features=precompute_common_features,
        non_negative_weights=self.positive_only_weights,
        weights_initialization=weights_initialization,
        use_dropout=use_dropout,
        dropout_perc=dropout_perc,
        learning_rate=learning_rate,
        l1_reg=l1_reg,
        l2_reg=l2_reg,
        sgd_mode=sgd_mode,
        gamma=gamma,
        beta_1=beta_1,
        beta_2=beta_2,
    )
    self.FW_D_Similarity = CFW_D_Similarity_Cython_SGD(
        self.row_list, self.col_list, self.data_list,
        self.n_features, self.ICM,
        **solver_options)

    print(self.RECOMMENDER_NAME + ": Initialization completed")

    self._train_with_early_stopping(
        epochs,
        validation_every_n,
        stop_on_validation,
        validation_metric,
        lower_validatons_allowed,
        evaluator_object,
        algorithm_name=self.RECOMMENDER_NAME)

    self.compute_W_sparse()

    sys.stdout.flush()
def fit(self, show_max_performance=False, precompute_common_features=False,
        learning_rate=0.1, positive_only_D=True,
        initialization_mode_D="random", normalize_similarity=False,
        use_dropout=True, dropout_perc=0.3, l1_reg=0.0, l2_reg=0.0,
        epochs=50, topK=300, add_zeros_quota=0.0, log_file=None,
        verbose=False, sgd_mode='adagrad', gamma=0.9, beta_1=0.9,
        beta_2=0.999, **earlystopping_kwargs):
    """Train the D feature weights with the Cython SGD solver and build W_sparse.

    Steps, in order: validate ``initialization_mode_D``; import the compiled
    solver; copy the hyper-parameters onto the instance; generate the
    training data via ``_generate_train_data``; build the initial weight
    vector (length ``self.n_features``, float64); create the solver; seed
    ``self.D_incremental`` with the solver's weights and ``self.D_best`` with
    a copy of them; run ``_train_with_early_stopping``; finally call
    ``compute_W_sparse(model_to_use="best")``.

    ``initialization_mode_D`` controls the initial weights: "random" draws
    from a normal distribution (mean 0.001, std 0.1), "zero" uses zeros, and
    "one", "BM25" and "TF-IDF" use ones. "BM25" and "TF-IDF" additionally
    cast ``self.ICM`` to float32 and replace it with its re-weighted version.

    :param earlystopping_kwargs: forwarded unchanged to
        ``_train_with_early_stopping``.
    :raises ValueError: if ``initialization_mode_D`` is not in
        ``self.INIT_TYPE_VALUES`` or is not handled by the initialisation
        branches.
    """
    recognised_modes = self.INIT_TYPE_VALUES
    if initialization_mode_D not in recognised_modes:
        raise ValueError(
            "Value for 'initialization_mode_D' not recognized. Acceptable values are {}, provided was '{}'"
            .format(recognised_modes, initialization_mode_D))

    # Import compiled module
    from FeatureWeighting.Cython.CFW_D_Similarity_Cython_SGD import CFW_D_Similarity_Cython_SGD

    # Mirror the hyper-parameters on the instance.
    for attribute_name, attribute_value in (
            ("show_max_performance", show_max_performance),
            ("normalize_similarity", normalize_similarity),
            ("learning_rate", learning_rate),
            ("add_zeros_quota", add_zeros_quota),
            ("l1_reg", l1_reg),
            ("l2_reg", l2_reg),
            ("epochs", epochs),
            ("topK", topK),
            ("log_file", log_file),
            ("verbose", verbose)):
        setattr(self, attribute_name, attribute_value)

    self._generate_train_data()

    if initialization_mode_D == "random":
        weights_initialization_D = np.random.normal(
            0.001, 0.1, self.n_features).astype(np.float64)
    elif initialization_mode_D == "zero":
        weights_initialization_D = np.zeros(self.n_features, dtype=np.float64)
    elif initialization_mode_D in ("one", "BM25", "TF-IDF"):
        weights_initialization_D = np.ones(self.n_features, dtype=np.float64)
        if initialization_mode_D != "one":
            # The weighting schemes start from unit weights on a re-weighted ICM.
            self.ICM = self.ICM.astype(np.float32)
            icm_weighting = okapi_BM_25 if initialization_mode_D == "BM25" else TF_IDF
            self.ICM = icm_weighting(self.ICM)
    else:
        raise ValueError(
            "CFW_D_Similarity_Cython: 'init_type' not recognized")

    # Instantiate fast Cython implementation
    solver_options = dict(
        precompute_common_features=precompute_common_features,
        positive_only_D=positive_only_D,
        weights_initialization_D=weights_initialization_D,
        use_dropout=use_dropout,
        dropout_perc=dropout_perc,
        learning_rate=learning_rate,
        l1_reg=l1_reg,
        l2_reg=l2_reg,
        sgd_mode=sgd_mode,
        verbose=self.verbose,
        gamma=gamma,
        beta_1=beta_1,
        beta_2=beta_2,
    )
    self.FW_D_Similarity = CFW_D_Similarity_Cython_SGD(
        self.row_list, self.col_list, self.data_list,
        self.n_features, self.ICM,
        **solver_options)

    if self.verbose:
        print(self.RECOMMENDER_NAME + ": Initialization completed")

    # Start both the running and the best-so-far weights from the solver's state.
    initial_weights = self.FW_D_Similarity.get_weights()
    self.D_incremental = initial_weights
    self.D_best = self.D_incremental.copy()

    self._train_with_early_stopping(
        epochs,
        algorithm_name=self.RECOMMENDER_NAME,
        **earlystopping_kwargs)

    self.compute_W_sparse(model_to_use="best")

    sys.stdout.flush()