def kernel(input1, ker, arg, input2=None):
    if input2 is None:
        input1 = input1.T
        if ker == 'linear':
            K = kernels.linear_kernel(input1)
        # polynomial is not implemented; its parameters seem to differ.
        # if ker == 'poly':
        #     K = kernels.polynomial_kernel(input1, ...)
        if ker == 'rbf':
            gamma = 0.5 / (arg * arg)
            K = kernels.rbf_kernel(input1, gamma=gamma)
        if ker == 'sigmoid':
            K = kernels.sigmoid_kernel(input1, gamma=arg[0], coef0=arg[1])
        return K
    else:
        input1 = input1.T
        input2 = input2.T
        if ker == 'linear':
            K = kernels.linear_kernel(input1, input2)
        # polynomial is not implemented; its parameters seem to differ.
        # if ker == 'poly':
        #     K = kernels.polynomial_kernel(input1, input2, ...)
        if ker == 'rbf':
            gamma = 0.5 / (arg * arg)
            K = kernels.rbf_kernel(input1, input2, gamma=gamma)
        if ker == 'sigmoid':
            K = kernels.sigmoid_kernel(input1, input2, gamma=arg[0], coef0=arg[1])
        return K
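# A minimal usage sketch for kernel() above -- an assumption, not part of the
# original source: it assumes `kernels` is sklearn.metrics.pairwise and that
# the inputs are laid out (features x samples), since the function transposes
# them before computing the kernel. X_tr and X_te are hypothetical names.
import numpy as np
from sklearn.metrics import pairwise as kernels

X_tr = np.random.rand(5, 20)              # 5 features x 20 training samples
X_te = np.random.rand(5, 8)               # 5 features x 8 test samples
K_train = kernel(X_tr, 'rbf', 2.0)        # (20, 20); gamma = 0.5 / sigma**2
K_cross = kernel(X_tr, 'rbf', 2.0, X_te)  # (20, 8) train-vs-test kernel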
def kernelMatrix(self, X, y=None):
    if self.K_type == 'linear':
        """
        if y is not None:
            if self.mu is None:
                # TODO: change to a classification model and allow choosing
                # between regression and classification.
                reg = Lasso(self.param)
                self_mu = reg.fit(X, y).coef_
                self.Xtr = self.Xtr[:, np.where(self_mu != 0)]
                self.X = self.X[:, np.where(self_mu != 0)]
        """
        if self.normalize:
            self.K = normalize(linear_kernel(X, self.Xtr))
        else:
            self.K = linear_kernel(X, self.Xtr)
        return self.K
    if self.K_type == 'polynomial':
        if self.normalize:
            self.K = normalize(
                polynomial_kernel(X, self.Xtr, degree=self.param))
        else:
            self.K = polynomial_kernel(X, self.Xtr, degree=self.param)
        return self.K
    if self.K_type == 'gaussian':
        if self.normalize:
            self.K = normalize(rbf_kernel(X, self.Xtr, gamma=self.param))
        else:
            self.K = rbf_kernel(X, self.Xtr, gamma=self.param)
        return self.K
    if self.K_type == 'laplacian':
        if self.normalize:
            self.K = normalize(
                laplacian_kernel(X, self.Xtr, gamma=self.param))
        else:
            self.K = laplacian_kernel(X, self.Xtr, gamma=self.param)
        return self.K
    if self.K_type == 'sigmoid':
        if self.normalize:
            self.K = normalize(
                sigmoid_kernel(X, self.Xtr, gamma=self.param))
        else:
            self.K = sigmoid_kernel(X, self.Xtr, gamma=self.param)
        return self.K
def recommend(search_word):
    movie_df = pre_process()
    tfv = vectorizer(min_df=3,
                     max_features=None,
                     strip_accents='unicode',
                     analyzer='word',
                     token_pattern=r'\w{1,}',
                     ngram_range=(1, 3),
                     stop_words='english')
    tfv_matrix = tfv.fit_transform(movie_df['bow'])
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    index = pd.Series(movie_df.index,
                      index=movie_df['original_title']).drop_duplicates()
    try:
        idx = index[search_word]
        sig_scores = list(enumerate(sig[idx]))
        sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
        # Skip position 0 (the searched movie itself) and keep the next 14.
        sig_scores = sig_scores[1:15]
        movie_indices = [i[0] for i in sig_scores]
        return list(movie_df['original_title'].iloc[movie_indices])
    except KeyError:
        return None
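# The recommender above follows a pattern repeated throughout this section:
# vectorize text with TF-IDF, score every document pair with sigmoid_kernel
# (tanh(gamma * <x_i, x_j> + coef0)), then rank rows of the resulting matrix.
# A minimal self-contained sketch of that shared pattern (illustrative data):
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

docs = ["space opera with robots", "robots in space", "romantic comedy"]
matrix = TfidfVectorizer(stop_words='english').fit_transform(docs)
scores = sigmoid_kernel(matrix, matrix)[0]   # similarities to docs[0]
ranked = sorted(range(len(docs)), key=lambda i: scores[i], reverse=True)
print([docs[i] for i in ranked[1:]])         # most similar first, self excluded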
def product_recommendation(title):
    tfidf_v = TfidfVectorizer(
        max_features=None,
        strip_accents="unicode",
        analyzer="word",
        min_df=10,
        token_pattern=r"\w{1,}",
        ngram_range=(1, 3),  # use combinations of 1-3 consecutive words
        stop_words="english")
    products["description"] = products["description"].fillna("")
    products["product_name"] = products["product_name"].str.lower()
    tfidf_matrix = tfidf_v.fit_transform(products["description"])
    sig = sigmoid_kernel(tfidf_matrix, tfidf_matrix)
    indices = pd.Series(products.index,
                        index=products["product_name"]).drop_duplicates()
    index = indices.get(title.lower())
    if index is not None:
        sorted_sig_scores = list(enumerate(sig[index]))
        sorted_sig_scores = sorted(sorted_sig_scores,
                                   key=lambda item: item[1],
                                   reverse=True)
        # Skip position 0 (the queried product itself) and take the next 10.
        top_10_products = [sorted_sig_scores[i][0] for i in range(1, 11)]
        return products["product_name"].iloc[top_10_products].unique()
    # Title not found: return an empty result instead of [None].
    return []
def reset_vec(self, kernel='rbf_ap'):
    feat = preprocessing.scale(self.fake_features)
    count = feat.shape[1]
    if kernel == 'rbf':
        temp = rbf_kernel(feat, gamma=1.0 / count).sum(axis=0)
    elif kernel == 'cos':
        temp = ((cosine_similarity(feat) + 1) / 2.0).sum(axis=0)
    elif kernel == 'euc':
        temp = (1.0 / (euclidean_distances(feat) + 1)).sum(axis=0)
    elif kernel == 'sigmoid':
        Sig = sigmoid_kernel(feat, coef0=0, gamma=1.0 / count)
        temp = ((Sig + 1.0) / 2.0).sum(axis=0)
    elif kernel == 'rbf_ap':
        # Taylor-approximated RBF column sums without forming the full kernel.
        gamma = 1.0 / count
        expVec = np.exp(-gamma * np.einsum("ij, ij -> i", feat, feat))
        feaVec = np.einsum("i, ij -> j", expVec, feat) * (2.0 * gamma)
        outMat = np.einsum("i,ij,ik->jk", expVec, feat, feat)
        outMat *= (2.0 * gamma**2)
        first = expVec * np.sum(expVec)
        second = np.einsum("i, j, ij -> i", expVec, feaVec, feat)
        third = np.einsum("i, jk, ij, ik -> i", expVec, outMat, feat, feat)
        temp = first + second + third
    return temp / np.sum(temp)
def _apply_kernel(self, x, y):
    """Apply the selected kernel function to the data."""
    if self.kernel == 'linear':
        phi = linear_kernel(x, y)
    elif self.kernel == 'rbf':
        phi = rbf_kernel(x, y, self.coef1)
    elif self.kernel == 'poly':
        phi = polynomial_kernel(x, y, self.degree, self.coef1, self.coef0)
    elif self.kernel == 'sigmoid':
        coef0 = self.coef0 if self.coef0 is not None else 1
        phi = sigmoid_kernel(x, y, self.gamma, coef0)
    elif self.kernel == 'chi2':
        gamma = self.gamma if self.gamma is not None else 1
        phi = chi2_kernel(x, y, gamma)  # use the defaulted value, not self.gamma
    elif self.kernel == 'laplacian':
        phi = laplacian_kernel(x, y, self.gamma)
    elif callable(self.kernel):
        phi = self.kernel(x, y)
        if len(phi.shape) != 2:
            raise ValueError(
                "Custom kernel function did not return 2D matrix")
        if phi.shape[0] != x.shape[0]:
            raise ValueError(
                "Custom kernel function did not return matrix with rows"
                " equal to number of data points.")
    else:
        raise ValueError("Kernel selection is invalid.")
    if self.bias_used:
        phi = np.append(phi, np.ones((phi.shape[0], 1)), axis=1)
    return phi
def tf_sig(tfidf_matrix):
    print("Using TFIDF with sigmoid kernel")
    Ke = sigmoid_kernel(tfidf_matrix[0:1], tfidf_matrix)
    K = Ke[0]
    top = np.argsort(K)[-11:]
    for i in range(10):
        print(10 - i, Total[top[9 - i] - 1])
def _get_kernel_matrix(self, X1, X2):
    # K is a len(X1)-by-len(X2) matrix.
    if self._kernel == 'rbf':
        K = pairwise.rbf_kernel(X1, X2, gamma=self._gamma)
    elif self._kernel == 'poly':
        K = pairwise.polynomial_kernel(X1, X2,
                                       degree=self._degree,
                                       gamma=self._gamma,
                                       coef0=self._coef0)
    elif self._kernel == 'linear':
        K = pairwise.linear_kernel(X1, X2)
    elif self._kernel == 'laplacian':
        K = pairwise.laplacian_kernel(X1, X2, gamma=self._gamma)
    elif self._kernel == 'chi2':
        K = pairwise.chi2_kernel(X1, X2, gamma=self._gamma)
    elif self._kernel == 'additive_chi2':
        K = pairwise.additive_chi2_kernel(X1, X2)
    elif self._kernel == 'sigmoid':
        K = pairwise.sigmoid_kernel(X1, X2,
                                    gamma=self._gamma,
                                    coef0=self._coef0)
    else:
        print('[Error] Unknown kernel')
        K = None
    return K
def hsh_sig(hash_matrix):
    print("Using Hashing with sigmoid kernel")
    Ke = sigmoid_kernel(hash_matrix[0:1], hash_matrix)
    K = Ke[0]
    top = np.argsort(K)[-11:]
    for i in range(10):
        print(10 - i, Total[top[9 - i] - 1])
def cal_km(params, X_fit, X, type):
    # Note: `type` shadows the built-in; kept for API compatibility.
    if type == 'interface':
        if params['kernel'] == 'linear':
            km = linear_kernel(X_fit, X)
        elif params['kernel'] == 'rbf':
            km = rbf_kernel(X_fit, X, gamma=params['gamma'])
        elif params['kernel'] == 'poly':
            km = polynomial_kernel(X_fit, X, gamma=params['gamma'], coef0=0.0)
        elif params['kernel'] == 'sigmoid':
            km = sigmoid_kernel(X_fit, X, gamma=params['gamma'], coef0=0.0)
        else:
            print('Unknown kernel')
            km = None
    elif type == 'realize':
        if params['kernel'] == 'linear':
            km = cal_linear(X_fit, X)
        elif params['kernel'] == 'rbf':
            km = cal_rbf(X_fit, X, gamma=params['gamma'])
        elif params['kernel'] == 'poly':
            km = cal_poly(X_fit, X, gamma=params['gamma'])
        elif params['kernel'] == 'sigmoid':
            km = cal_sigmoid(X_fit, X, gamma=params['gamma'])
        else:
            print('Unknown kernel')
            km = None
    else:
        print('Unknown type')
        km = None
    return km
def cnt_sig(count_matrix):
    print("Using Count with sigmoid kernel")
    Ke = sigmoid_kernel(count_matrix[0:1], count_matrix)
    K = Ke[0]
    top = np.argsort(K)[-11:]
    for i in range(10):
        print(10 - i, Total[top[9 - i] - 1])
def margin_kernel(self, X1, kernel_type='linear', gamma=1.0):
    """
    Forms the kernel matrix using the samples X1.

    Parameters
    ----------
    X1 : np.ndarray
        Data of shape (n_samples, n_features) from which to form a kernel
        of shape (n_samples, n_samples).
    kernel_type : str
        Type of kernel to be used.
    gamma : float
        Kernel parameter.

    Returns
    -------
    X : np.ndarray
        The kernel of shape (n_samples, n_samples).
    """
    if kernel_type == 'linear':
        X = linear_kernel(X1, X1)
    elif kernel_type == 'rbf':
        X = rbf_kernel(X1, X1, gamma)
    elif kernel_type == 'tanh':
        X = sigmoid_kernel(X1, X1, -gamma)
    elif kernel_type == 'sin':
        # X = np.sin(gamma * manhattan_distances(X1, X1))
        X = np.sin(gamma * pairwise_distances(X1, X1) ** 2)
    elif kernel_type == 'TL1':
        X = np.maximum(0, gamma - manhattan_distances(X1, X1))
    else:
        print('no kernel_type, returning None')
        return None
    return X
def kernel_mean_matching(self, X, Z, kern='lin', B=1.0, eps=None):
    nx = X.shape[0]
    nz = Z.shape[0]
    print("nx: ", nx, " nz: ", nz)
    if eps is None:
        eps = B / math.sqrt(nz)
    if kern == 'lin':
        K = np.dot(Z, Z.T)
        K = K.todense()  # Z is assumed to be sparse here
        kappa = np.sum(np.dot(Z, X.T) * float(nz) / float(nx), axis=1)
    elif kern == 'rbf':
        K = sk.rbf_kernel(Z, Z)
        kappa = np.sum(sk.rbf_kernel(Z, X), axis=1) * float(nz) / float(nx)
    elif kern == 'poly':
        K = sk.polynomial_kernel(Z, Z)
        kappa = np.sum(sk.polynomial_kernel(Z, X), axis=1) * float(nz) / float(nx)
    elif kern == 'laplacian':
        K = sk.laplacian_kernel(Z, Z)
        kappa = np.sum(sk.laplacian_kernel(Z, X), axis=1) * float(nz) / float(nx)
    elif kern == 'sigmoid':
        K = sk.sigmoid_kernel(Z, Z)
        kappa = np.sum(sk.sigmoid_kernel(Z, X), axis=1) * float(nz) / float(nx)
    else:
        raise ValueError('unknown kernel')
    K = K.astype(np.double)
    K = matrix(K)
    kappa = matrix(kappa)
    G = matrix(np.r_[np.ones((1, nz)), -np.ones((1, nz)),
                     np.eye(nz), -np.eye(nz)])
    h = matrix(np.r_[nz * (1 + eps), nz * (eps - 1),
                     B * np.ones((nz,)), np.zeros((nz,))])
    print("starting solver")
    solvers.options['show_progress'] = False
    sol = solvers.qp(K, -kappa, G, h)
    print(sol)
    coef = np.array(sol['x'])
    return coef
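# A note on the QP above (not part of the original source): this appears to
# be Kernel Mean Matching (Huang et al., 2006). cvxopt's qp(P, q, G, h)
# minimizes (1/2) w^T P w + q^T w subject to G w <= h, so with P = K and
# q = -kappa the program solved is
#   minimize_w  (1/2) w^T K w - kappa^T w
#   subject to  |sum_i w_i - nz| <= nz * eps  and  0 <= w_i <= B,
# which is exactly what the four blocks of G and h encode. The resulting
# weights w reweight the samples in Z so that their kernel mean matches
# the kernel mean of X.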
def rec_hotel(amenities, city):
    my_dataframe = pd.DataFrame({
        'Rating': travel_df['Rating'],
        'Amenities': travel_df['Amenities'],
        'Hotel Names': travel_df['Hotel Names'],
        'City': travel_df['City'],
        'Address': travel_df['Address']
    })
    # Placeholder row carrying the query amenities, under the dummy name 'abc'.
    df = pd.DataFrame({
        "Rating": 2,
        "Amenities": amenities,
        "Hotel Names": ['abc'],
        "City": [city]
    })
    my_dataframe = my_dataframe.append(df, ignore_index=True)
    my_dataframe = my_dataframe[my_dataframe['City'] == city]
    my_dataframe.reset_index(inplace=True)
    tfv = TfidfVectorizer(min_df=3,
                          max_features=None,
                          strip_accents='unicode',
                          analyzer='word',
                          token_pattern=r'\w{1,}',
                          ngram_range=(1, 3),
                          stop_words='english')
    # Filling NaNs with empty string
    my_dataframe['Amenities'] = my_dataframe['Amenities'].fillna('')
    # Fitting the TF-IDF on the 'Amenities' text
    tfv_matrix = tfv.fit_transform(my_dataframe['Amenities'])
    # Compute the sigmoid kernel
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    my_ratings = np.array(
        my_dataframe[my_dataframe['City'] == city]['Rating']) / 5
    indices = pd.Series(
        my_dataframe.index,
        index=my_dataframe['Hotel Names']).drop_duplicates()
    idx = indices['abc']
    # Alternative weighting (unused): l = np.add(sig[idx] * 0.5, my_ratings)
    # Get the pairwise similarity scores
    sig_scores = list(enumerate(sig[idx]))
    # Sort the hotels
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
    # Scores of the 5 most similar hotels
    sig_scores = sig_scores[2:7]
    # Hotel indices
    hotel_indices = [i[0] for i in sig_scores]
    # Drop the placeholder query row before returning.
    my_dataframe = my_dataframe.iloc[:-1, :]
    return my_dataframe[['Hotel Names', 'Address', 'Rating']].iloc[hotel_indices]
def abc():
    data = pd.read_csv('final.csv')
    tfv = TfidfVectorizer(min_df=3,
                          max_features=None,
                          strip_accents='unicode',
                          analyzer='word',
                          token_pattern=r'\w{1,}',
                          ngram_range=(1, 3),
                          stop_words='english')
    tfv_matrix = tfv.fit_transform(data['DESCRIPTION'])
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    indices = pd.Series(data.index, index=data['TRACK NAME']).drop_duplicates()
    return data, sig, indices
def ResetProbVec(self, kernel='rbf_ap'):
    """
    Calculate the reset probability vector with the assigned kernel.

    rbf:     Radial basis function
    cos:     (cosine similarity + 1) / 2.0
    euc:     1.0 / (1 + euclidean distances)
    sigmoid: (tanh(gamma <X_i, X_j>) + 1) / 2.0
    rbf_ap:  Taylor-expansion approximated radial basis function
    """
    if kernel == 'rbf':
        RBF = rbf_kernel(self.featMat, gamma=1.0 / self.featCount)
        RBF = RBF.sum(axis=0)
        resetProbVec = RBF / np.sum(RBF)
    elif kernel == 'cos':
        Cos = (cosine_similarity(self.featMat) + 1) / 2.0
        Cos = Cos.sum(axis=0)
        resetProbVec = Cos / np.sum(Cos)
    elif kernel == 'euc':
        Euc = 1.0 / (euclidean_distances(self.featMat) + 1)
        Euc = Euc.sum(axis=0)
        resetProbVec = Euc / np.sum(Euc)
    elif kernel == 'sigmoid':
        gamma = 1.0 / self.featCount
        Sig = sigmoid_kernel(self.featMat, coef0=0, gamma=gamma)
        Sig = (Sig + 1.0) / 2.0
        Sig = Sig.sum(axis=0)
        resetProbVec = Sig / np.sum(Sig)
    elif kernel == 'rbf_ap':
        parameter = 1.0 / self.featCount                      # w
        lengths = np.einsum("ij, ij -> i", self.featMat, self.featMat)
        expNormVector = np.exp(-parameter * lengths)          # y
        f_normVec = np.einsum("i, ij -> j", expNormVector, self.featMat)
        featureNormVector = f_normVec * (2.0 * parameter)     # Z
        outerMat = np.einsum("i, ij, ik -> jk",
                             expNormVector, self.featMat, self.featMat)
        featureOuterNorm = outerMat * (2.0 * parameter**2)    # r'
        first = expNormVector * np.sum(expNormVector)
        second = np.einsum("i, j, ij -> i",
                           expNormVector, featureNormVector, self.featMat)
        third = np.einsum("i, jk, ij, ik -> i",
                          expNormVector, featureOuterNorm,
                          self.featMat, self.featMat)
        resetProbVec = first + second + third                 # r
        resetProbVec /= np.sum(resetProbVec)
    self.resetProbVec = resetProbVec
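# A sketch of the algebra behind the 'rbf_ap' branch above (not part of the
# original source): with g = gamma, the RBF kernel factors as
#   K_ij = exp(-g*||x_i - x_j||^2)
#        = exp(-g*||x_i||^2) * exp(-g*||x_j||^2) * exp(2g*<x_i, x_j>),
# and the last factor is expanded to second order,
#   exp(2g*<x_i, x_j>) ~= 1 + 2g*<x_i, x_j> + 2g^2*<x_i, x_j>^2.
# Writing y_i = exp(-g*||x_i||^2), the row sums of the approximated kernel
# split into exactly the three terms computed as `first`, `second`, and
# `third`, so the n x n kernel matrix never has to be materialized.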
def kernel_func(X1, X2, kernel_name, gamma, d, r):
    if kernel_name == 'rbf':
        return rbf_kernel(X1, X2, gamma=gamma)
    elif kernel_name == 'polynomial':
        return polynomial_kernel(X1, X2, gamma=gamma, degree=d, coef0=r)
    elif kernel_name == 'sigmoid':
        return sigmoid_kernel(X1, X2, gamma=gamma, coef0=r)
    elif kernel_name == 'linear':
        return linear_kernel(X1, X2)
    else:
        raise NotImplementedError
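# A minimal usage sketch for kernel_func (an illustration, not part of the
# original source); it assumes the sklearn pairwise kernels are imported and
# uses random arrays purely as stand-ins.
import numpy as np
from sklearn.metrics.pairwise import (rbf_kernel, polynomial_kernel,
                                      sigmoid_kernel, linear_kernel)

A = np.random.rand(10, 4)
B = np.random.rand(6, 4)
K = kernel_func(A, B, 'sigmoid', gamma=0.25, d=3, r=1.0)  # shape (10, 6)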
def gen_similarity(args, X):
    if args.sim_method == 'sigmoid_kernel':
        sim_UXU = sigmoid_kernel(X=X, Y=None, gamma=None, coef0=1)
        sim_MXM = sigmoid_kernel(X=X.T, Y=None, gamma=None, coef0=1)
    elif args.sim_method == 'cosine_similarity':
        sim_UXU = cosine_similarity(X=X, Y=None)
        sim_MXM = cosine_similarity(X=X.T, Y=None)

    # =====================================================================
    # Save the similarity matrices (written with pickle, despite the .npy
    # file extension).
    fn_str = args.RESULTPATH + 'sim_%s_UXU.npy' % (args.sim_method)
    with open(fn_str, 'wb') as f:
        pickle.dump(sim_UXU, f)
    fn_str = args.RESULTPATH + 'sim_%s_MXM.npy' % (args.sim_method)
    with open(fn_str, 'wb') as f:
        pickle.dump(sim_MXM, f)
    print('saving similarity matrices is done!')
    # =====================================================================
    return sim_UXU, sim_MXM
def calc_gaussian_sim(data_matrix, method):
    if method == "rbf":
        return rbf_kernel(data_matrix)
    elif method == "chi2":
        return chi2_kernel(data_matrix)
    elif method == "laplacian":
        return laplacian_kernel(data_matrix)
    elif method == "sigmoid":
        return sigmoid_kernel(data_matrix)
    else:
        raise ValueError("Wrong method parameter in calc_gaussian_sim()")
def transform(self, X, Y):
    if self.type == 'rbf':
        return rbf_kernel(X, Y, self.gamma)[0]
    elif self.type == 'Chi2':
        return chi2_kernel(X, Y, self.gamma)[0]
    elif self.type == 'AChi2':
        return -additive_chi2_kernel(X, Y)[0]
    elif self.type == 'laplacian':
        return laplacian_kernel(X, Y, self.gamma)[0]
    elif self.type == 'sigmoid':
        return sigmoid_kernel(X, Y, self.gamma, self.coef0)[0]
    else:
        # Fail loudly instead of silently returning None.
        raise ValueError("Unknown kernel type: " + str(self.type))
def calculate_gram_matrix(x, kernel='linear', gamma=0, degree=0, coef0=0):
    if kernel == 'linear':
        gram = linear_kernel(x, x)
    elif kernel == 'poly':
        gram = polynomial_kernel(x, x, degree=degree, gamma=gamma, coef0=coef0)
    elif kernel == 'sigmoid':
        gram = sigmoid_kernel(x, x, gamma=gamma, coef0=coef0)
    elif kernel == 'rbf':
        gram = rbf_kernel(x, x, gamma=gamma)
    else:
        raise ValueError("Unsupported kernel: " + str(kernel))
    return gram
def _apply_kernel(self, X, Y):
    if self.kernel == "rbf":
        return rbf_kernel(X, Y, self.gamma)
    elif self.kernel == "sigmoid":
        return sigmoid_kernel(X, Y, self.gamma, self.coef0)
    elif self.kernel == "poly":
        return polynomial_kernel(X, Y, self.degree, self.gamma, self.coef0)
    elif self.kernel == "linear":
        return linear_kernel(X, Y)
    elif callable(self.kernel):
        return self.kernel(X, Y)
    else:
        raise ValueError("Unknown kernel: " + str(self.kernel))
def preprocessing():
    # Model definition with the feature declaration
    tfv = TfidfVectorizer(min_df=3,
                          max_features=None,
                          strip_accents='unicode',
                          analyzer='word',
                          token_pattern=r'\w{1,}',
                          ngram_range=(1, 5),
                          stop_words='english')
    # Filling NaNs with empty string
    js['ArticleTitle'] = js['ArticleTitle'].fillna('')
    tfv_matrix = tfv.fit_transform(js['ArticleTitle'])
    # Compute the sigmoid kernel
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    # Generate the indices for the recommender system, removing duplicates
    indices = pd.Series(js.index, index=js['ArticleFullPath']).drop_duplicates()
    # Return the indices and the sigmoid kernel matrix
    return indices, sig
def call_recommend(m):
    m = m.lower()
    movie = pd.read_csv('movie_data_final.csv')
    # Check whether the movie is in our database
    if m not in movie['original_title'].unique():
        return ('This movie is not in our database.\n'
                'Please check if you spelled it correctly.')
    else:
        ## Content-Based Recommendation System
        ### Using the TF-IDF vectorizer to build the vectorization matrix
        tf = TfidfVectorizer(min_df=3,
                             max_features=None,
                             strip_accents='unicode',
                             analyzer='word',
                             token_pattern=r'\w{1,}',
                             ngram_range=(1, 3),
                             stop_words='english')
        # Fitting the TF-IDF on the 'overview' text
        tf_matrix = tf.fit_transform(movie['overview'].values.astype('U'))
        # Compute the sigmoid kernel
        sig = sigmoid_kernel(tf_matrix, tf_matrix)
        # Reverse mapping of indices and movie titles
        indices = pd.Series(movie.index,
                            index=movie['original_title']).drop_duplicates()
        # Get the index corresponding to original_title
        index = indices[m]
        # Get the pairwise similarity scores
        sig_scores = list(enumerate(sig[index]))
        # Sort the movies
        sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
        # Scores of the 10 most similar movies
        sig_scores = sig_scores[1:11]
        # Movie indices
        movie_indices = [i[0] for i in sig_scores]
        movieList = movie['original_title'].iloc[movie_indices]
        # movieList.columns = ['Movie Name', 'Rating']
        # movieList = movieList.sort_values(['Rating'], ascending=False)
        # Top 10 most similar movies
        return movieList
def recommendation(input_json, input_param):
    # Fetch data from the API
    counter = 0
    job_recommended = []
    jobs = input_json
    # Append the user input to the job list
    input_value = [{'title': 'INPUT', 'description': input_param}]
    title = input_value[0].get('title')
    added_jobs = jobs.append(input_value, ignore_index=True, sort=False)
    # Term frequency-inverse document frequency
    tfv = TfidfVectorizer(min_df=0,
                          max_features=None,
                          strip_accents='unicode',
                          analyzer='word',
                          token_pattern=r'\w{1,}',
                          ngram_range=(1, 3),
                          stop_words='english')
    added_jobs['description'] = added_jobs['description'].fillna('')
    tfv_matrix = tfv.fit_transform(added_jobs['description'])
    # Sigmoid kernel for the similarity calculations
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    indices = pd.Series(added_jobs.index, index=added_jobs['title'])
    # Check whether the input is too peculiar to match anything
    jobs_list_length = len(added_jobs)
    jobs_last = jobs_list_length - 1
    for i in range(jobs_list_length):
        # 0.7615941559557649 is tanh(1), the sigmoid_kernel value (coef0=1)
        # for a document with zero TF-IDF overlap with the input.
        if sig[jobs_last][i] <= 0.7615941559557649:
            counter += 1
    if counter >= jobs_last:
        return None
    else:
        # Assemble the final result
        id_input = indices[title]
        sig_scores = list(enumerate(sig[id_input]))
        sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
        sig_scores = sig_scores[1:15]
        job_indices = [i[0] for i in sig_scores]
        for job in job_indices:
            job_dic = {'title': added_jobs['title'].iloc[job]}
            job_recommended.append(job_dic)
        # Return all recommended titles
        return job_recommended
def get_recommendations(movie_id):
    sql_engine = create_engine(
        os.path.join('sqlite:///' + os.path.join(basedir, 'site.db')),
        echo=False)
    movies_results = pd.read_sql_query('select movie_id,genres from Movie',
                                       sql_engine)
    movies_results.to_csv(os.path.join(basedir, 'Movie.csv'),
                          index=False, sep=";")
    user_movies_results = pd.read_sql_query('select movie_id from Credits',
                                            sql_engine)
    user_movies_results.to_csv(os.path.join(basedir, 'Credits.csv'),
                               index=False, sep=";")
    movies_df = pd.read_csv(os.path.join(basedir, 'Movie.csv'), sep=';')
    user_df = pd.read_csv(os.path.join(basedir, 'Credits.csv'), sep=';')
    user_df.drop_duplicates(subset='movie_id', inplace=True)
    user_df.reset_index(drop=True, inplace=True)
    movies_df_merge = movies_df.merge(user_df, on='movie_id')
    tfv = TfidfVectorizer(min_df=3,
                          max_features=None,
                          strip_accents='unicode',
                          analyzer='word',
                          token_pattern=r'\w{1,}',
                          ngram_range=(1, 3),
                          stop_words='english')
    tfv_matrix = tfv.fit_transform(movies_df_merge['genres'])
    indices = pd.Series(movies_df_merge.index,
                        index=movies_df_merge['movie_id']).drop_duplicates()
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    # Get the index corresponding to the movie_id
    idx = indices[movie_id]
    # Get the pairwise similarity scores
    sig_scores = list(enumerate(sig[idx]))
    # Sort the movies
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
    # Scores of all movies except the queried one
    sig_scores = sig_scores[1:]
    # Movie indices
    movie_indices = [i[0] for i in sig_scores]
    # Similar movies, most similar first
    rem_movie = list(movies_df_merge['movie_id'].iloc[movie_indices])
    return rem_movie
def tanhFunc():
    # X, Y, and self are taken from the enclosing scope.
    if "gamma" in self.parameters["kernel"]:
        g = self.parameters["kernel"]["gamma"]
    else:
        g = 0.01
    if "offset" in self.parameters["kernel"]:
        c = self.parameters["kernel"]["offset"]
    else:
        c = 1
    K = smp.sigmoid_kernel(X, Y, gamma=g, coef0=c)
    return K
def train_and_test(self, dataTest, realOutput=None, aval=False,
                   reg=0.01, deg=3, gamm=None, coef=1):
    if self.kernelType == 'rbf':
        K = rbf_kernel(self.inTrain, self.inTrain, gamm)
        Ktest = rbf_kernel(dataTest, self.inTrain, gamm)
    elif self.kernelType == 'pol':
        K = polynomial_kernel(self.inTrain, self.inTrain, deg, gamm, coef)
        Ktest = polynomial_kernel(dataTest, self.inTrain, deg, gamm, coef)
    elif self.kernelType == 'sig':
        K = sigmoid_kernel(self.inTrain, self.inTrain, gamm, coef)
        Ktest = sigmoid_kernel(dataTest, self.inTrain, gamm, coef)
    I = np.eye(self.inTrain.shape[0])
    # Regularized kernel least squares: out = Ktest (K + reg*I)^-1 y
    outNet = np.dot(np.dot(Ktest, np.linalg.inv(K + reg * I)), self.outTrain)
    if aval:
        miss = float(cont_error(realOutput, outNet))
        si = float(outNet.shape[0])
        acc = (1 - miss / si) * 100
        print('Misclassifications on the test set:', miss, 'of', si,
              '- Accuracy:', acc, '%')
        return outNet, acc
    return outNet, None
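# A minimal usage sketch for train_and_test (an assumption, not part of the
# original source): the class name KELM and its attribute setup below are
# hypothetical stand-ins for whatever class defines inTrain, outTrain, and
# kernelType as used above.
import numpy as np

net = KELM()                         # hypothetical constructor
net.inTrain = np.random.rand(50, 3)  # 50 training samples, 3 features
net.outTrain = np.random.rand(50, 1)
net.kernelType = 'sig'
pred, _ = net.train_and_test(np.random.rand(10, 3), reg=0.1, gamm=0.5)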
def process():
    global movie_df, sig, index
    movie_df = pre_process()
    tfv = vectorizer(min_df=3,
                     max_features=None,
                     strip_accents='unicode',
                     analyzer='word',
                     token_pattern=r'\w{1,}',
                     ngram_range=(1, 3),
                     stop_words='english')
    tfv_matrix = tfv.fit_transform(movie_df['overview'])
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    index = pd.Series(movie_df.index,
                      index=movie_df['title']).drop_duplicates()
def calculateMultipleKernel(x, y):
    theta = random.sample(range(1, 47), 46)  # a random theta for now
    # Convert our 2d arrays to numpy arrays
    x = np.array(x)
    y = np.array(y)
    # Reshape the array-like input vectors since we only have one sample
    x = x.reshape(1, -1)
    y = y.reshape(1, -1)
    # Aggregate a weighted sum of kernels; within each loop, theta supplies
    # both the mixing weight (theta[index]) and the kernel parameters.
    kernelResult = 0
    index = 0
    for i in range(0, 3):
        kernelResult += theta[index] * additive_chi2_kernel(x, y)
        index += 1
    for i in range(0, 3):
        kernelResult += theta[index] * chi2_kernel(x, y, theta[index + 1])
        index += 2
    for i in range(0, 3):
        kernelResult += theta[index] * cosine_similarity(x, y)
        index += 1
    for i in range(0, 3):
        kernelResult += theta[index] * linear_kernel(x, y)
        index += 1
    for i in range(0, 3):
        kernelResult += theta[index] * polynomial_kernel(
            x, y, theta[index + 1], theta[index + 2], theta[index + 3])
        index += 4
    for i in range(0, 3):
        kernelResult += theta[index] * rbf_kernel(x, y, theta[index + 1])
        index += 2
    for i in range(0, 3):
        kernelResult += theta[index] * laplacian_kernel(x, y, theta[index + 1])
        index += 2
    for i in range(0, 3):
        kernelResult += theta[index] * sigmoid_kernel(x, y, theta[index + 1])
        index += 2
    return kernelResult
def compare_data(self):
    # Get all movies that are animated (or not), matching the searched movie
    data = Movies.query.filter_by(animation=self.movie_details.animation)
    # Convert the list of SQLAlchemy movie objects into a data frame
    movie_data_frame = pd.DataFrame(
        [(d.title, d.overview, d.image, d.popularity, d.release_date)
         for d in data],
        columns=['title', 'overview', 'image', 'popularity', 'release_date'])
    # Specify parameters for the comparison
    tfv = TfidfVectorizer(min_df=1,
                          max_features=None,
                          strip_accents='unicode',
                          analyzer='word',
                          token_pattern=r'\w{1,}',
                          ngram_range=(1, 3),
                          stop_words='english')
    # Specify the column for the comparison
    tfv_matrix = tfv.fit_transform(movie_data_frame['overview'])
    # Form the similarity matrix
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    # Remove duplicate movies and index them
    indices = pd.Series(movie_data_frame.index,
                        index=movie_data_frame['title']).drop_duplicates()
    return movie_data_frame, sig, indices
def kernel_function(self, x1, x2):
    features = []
    # Linear kernels:
    # Cosine distance
    features.append(np.squeeze(1 - pairwise.paired_cosine_distances(x1, x2)[0]))
    # Manhattan distance
    features.append(pairwise.paired_manhattan_distances(x1, x2)[0])
    # Euclidean distance
    features.append(pairwise.paired_euclidean_distances(x1, x2)[0])
    # Chebyshev distance
    features.append(pairwise.pairwise_distances(x1, x2, metric="chebyshev")[0][0])
    # Statistical kernels:
    # Pearson coefficient
    pearson = stats.pearsonr(np.squeeze(np.asarray(x1)),
                             np.squeeze(np.asarray(x2)))[0]
    features.append(0 if np.isnan(pearson) else pearson)
    # Spearman coefficient
    spearman = stats.spearmanr(x1, x2, axis=1).correlation
    features.append(0 if np.isnan(spearman) else spearman)
    # Kendall tau coefficient
    kendall = stats.kendalltau(x1, x2).correlation
    features.append(0 if np.isnan(kendall) else kendall)
    # Non-linear kernels:
    # Polynomial
    features.append(pairwise.polynomial_kernel(x1, x2, degree=2)[0][0])
    # RBF
    features.append(pairwise.rbf_kernel(x1, x2)[0][0])
    # Laplacian
    features.append(pairwise.laplacian_kernel(x1, x2)[0][0])
    # Sigmoid
    features.append(pairwise.sigmoid_kernel(x1, x2)[0][0])
    return features
def Recommendation_System(df, player_id, k):
    query = str(Playerdata.objects.all().query)
    df1 = pd.read_sql_query(query, connection)
    ID2namesmapper = df1.set_index('sofifa_id')['short_name']
    sc = StandardScaler()
    df_sc = sc.fit_transform(df)
    kn = sigmoid_kernel(df_sc, df_sc)
    so_fifa_id = list(df.index)
    kn_df = pd.DataFrame(kn, index=so_fifa_id, columns=so_fifa_id)
    try:
        temp_dict = kn_df[player_id].to_dict()
        # Sort player IDs by similarity, most similar first
        # (key/val renamed so they do not shadow the parameter k).
        temp_list = list({key: val for key, val in
                          sorted(temp_dict.items(),
                                 key=lambda item: item[1],
                                 reverse=True)}.keys())
        temp_list.remove(player_id)
        return ID2namesmapper[temp_list[0:k]].to_list()
    except KeyError:
        print('PlayerID not present in the database')
def drawAlgoCompGraph():
    h = 0.02
    names = ["ridge", "KNN", "Linear SVM", "RBF SVM", "LDA", "Random Forest",
             "AdaBoost", "Naive Bayes", "QDA", "Logistic"]
    kernel_names = ['laplacian kernel', 'RBF kernel', 'Sigmoid kernel']
    classifiers = [
        linear_model.Ridge(),
        KNeighborsClassifier(9),
        SVC(kernel="linear", C=0.025),
        SVC(kernel="rbf", gamma=0.25),
        LDA(),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        QDA(),
        linear_model.LogisticRegression()]
    filename = '/Users/guichengwu/Desktop/208_mid term/exam.dat'
    data = np.loadtxt(filename, dtype='str')
    # Strip the two-character prefix from every field except the label column
    for i in range(data.shape[0]):
        for j in range(1, data.shape[1]):
            data[i][j] = data[i][j][2:]
    data_matrix = np.matrix(data).astype(float)
    X = data_matrix[:, 1:5]
    y = np.asarray(data_matrix[:, 0])
    X = preprocessing.scale(X)
    # Project each kernel matrix to 2D with PCA for plotting
    Lap_X = laplacian_kernel(X)
    pca1 = decomposition.PCA(n_components=2)
    pca1.fit(Lap_X)
    Lap_X = pca1.transform(Lap_X)
    RBF_X = rbf_kernel(X)
    pca2 = decomposition.PCA(n_components=2)
    pca2.fit(RBF_X)
    RBF_X = pca2.transform(RBF_X)
    Sig_X = sigmoid_kernel(X)
    pca3 = decomposition.PCA(n_components=2)
    pca3.fit(Sig_X)
    Sig_X = pca3.transform(Sig_X)
    linearly_separable1 = (Lap_X, y)
    linearly_separable2 = (RBF_X, y)
    linearly_separable3 = (Sig_X, y)
    datasets = [linearly_separable1, linearly_separable2, linearly_separable3]
    figure = plt.figure(figsize=(30, 10))
    i = 1
    for kernel_name, ds in zip(kernel_names, datasets):
        X, y = ds
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
        x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
        y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        cm = plt.cm.RdBu
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])
        # Plot the kernel-projected data in the first column of each row
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(kernel_name)
        i += 1
        # Iterate over classifiers
        for name, clf in zip(names, classifiers):
            ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)
            # Plot the decision boundary on the mesh grid
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
            else:
                Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8)
            ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
            ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                       alpha=0.6)
            ax.set_xlim(xx.min(), xx.max())
            ax.set_ylim(yy.min(), yy.max())
            ax.set_xticks(())
            ax.set_yticks(())
            ax.set_title(name)
            ax.text(xx.max() - 0.3, yy.min() + 0.3,
                    ('%.2f' % score).lstrip('0'),
                    size=15, horizontalalignment='right')
            i += 1
    figure.subplots_adjust(left=0.02, right=0.98)
    plt.show()
    figure.savefig('/Users/guichengwu/Desktop/algorithm_comparison2.png')