def pca(self):
    svd = SVD(arr)   # `arr` is assumed to be available in the enclosing scope (e.g. an attribute or module-level array)
    u, s, vt = svd.svd()
    variances = np.diag(s)**2
    variances_sum = sum(variances)
    self.explained_var = variances / variances_sum
    self.loadings = u.T
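# A minimal, self-contained sketch of the same idea in plain NumPy (not the SVD
# class used above): explained-variance ratios and loadings read off from the
# singular values of a centered, made-up data matrix.
import numpy as np

X = np.random.default_rng(0).normal(size=(100, 5))   # toy data, samples x features
Xc = X - X.mean(axis=0)                               # center the columns
u, s, vt = np.linalg.svd(Xc, full_matrices=False)
explained_var = s**2 / np.sum(s**2)                   # variance explained per component
loadings = vt.T                                       # principal axes as columns
print(explained_var)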
def worker(fold, n_users, n_items, dataset_dir):
    traFilePath = dataset_dir + 'ratings__' + str(fold + 1) + '_tra.txt'
    trasR = loadSparseR(n_users, n_items, traFilePath)
    print(dataset_dir.split('/')[-2] + ':', trasR.shape, trasR.nnz,
          '%.2f' % (trasR.nnz / float(trasR.shape[0])))
    tra_tuple = np.array([(user, item, trasR[user, item])
                          for user, item in np.asarray(trasR.nonzero()).T])  # triad

    tstFilePath = dataset_dir + 'ratings__' + str(fold + 1) + '_tst.txt'
    tstsR = loadSparseR(n_users, n_items, tstFilePath)
    tst_tuple = np.array([(user, item, tstsR[user, item])
                          for user, item in np.asarray(tstsR.nonzero()).T])  # triad

    sampler = Sampler(trasR=trasR, negRatio=.0, batch_size=batch_size)
    svd = SVD(n_users, n_items, eval_metrics, range_of_ratings, reg, n_factors, batch_size)
    scores = svd.train(fold + 1, tra_tuple, tst_tuple, sampler)

    print('fold=%d:' % fold,
          ','.join(['%s' % eval_metric for eval_metric in eval_metrics]), '=',
          ','.join(['%.6f' % score for score in scores]))
    return scores
def __init__(self, data, k=-1, rrank=0, crank=0):
    SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)

    # select all data samples for computing the error:
    # note that this might take very long, adjust self._rset and self._cset
    # for faster computations.
    self._rset = range(self._rows)
    self._cset = range(self._cols)
def __init__(self, data, k=-1, rrank=0, crank=0):
    SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)

    # select all data samples for computing the error:
    # note that this might take very long, adjust self._rset and self._cset
    # for faster computations.
    self._rset = range(self._rows)
    self._cset = range(self._cols)
def __init__(self, data, rrank=0, crank=0, show_progress=True):
    SVD.__init__(self, data, rrank=rrank, crank=crank, show_progress=show_progress)

    # select all data samples for computing the error:
    # note that this might take very long, adjust self._rset and self._cset for
    # faster computations.
    self._rset = range(self._rows)
    self._cset = range(self._cols)
def _update_w(self):
    svd_mdl = SVD(self.data)
    svd_mdl.factorize()
    U, S, V = svd_mdl.U, svd_mdl.S, svd_mdl.V

    # The first left singular vector is nonnegative
    # (abs is only used as values could be all negative)
    self.W[:, 0] = np.sqrt(S[0, 0]) * np.abs(U[:, 0])

    # The first right singular vector is nonnegative
    self.H[0, :] = np.sqrt(S[0, 0]) * np.abs(V[0, :].T)

    for i in range(1, self._num_bases):
        # Form the rank one factor
        Tmp = np.dot(U[:, i:i + 1] * S[i, i], V[i:i + 1, :])

        # zero out the negative elements
        Tmp = np.where(Tmp < 0, 0.0, Tmp)

        # Apply 2nd SVD
        svd_mdl_2 = SVD(Tmp)
        svd_mdl_2.factorize()
        u, s, v = svd_mdl_2.U, svd_mdl_2.S, svd_mdl_2.V

        # The first left singular vector is nonnegative
        self.W[:, i] = np.sqrt(s[0, 0]) * np.abs(u[:, 0])

        # The first right singular vector is nonnegative
        self.H[i, :] = np.sqrt(s[0, 0]) * np.abs(v[0, :].T)
def main(movies_path, scores_path, db_config_path='../config.ini'):
    movies_reindexed_path = movies_path + "_reindexed"
    scores_reindexed_path = scores_path + "_reindexed"
    Utils.reindex_movies(movies_path, scores_path,
                         movies_reindexed_path, scores_reindexed_path)

    scores = {}
    with open(scores_reindexed_path, 'r') as f:
        for line in f:
            line_split = line.split(',')
            user_id = int(line_split[0])
            movie_id = int(line_split[1])
            score = float(line_split[2])
            scores[user_id, movie_id] = score

    svd = SVD()
    svd.train(scores)
    similarities = svd.find_top_similar_items()

    db_config = ConfigParser()
    db_config.read(db_config_path)
    similarities_dao = SimilaritiesDao({
        "host": db_config.get("mysql", "host"),
        "user": db_config.get("mysql", "user"),
        "passwd": db_config.get("mysql", "passwd"),
        "db": db_config.get("mysql", "db")
    })

    movies = Utils.reindexed_movie_titles(movies_reindexed_path)
    normalized_movies = {}
    for (movie_id, title) in enumerate(movies):
        normalized_movies[Utils.normalize_title(title)] = movie_id

    similarities_dao.store_movies(movies)
    similarities_dao.store_normalized_movies(normalized_movies)
    similarities_dao.store_similarities(similarities)
    similarities_dao.close()
def update_w(self):
    # compute eigenvectors and eigenvalues using SVD
    svd_mdl = SVD(self.data)
    svd_mdl.factorize()

    # argsort sorts in ascending order -> do reverse indexing
    # for accessing values in descending order
    S = np.diag(svd_mdl.S)
    order = np.argsort(S)[::-1]

    # select only a few eigenvectors ...
    if self._num_bases > 0:
        order = order[:self._num_bases]

    self.W = svd_mdl.U[:, order]
    self.eigenvalues = S[order]
def _update_w(self):
    # compute eigenvectors and eigenvalues using SVD
    svd_mdl = SVD(self.data)
    svd_mdl.factorize()

    # argsort sorts in ascending order -> do reverse indexing
    # for accessing values in descending order
    S = np.diag(svd_mdl.S)
    order = np.argsort(S)[::-1]

    # select only a few eigenvectors ...
    if self._num_bases > 0:
        order = order[:self._num_bases]

    self.W = svd_mdl.U[:, order]
    self.eigenvalues = S[order]
def main(args):
    import optparse
    parser = optparse.OptionParser()
    parser.usage = __doc__
    parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True,
                      help="don't print status messages to stdout")
    parser.add_option("-l", "--load", action="store_true", dest="load",
                      help="load from a cache file")
    parser.add_option("-c", "--cache", action="store_true", dest="cache",
                      help="build a cache of data")
    parser.add_option("-f", "--features", action="store", type=int, dest="nFeatures", default=10,
                      help="use nfeatures")
    parser.add_option("-e", "--epochs", action="store", type=int, dest="nepochs", default=10,
                      help="train through nepochs")
    parser.add_option("-s", "--slow", action="store_true", dest="slow",
                      help="use non-cython model (probably will be obsolete)")
    parser.add_option("-S", "--size", action="store_true", dest="getsize", default=False,
                      help="print np.ndarray sizes and exit")
    parser.add_option("-p", "--nprocs", action="store", dest="nprocs", type=int, default=1,
                      help="run in threaded mode")
    (options, args) = parser.parse_args()

    if len(args) < 1:
        parser.error("Not enough arguments given")

    if options.load:
        svd = SVD.load(args[0])
    else:
        svd = SVD(args[0], options.nFeatures)

    if options.cache:
        svd.dump()
    elif options.getsize:
        size = svd.getsize()
        print "%d bytes, %dMB" % (size, size / 2.**20)
    else:
        if options.nprocs == 1:
            svd.train_all(options.nepochs)
        else:
            import mpsvd
            mpsvd.runManySVDs(args[0], options.nprocs, options.nepochs)
        svd.validate()
    return 0
def U_calc(self):
    """Generates the U matrix with the help of the intersection of the C and R
    matrices, which are found by selecting rows and columns randomly with a
    certain probability associated with the selection of that row or column."""
    self.ucal = self.R[:, self.c]
    svd = SVD()
    svd.Ucalc(self.ucal, self.ucal.transpose())
    svd.Vcalc(self.ucal, self.ucal.transpose())
    svd.Sigma()
    sigma = svd.sigma
    for i in range(0, max(svd.rank_u, svd.rank_v)):
        if sigma[i, i] != 0:
            sigma[i, i] = (1 / sigma[i, i])
    self.U = (svd.V.transpose()) * (sigma) * (sigma) * (svd.U.transpose())
def calculate(self):
    self.allPredicts = np.zeros((4, self.testSize))

    bias = Bias(self.trainData, self.testData)
    bias.calculateBias()
    answers, predicts = bias.predict()
    self.biasClass = bias
    self.allPredicts[0, :] = predicts
    #print("Bias: %f" % evaluationRMSE(answers, predicts))

    similarity = Similarity(self.trainData, self.testData)
    similarity.calculateBias()
    similarity.calcSimiMatrix()
    answers, predicts = similarity.predict()
    self.similarityClass = similarity
    self.allPredicts[1, :] = predicts
    #print("Similarity: %f" % evaluationRMSE(answers, predicts))

    svd = SVD(self.trainData, self.testData)
    svd.generaterMat()
    svd.calcSVD()
    answers, predicts = svd.predict()
    self.svdClass = svd
    self.allPredicts[2, :] = predicts
    #print("SVD: %f" % evaluationRMSE(answers, predicts))

    matFactory = MatFactory(self.trainData, self.testData)
    matFactory.train(10, 11)
    answers, predicts = matFactory.predict()
    self.matFactoryClass = matFactory
    self.allPredicts[3, :] = predicts
    #print("MatFactory: %f" % evaluationRMSE(answers, predicts))

    with open(predictsFile, 'wb') as pickleFile:
        pickle.dump(self.allPredicts, pickleFile)
def load_svd(file):
    """
    Load the SVD file

    Load an SVD file. If the file name is a valid path, the function will load
    the file. Otherwise, the function will try to get the SVD file from the
    package. If the function can't find an SVD file, it raises a
    FileNotFoundError.

    :param file: The name of the file to open to load the SVD
    """
    try:
        if os.path.exists(file):
            svd = SVD(file)
        else:
            svd = SVDText(open_resource(None, file).read())
        svd.parse()
        return svd
    except OSError:
        raise FileNotFoundError
def comp_prob(d, k):
    # compute statistical leverage score
    c = np.round(k - k / 5.0)

    svd_mdl = SVD(d, k=c)
    svd_mdl.factorize()

    if scipy.sparse.issparse(self.data):
        A = svd_mdl.V.multiply(svd_mdl.V)  ## Rule 1
        pcol = np.array(A.sum(axis=0) / k)
    else:
        A = svd_mdl.V[:k, :]**2.0  ## Rule 1
        pcol = A.sum(axis=0) / k

    #c = k * np.log(k / (self._eps**2.0))
    #pcol = c * pcol.reshape((-1, 1))
    pcol /= np.sum(pcol)

    return pcol
def comp_prob(d, k):
    # compute statistical leverage score
    c = np.round(k - k / 5.0)

    svd_mdl = SVD(d, k=c)
    svd_mdl.factorize()

    if scipy.sparse.issparse(self.data):
        A = svd_mdl.V.multiply(svd_mdl.V)  ## Rule 1
        pcol = np.array(A.sum(axis=0) / k)
    else:
        A = svd_mdl.V[:k, :]**2.0  ## Rule 1
        pcol = A.sum(axis=0) / k

    #c = k * np.log(k / (self._eps**2.0))
    #pcol = c * pcol.reshape((-1, 1))
    pcol /= np.sum(pcol)

    return pcol
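# A small NumPy-only illustration of the "Rule 1" leverage scores approximated in
# comp_prob above: square the entries of the top-k right singular vectors and
# average over k. `A` is a toy matrix used only for the example.
import numpy as np

A = np.arange(20, dtype=float).reshape(4, 5)
k = 2
_, _, vt = np.linalg.svd(A, full_matrices=False)
pcol = (vt[:k, :]**2).sum(axis=0) / k   # leverage score of each column
pcol /= pcol.sum()                      # normalize into a sampling distribution
print(pcol)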
def _update_w(self):
    svd_mdl = SVD(self.data)
    svd_mdl.factorize()
    U, S, V = svd_mdl.U, svd_mdl.S, svd_mdl.V

    # The first left singular vector is nonnegative
    # (abs is only used as values could be all negative)
    self.W[:, 0] = np.sqrt(S[0, 0]) * np.abs(U[:, 0])

    # The first right singular vector is nonnegative
    self.H[0, :] = np.sqrt(S[0, 0]) * np.abs(V[0, :].T)

    for i in range(1, self._num_bases):
        # Form the rank one factor
        Tmp = np.dot(U[:, i:i + 1] * S[i, i], V[i:i + 1, :])

        # zero out the negative elements
        Tmp = np.where(Tmp < 0, 0.0, Tmp)

        # Apply 2nd SVD
        svd_mdl_2 = SVD(Tmp)
        svd_mdl_2.factorize()
        u, s, v = svd_mdl_2.U, svd_mdl_2.S, svd_mdl_2.V

        # The first left singular vector is nonnegative
        self.W[:, i] = np.sqrt(s[0, 0]) * np.abs(u[:, 0])

        # The first right singular vector is nonnegative
        self.H[i, :] = np.sqrt(s[0, 0]) * np.abs(v[0, :].T)
def main(movies_path, scores_path, db_config_path='../config.ini'):
    movies_reindexed_path = movies_path + "_reindexed"
    scores_reindexed_path = scores_path + "_reindexed"
    Utils.reindex_movies(movies_path, scores_path,
                         movies_reindexed_path, scores_reindexed_path)

    scores = {}
    with open(scores_reindexed_path, 'r') as f:
        for line in f:
            line_split = line.split(',')
            user_id = int(line_split[0])
            movie_id = int(line_split[1])
            score = float(line_split[2])
            scores[user_id, movie_id] = score

    svd = SVD()
    svd.train(scores)
    similarities = svd.find_top_similar_items()

    db_config = ConfigParser()
    db_config.read(db_config_path)
    similarities_dao = SimilaritiesDao({
        "host": db_config.get("mysql", "host"),
        "user": db_config.get("mysql", "user"),
        "passwd": db_config.get("mysql", "passwd"),
        "db": db_config.get("mysql", "db")
    })

    movies = Utils.reindexed_movie_titles(movies_reindexed_path)
    normalized_movies = {}
    for (movie_id, title) in enumerate(movies):
        normalized_movies[Utils.normalize_title(title)] = movie_id

    similarities_dao.store_movies(movies)
    similarities_dao.store_normalized_movies(normalized_movies)
    similarities_dao.store_similarities(similarities)
    similarities_dao.close()
def main(args):
    summarizer = {
        'tfidf': TfIdf(),
        'cluster': Cluster(),
        'svd': SVD(),
        'pagerank': PageRank()
    }[args['alg']]
    summarizer.initialize(args['tf'], args['df'])
    summary = summarizer.summarize(args['doc'])

    for s in summary:
        print(s)
def calAll(self):
    self.errs = [0] * 5

    bias = Bias(self.data, self.test)
    bias.calculateBias()
    answers, predicts = bias.predict()
    err = evaluationRMSE(answers, predicts)
    self.errs[0] = err
    print("Bias: %f" % err)

    similarity = Similarity(self.data, self.test)
    similarity.calculateBias()
    similarity.calcSimiMatrix()
    answers, predicts = similarity.predict()
    err = evaluationRMSE(answers, predicts)
    self.errs[1] = err
    print("Similarity: %f" % err)

    svd = SVD(self.data, self.test)
    svd.generaterMat()
    svd.calcSVD()
    answers, predicts = svd.predict()
    err = evaluationRMSE(answers, predicts)
    self.errs[2] = err
    print("SVD: %f" % err)

    matFactory = MatFactory(self.data, self.test)
    matFactory.train(20, 35)
    answers, predicts = matFactory.predict()
    err = evaluationRMSE(answers, predicts)
    self.errs[3] = err
    print("MatFactory: %f" % err)

    combination = Combination(self.data)
    combination.separateData()
    combination.calculate()
    combination.train(alpha=0.01, iter=10000)
    answers, predicts = combination.predict(self.test)
    err = evaluationRMSE(answers, predicts)
    self.errs[4] = err
    print("Combination: %f" % err)

    return self.errs
def run_single_comparison(df_dict):
    X = []
    GM_Y = []
    SVD_Y = []

    for df in df_dict:
        X.append(df_dict[df].columns.size)

        gm_start = time.time()
        Gaussian_Mixture(df_dict[df])
        gm_time = time.time() - gm_start
        GM_Y.append(gm_time)
        #print("\nTime to run GM: on size ", df_dict[df].columns.size, ": ", gm_time, " seconds")

        svd_start = time.time()
        SVD(0, 10, df_dict[df])
        svd_time = time.time() - svd_start
        SVD_Y.append(svd_time)
        #print("Time to run SVD on size ", df_dict[df].columns.size, ": ", svd_time, " seconds\n")

        print(df_dict[df].columns.size, " calculated.")

    return X, GM_Y, SVD_Y
def on_enter(self):
    # Update user ratings
    global visited_p1, visited_p2
    visited_p2 = 1
    if visited_p1 == 0:
        for x in initializeRatings:
            userRatings.append(float(initializeRatings.get(x)))
        data.iloc[-1] = userRatings

    user_likes = get_likes(data)
    last_user = list(user_likes.keys())[-1]
    u_likes = user_likes[last_user]

    m_likes = []
    recommendations_l = []
    count = 0
    for m in u_likes:
        # stop at 10 movies
        if count == 10:
            break
        # find the movie index number
        m_index = data.columns.get_loc(m)
        print(m_index)

        start = time.time()
        similar_movies = SVD(m_index, 10, data)
        print("Time to calculate SVD: ", time.time() - start, " seconds.")

        m_likes.extend(similar_movies)
        #print(m_likes)
        count += 1

    recommendations_l = [movie for movie, count in Counter(m_likes).most_common(15)]

    # return page layout to display
    layout = GridLayout(cols=4)
    new_layout = getlayout(layout, recommendations_l, 'SVD')
    self.add_widget(new_layout)
def run_multi_comparison(dataFrames):
    interval = int(40 / (len(dataFrames) - 1))
    X = []
    GM_Y = []
    SVD_Y = []
    GM_time = 0

    for i in range(len(dataFrames)):
        if i == 0:
            GM_start = time.time()
            Gaussian_Mixture(dataFrames[i])
            GM_time = time.time() - GM_start
        else:
            GM_Y.append(GM_time)
            X.append(i * interval)

            SVD_start = time.time()
            for j in range(i * interval):
                SVD(j, 10, dataFrames[i])
            SVD_time = time.time() - SVD_start
            SVD_Y.append(SVD_time)

            print(i * interval, " calculated.")

    return X, GM_Y, SVD_Y
def calAll(self):
    self.errs = [0] * 5

    bias = Bias(self.data, self.test)
    bias.calculateBias()
    answers, predicts = bias.predict()
    err = evaluationRMSE(answers, predicts)
    self.errs[0] = err
    print("Bias: %f" % err)

    similarity = Similarity(self.data, self.test)
    similarity.calculateBias()
    similarity.calcSimiMatrix()
    answers, predicts = similarity.predict()
    err = evaluationRMSE(answers, predicts)
    self.errs[1] = err
    print("Similarity: %f" % err)

    svd = SVD(self.data, self.test)
    svd.generaterMat()
    svd.calcSVD()
    answers, predicts = svd.predict()
    err = evaluationRMSE(answers, predicts)
    self.errs[2] = err
    print("SVD: %f" % err)

    matFactory = MatFactory(self.data, self.test)
    matFactory.train(20, 35)
    answers, predicts = matFactory.predict()
    err = evaluationRMSE(answers, predicts)
    self.errs[3] = err
    print("MatFactory: %f" % err)

    combination = Combination(self.data)
    combination.separateData()
    combination.calculate()
    combination.train(alpha=0.01, iter=10000)
    answers, predicts = combination.predict(self.test)
    err = evaluationRMSE(answers, predicts)
    self.errs[4] = err
    print("Combination: %f" % err)

    return self.errs
parser.add_argument("filename", nargs="?", default="data/news_dataset.csv") parser.add_argument("--threads", "-j", default=1, type=int) return parser.parse_args() if __name__ == "__main__": getLogger().setLevel(DEBUG) args = parse_arguments() try: args = parse_arguments() if args.threads == 1: process = BaseProcess() process.run(args.filename) decomposition = SVD(process.tfidf, 100) else: manager = ManagerProcess(args.threads) manager.run(args.filename) decomposition = SVD(manager.tfidf, 100) startconvert = time() a = decomposition.create_numpy_matrices() a_sparse = decomposition.turn_sparse(a) endconvert = time() start = time() decomposition.calculate_eigenvalues(a_sparse) print(decomposition.eigenvaluesMTM) print(decomposition.eigenvaluesMMT) end = time()
import pandas as pd
import dataset as dp
from svd import SVD
from svdpp import SVDpp

print('--------------------Matrix factorization--------------------')
train_data, test_data, data = dp.load_dataset()
table = {}

f = 10
model = SVD(train_data, f)
model.train()
table[f] = [model.test(test_data)]
model = SVDpp(train_data, f)
model.train()
table[f].append(model.test(test_data))

f = 20
model = SVD(train_data, f)
model.train()
table[f] = [model.test(test_data)]
model = SVDpp(train_data, f)
model.train()
table[f].append(model.test(test_data))

f = 50
model = SVD(train_data, f)
def generate_cur(self, mode):
    '''
    This function will generate the C, U and R matrices by selecting columns
    and rows based on the probabilities.

    Call this function when using for the first time, to create a CUR
    decomposition. This will automatically update the C, U, R member elements
    of the CUR object.
    '''
    # Getting the data matrix
    data_matrix = (self.data_matrix).astype(np.float64)
    # data_matrix = np.array([[1,2,3],[4,5,6],[7,8,9],[10,11,12]], dtype=np.float64)

    # Calculating the probabilities for the columns.
    ColumnProb = []
    denominator = np.sum(np.square(data_matrix))
    for c in range(data_matrix.shape[1]):
        ColumnProb.append(np.sum(np.square(data_matrix[:, c])) / denominator)

    chosenColumns = np.random.choice(data_matrix.shape[1],
                                     int(math.floor(0.9 * data_matrix.shape[1])),
                                     False, ColumnProb)
    C_matrix = np.zeros(shape=(data_matrix.shape[0], chosenColumns.shape[0]))
    for i, col in enumerate(chosenColumns):
        C_matrix[:, i] = data_matrix[:, col] / math.sqrt(
            chosenColumns.shape[0] * ColumnProb[col])

    # Calculating the probabilities for the rows.
    RowProb = []
    for r in range(data_matrix.shape[0]):
        RowProb.append(np.sum(np.square(data_matrix[r, :])) / denominator)

    chosenRows = np.random.choice(data_matrix.shape[0],
                                  int(math.floor(0.9 * data_matrix.shape[0])),
                                  False, RowProb)
    R_matrix = np.zeros(shape=(chosenRows.shape[0], data_matrix.shape[1]))
    for i, row in enumerate(chosenRows):
        R_matrix[i, :] = data_matrix[row, :] / math.sqrt(
            chosenRows.shape[0] * RowProb[row])

    # Intersection of the chosen rows and columns.
    W = np.zeros(shape=(chosenRows.shape[0], chosenColumns.shape[0]))
    for i in range(chosenRows.shape[0]):
        for j in range(chosenColumns.shape[0]):
            W[i][j] = data_matrix[chosenRows[i]][chosenColumns[j]]

    svd = SVD(None, None, 'no_normalize', 'CUR', W)
    if mode == '90-percent':
        svd._set_90percent_energy_mode()

    sigma_inverse = []
    for i in range(svd.sigma_vector.shape[0]):
        if abs(svd.sigma_vector[i]) < 0.1:
            sigma_inverse.append(svd.sigma_vector[i])
        else:
            sigma_inverse.append(1 / svd.sigma_vector[i])
    Zplus = np.diag(sigma_inverse)**2
    Wplus = svd.V_matrix.dot(Zplus.dot(svd.U_matrix.T))

    self.C_matrix = C_matrix
    self.U_matrix = Wplus
    self.R_matrix = R_matrix
    self.reconstructed_matrix = C_matrix.dot(Wplus.dot(R_matrix))

    print("Renormalizing the rating-matrix")
    self.reconstructed_matrix = (self.reconstructed_matrix * self.user_var_vec
                                 + self.user_mean_vec)

    non_zero_mask = self.data_matrix != 0
    diff = (self.data_matrix - self.reconstructed_matrix) * non_zero_mask
    rmse_val = np.mean(np.square(diff))**0.5
    print(rmse_val)
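# A toy check of the column-selection probabilities used in generate_cur above:
# each column is sampled with probability proportional to its squared (Frobenius) mass.
import numpy as np

M = np.array([[1., 2.],
              [3., 4.]])
col_prob = (M**2).sum(axis=0) / (M**2).sum()
print(col_prob)   # -> [0.3333... 0.6666...], i.e. 10/30 and 20/30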
from load_data import read_csv
from svd import SVD
from sklearn.cluster import MiniBatchKMeans
import pandas as pd

if __name__ == '__main__':
    rec_data = read_csv('../input/user_item_cnt.csv')
    rec = SVD(rec_data)
    rec.fit()
    score = rec.get_score()

    tmp = [dict(v, user_id=user_id)
           for user_id, aaa in score.items()
           for v in aaa]
    df = pd.DataFrame(tmp)
    df.head()
    df.to_csv('svd2.csv', index=False)

    """
    model = MiniBatchKMeans(n_clusters=100, random_state=0)
    model.fit(rec.user_matrix)
    pred = model.predict(rec.user_matrix)
    users = [rec_data.map_idx2user[i] for i in range(len(rec_data.map_idx2user))]
    max(users)
    len(rec_data.map_idx2user)
    df = pd.DataFrame({'user_id': users, 'cluster': pred})
    df.to_csv('cluster.csv', index=False)
    """
def reduce_dimension_function(option, X_train, new_dim):
    if option == 'pca':
        n_batches = 10
        pca = PCA(n_components=new_dim)
        pca.fit(X_train)
        X_reduced = pca.transform(X_train)
        print(np.shape(X_reduced))
        return X_reduced

    elif option == 'autoencoder':
        autoe = AUTOE()
        autoe.set_data(X_train)
        autoe.shuffle_data()
        # autoe.normalize(-1.0, 1.0)
        autoe.divide_data(0.8)
        autoe.create_autoencoder(new_dim)
        # autoe.normalize()  # best results of clustering for interval [0, 1]
        # autoe.standardize()
        autoe.train_autoencoder()
        # autoe.test_autoencoder()
        # autoe.get_activations()
        autoe.sort_activations()
        # autoe.plot_reconstruction(i+1)
        # autoe.save_activations('caract_autoe.csv')
        # autoe.save_activations(filename+'_'+str(i+1)+'.csv')
        # autoe.save_activations('caract_autoe.csv')
        return autoe.get_activations()

    elif option == 'svd':
        svd = SVD()
        svd.set_data(X_train)
        # svd.load_data('dataset.csv')
        svd.shuffle_data()
        # svd.normalize(-1.0, 1.0)
        # svd.standardize()
        svd.run_svd(new_dim)
        svd.sort_coefficients()
        # svd.save_activations('caract_'+svd.__class__.__name__.lower()+'60.csv')
        # svd.save_activations(filename+'_'+str(i+1)+'.csv')
        return svd.get_coefficients()

    elif option == 'cp':
        cp = CP()
        cp.set_data(X_train)
        # cp.load_data('dataset.csv')
        cp.shuffle_data()
        # cp.normalize(-1.0, 1.0)
        # cp.standardize()
        cp.execute_cp(new_dim)
        cp.sort_coefficients()
        # cp.save_activations(filename+'_'+str(i+1)+'.csv')
        # cp.save_activations('caract_cp.csv')
        return cp.get_coefficients()

    elif option == 'dct':
        dcost = DCT()
        dcost.set_data(X_train)
        dcost.shuffle_data()
        # dcost.normalize(-1.0, 1.0)
        dcost.execute_dct(new_dim)
        dcost.sort_coefficients()
        # dcost.save_activations(filename+'_'+str(i+1)+'.csv')
        # dcost.save_activations('caract_dct.csv')
        return dcost.get_coefficients()

    elif option == 'dwt':
        dwt = DWT()
        dwt.set_data(X_train)
        dwt.shuffle_data()
        # dwt.normalize(-1, 1)
        # dwt.standardize()
        dwt.execute_dwt(new_dim)
        dwt.sort_coefficients()
        return dwt.get_coefficients()

    elif option == 'ipla':
        paa = IPLA()
        paa.set_data(X_train)
        # paa.load_data('dataset.csv')
        paa.shuffle_data()
        # paa.normalize()
        # paa.standardize()
        paa.execute_ipla(new_dim)
        paa.sort_coefficients()
        return paa.get_coefficients()

    elif option == 'paa':
        paa = PAA()
        paa.set_data(X_train)
        # paa.load_data('dataset.csv')
        paa.shuffle_data()
        # paa.normalize(-1, 1)
        # paa.standardize()
        paa.execute_paa(new_dim)
        paa.sort_coefficients()
        return paa.get_coefficients()

    elif option == 'sax':
        sax = SAX()
        sax.set_data(X_train)
        sax.shuffle_data()
        # sax.normalize()
        # sax.standardize()
        sax.execute_sax(new_dim)
        sax.sort_coefficients()
        return sax.get_coefficients()

    else:
        return 'Invalid option'
#!/usr/bin/python
from clip import *
from svd import SVD
from optparse import OptionParser

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("-s", type="float", dest="start_time", default=0)
    parser.add_option("-f", type="float", dest="end_time", default=10)
    (options, args) = parser.parse_args()

    if len(args) != 2:
        print "Usage: script input.wav output.pickle"
        exit(0)

    c = Clip(args[0], int(options.start_time * 44100), int(44100 * options.end_time))
    s = Spectrogram(c)
    svd = SVD(spectrogram=s)
    svd.save(args[1])
def __init__(self, data, k=-1, rrank=0, crank=0):
    SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)
#!/usr/bin/python
from clip import *
from svd import SVD
from optparse import OptionParser

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("-s", type="float", dest="start_time", default=0)
    parser.add_option("-f", type="float", dest="end_time", default=10)
    (options, args) = parser.parse_args()

    if len(args) != 2:
        print "Usage: script input.wav output.pickle"
        exit(0)

    c = Clip(args[0], int(options.start_time * 44100), int(44100 * options.end_time))
    s = Spectrogram(c)
    svd = SVD(spectrogram=s)
    svd.save(args[1])
#!/usr/bin/python
from clip import *
from svd import SVD
from optparse import OptionParser
import sys

if __name__ == '__main__':
    svd = SVD(filename=sys.argv[1])

    k = []
    for arg in sys.argv[2:]:
        if ':' in arg:
            i = arg.index(':')
            s = arg[:i]
            f = arg[i + 1:]
            k += range(int(s), int(f))
        else:
            k.append(int(arg))

    svd.mask(k)
    s = svd.reconstruct()
    c2 = s.resynthesize()
    c2.write('reconstruction.wav')
                  if word not in stop_words]

    words_list = words_list.union(set(query))
    words_list = list(words_list)
    words_list.sort()

    term_freq = []
    query_freq = []
    for word in words_list:
        doc_freq = []
        for i in range(len(docs)):
            doc_freq.append(docs[i].count(word))
        term_freq.append(doc_freq)
        query_freq.append(query.count(word))

    U, S, V = SVD(term_freq)  # use the SVD function created in svd.py

    # dimension approximation
    # for k-dimension approximation change 2 to k
    # here: 2-dimension approximation
    k = 2
    Uk = U[:, 0:k]    # first two columns of U and all rows
    Sk = S[0:k, 0:k]  # sub-matrix with the first two columns and rows
    Vk = V[:, 0:k]    # first two columns of V and all rows; each row is the vector of doc(i)
    SkI = linAlgs.inv(Sk)

    Q = np.array(query_freq)
    Q = np.dot(Q.transpose(), Uk)
    Q = np.dot(Q, SkI)

    print(Q)
    print(Vk)
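# A compact, end-to-end version of the query projection above using plain NumPy
# instead of the custom SVD from svd.py; the term-document counts are invented
# purely for illustration.
import numpy as np

term_freq = np.array([[1., 0., 2.],   # rows: terms, columns: documents
                      [0., 1., 1.],
                      [1., 1., 0.],
                      [0., 0., 1.]])
query_freq = np.array([1., 0., 1., 0.])

U, s, Vt = np.linalg.svd(term_freq, full_matrices=False)
k = 2
Uk = U[:, :k]
Sk = np.diag(s[:k])
Vk = Vt.T[:, :k]                           # each row is a document in the k-dim concept space

Q = query_freq @ Uk @ np.linalg.inv(Sk)    # fold the query into the same space
print(Q)
print(Vk)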