Example #1
    def pca(self):
        # assumes the data matrix is stored on the instance (self.arr)
        svd = SVD(self.arr)
        u, s, vt = svd.svd()
        variances = np.diag(s)**2          # squared singular values
        variances_sum = sum(variances)
        self.explained_var = variances / variances_sum
        self.loadings = u.T
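For reference, a minimal self-contained sketch of the same explained-variance computation with numpy.linalg.svd (random data stands in for the instance's matrix; note that numpy returns the singular values as a 1-D vector, so no np.diag is needed):

import numpy as np

X = np.random.rand(20, 5)
X -= X.mean(axis=0)                    # center columns before PCA
u, s, vt = np.linalg.svd(X, full_matrices=False)
explained_var = s**2 / np.sum(s**2)    # variance ratio per component
print(explained_var)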
Example #2
def worker(fold, n_users, n_items, dataset_dir):
    traFilePath = dataset_dir + 'ratings__' + str(fold + 1) + '_tra.txt'
    trasR = loadSparseR(n_users, n_items, traFilePath)

    print(
        dataset_dir.split('/')[-2] + ':', trasR.shape, trasR.nnz,
        '%.2f' % (trasR.nnz / float(trasR.shape[0])))

    tra_tuple = np.array([(user, item, trasR[user, item])
                          for user, item in np.asarray(trasR.nonzero()).T
                          ])  # triad

    tstFilePath = dataset_dir + 'ratings__' + str(fold + 1) + '_tst.txt'
    tstsR = loadSparseR(n_users, n_items, tstFilePath)
    tst_tuple = np.array([(user, item, tstsR[user, item])
                          for user, item in np.asarray(tstsR.nonzero()).T
                          ])  # triad

    sampler = Sampler(trasR=trasR, negRatio=.0, batch_size=batch_size)
    svd = SVD(n_users, n_items, eval_metrics, range_of_ratings, reg, n_factors,
              batch_size)
    scores = svd.train(fold + 1, tra_tuple, tst_tuple, sampler)

    print('fold=%d:' % fold,
          ','.join(['%s' % eval_metric for eval_metric in eval_metrics]), '=',
          ','.join(['%.6f' % (score) for score in scores]))

    return scores
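worker is shaped for fan-out over folds; a minimal sketch using multiprocessing.Pool (the user/item counts and directory are placeholder values):

from functools import partial
from multiprocessing import Pool

if __name__ == '__main__':
    run = partial(worker, n_users=943, n_items=1682, dataset_dir='data/ml-100k/')
    with Pool() as pool:
        all_scores = pool.map(run, range(5))  # one task per fold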
Example #3
    def __init__(self, data, k=-1, rrank=0, crank=0):
        SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)

        # select all data samples for computing the error:
        # note that this might take very long, adjust self._rset and self._cset
        # for faster computations.
        self._rset = range(self._rows)
        self._cset = range(self._cols)
Example #4
    def __init__(self, data, k=-1, rrank=0, crank=0):
        SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)

        # select all data samples for computing the error:
        # note that this might take very long, adjust self._rset and self._cset
        # for faster computations.
        self._rset = range(self._rows)
        self._cset = range(self._cols)
Example #5
    def __init__(self, data, rrank=0, crank=0, show_progress=True):
        SVD.__init__(self,
                     data,
                     rrank=rrank,
                     crank=crank,
                     show_progress=show_progress)

        # select all data samples for computing the error:
        # note that this might take very long, adjust self._rset and self._cset for
        # faster computations.
        self._rset = range(self._rows)
        self._cset = range(self._cols)
Example #6
    def _update_w(self):
        svd_mdl = SVD(self.data)
        svd_mdl.factorize()

        U, S, V = svd_mdl.U, svd_mdl.S, svd_mdl.V

        # The first left singular vector is nonnegative
        # (abs is only used as values could be all negative)
        self.W[:, 0] = np.sqrt(S[0, 0]) * np.abs(U[:, 0])

        # The first right singular vector is nonnegative
        self.H[0, :] = np.sqrt(S[0, 0]) * np.abs(V[0, :].T)

        for i in range(1, self._num_bases):
            # Form the rank one factor
            Tmp = np.dot(U[:, i:i + 1] * S[i, i], V[i:i + 1, :])

            # zero out the negative elements
            Tmp = np.where(Tmp < 0, 0.0, Tmp)

            # Apply 2nd SVD
            svd_mdl_2 = SVD(Tmp)
            svd_mdl_2.factorize()
            u, s, v = svd_mdl_2.U, svd_mdl_2.S, svd_mdl_2.V

            # The first left singular vector is nonnegative
            self.W[:, i] = np.sqrt(s[0, 0]) * np.abs(u[:, 0])

            # The first right singular vector is nonnegative
            self.H[i, :] = np.sqrt(s[0, 0]) * np.abs(v[0, :].T)
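This appears to follow the NNDSVD initialization scheme for NMF, where each rank-one SVD factor is clipped to its nonnegative part and factorized again. A minimal NumPy sketch of one clipping step (hypothetical data):

import numpy as np

A = np.random.rand(6, 4)
u, s, vt = np.linalg.svd(A, full_matrices=False)
rank_one = s[1] * np.outer(u[:, 1], vt[1, :])     # second rank-one factor
rank_one = np.where(rank_one < 0, 0.0, rank_one)  # keep only the nonnegative part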
Example #7
def main(movies_path, scores_path, db_config_path='../config.ini'):

    movies_reindexed_path = movies_path + "_reindexed"
    scores_reindexed_path = scores_path + "_reindexed"
    Utils.reindex_movies(movies_path, scores_path, movies_reindexed_path,
                         scores_reindexed_path)

    scores = {}

    with open(scores_reindexed_path, 'r') as f:
        for line in f:
            line_split = line.split(',')
            user_id = int(line_split[0])
            movie_id = int(line_split[1])
            score = float(line_split[2])

            scores[user_id, movie_id] = score

    svd = SVD()
    svd.train(scores)
    similarities = svd.find_top_similar_items()

    db_config = ConfigParser()
    db_config.read(db_config_path)
    similarities_dao = SimilaritiesDao({
        "host": db_config.get("mysql", "host"),
        "user": db_config.get("mysql", "user"),
        "passwd": db_config.get("mysql", "passwd"),
        "db": db_config.get("mysql", "db")
    })

    movies = Utils.reindexed_movie_titles(movies_reindexed_path)
    normalized_movies = {}
    for (movie_id, title) in enumerate(movies):
        normalized_movies[Utils.normalize_title(title)] = movie_id

    similarities_dao.store_movies(movies)
    similarities_dao.store_normalized_movies(normalized_movies)
    similarities_dao.store_similarities(similarities)

    similarities_dao.close()
Example #8
    def update_w(self):
        # compute eigenvectors and eigenvalues using SVD
        svd_mdl = SVD(self.data)
        svd_mdl.factorize()

        # argsort sorts in ascending order -> use reverse indexing
        # to access values in descending order
        S = np.diag(svd_mdl.S)
        order = np.argsort(S)[::-1]

        # select only a few eigenvectors ...
        if self._num_bases > 0:
            order = order[:self._num_bases]

        self.W = svd_mdl.U[:, order]
        self.eigenvalues = S[order]
Example #9
    def _update_w(self):
        # compute eigenvectors and eigenvalues using SVD
        svd_mdl = SVD(self.data)
        svd_mdl.factorize()

        # argsort sorts in ascending order -> use reverse indexing
        # to access values in descending order
        S = np.diag(svd_mdl.S)
        order = np.argsort(S)[::-1]

        # select only a few eigenvectors ...
        if self._num_bases > 0:
            order = order[:self._num_bases]

        self.W = svd_mdl.U[:, order]
        self.eigenvalues = S[order]
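With numpy.linalg.svd the singular values already come back in descending order, so the explicit argsort mainly guards against SVD implementations that do not sort. A self-contained sketch of the same basis selection (hypothetical data):

import numpy as np

A = np.random.rand(8, 5)
u, s, vt = np.linalg.svd(A, full_matrices=False)
order = np.argsort(s)[::-1]   # descending by singular value
W = u[:, order[:3]]           # top-3 left singular vectors as the basis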
Example #10
def main(args):
    import optparse
    parser = optparse.OptionParser()
    parser.usage = __doc__
    parser.add_option("-q", "--quiet",
                      action="store_false", dest="verbose", default=True,
                      help="don't print status messages to stdout")
    parser.add_option("-l", "--load",
                      action="store_true", dest="load",
                      help="load from a cache file")
    parser.add_option("-c", "--cache",
                      action="store_true", dest="cache",
                      help="build a cache of data")
    parser.add_option("-f", "--features",
                      action="store", type=int, dest="nFeatures", default=10,
                      help="user nfeatures")
    parser.add_option("-e", "--epochs",
                      action="store", type=int, dest="nepochs", default=10,
                      help="train through nepochs")
    parser.add_option("-s", "--slow",
                      action="store_true", dest="slow",
                      help="use non cython model (probably will be obsolete")
    parser.add_option("-S", "--size",
                      action="store_true", dest="getsize", default=False,
                      help="print np.ndarray sizes and exit")
    parser.add_option("-p", "--nprocs",
                      action="store", dest="nprocs", type=int, default=1,
                      help="run in threaded mode",)
    (options, args) = parser.parse_args()
    if len(args) < 1:
        parser.error("Not enough arguments given")


    if options.load:
        svd = SVD.load(args[0])
    else:
        svd = SVD(args[0], options.nFeatures)

    if options.cache:
        svd.dump()
    elif options.getsize:
        size = svd.getsize()
        print "%d bytes, %dMB" % (size, size / 2.**20)
    else:
        if options.nprocs == 1:
            svd.train_all(options.nepochs)
        else:
            import mpsvd
            mpsvd.runManySVDs(args[0], options.nprocs, options.nepochs)
        svd.validate()
    return 0
Example #11
    def U_calc(self):
        """Generate the U matrix from the intersection of the C and R
        matrices, whose rows and columns were sampled randomly with a
        probability associated with the selection of each row or column."""
        self.ucal = self.R[:, self.c]
        svd = SVD()
        svd.Ucalc(self.ucal, self.ucal.transpose())
        svd.Vcalc(self.ucal, self.ucal.transpose())
        svd.Sigma()

        sigma = svd.sigma

        # invert the nonzero singular values (pseudo-inverse of the diagonal)
        for i in range(0, max(svd.rank_u, svd.rank_v)):
            if sigma[i, i] != 0:
                sigma[i, i] = 1 / sigma[i, i]

        # U = V (sigma+)^2 U^T, the pseudo-inverse construction used in CUR
        self.U = svd.V.transpose() * sigma * sigma * svd.U.transpose()
Example #12
    def calculate(self):
        self.allPredicts = np.zeros((4, self.testSize))

        bias = Bias(self.trainData, self.testData)
        bias.calculateBias()
        answers, predicts = bias.predict()
        self.biasClass = bias
        self.allPredicts[0, :] = predicts
        #print("Bias: %f" % evaluationRMSE(answers, predicts))

        similarity = Similarity(self.trainData, self.testData)
        similarity.calculateBias()
        similarity.calcSimiMatrix()
        answers, predicts = similarity.predict()
        self.similarityClass = similarity
        self.allPredicts[1, :] = predicts
        #print("Similarity: %f" % evaluationRMSE(answers, predicts))

        svd = SVD(self.trainData, self.testData)
        svd.generaterMat()
        svd.calcSVD()
        answers, predicts = svd.predict()
        self.svdClass = svd
        self.allPredicts[2, :] = predicts
        #print("SVD: %f" % evaluationRMSE(answers, predicts))

        matFactory = MatFactory(self.trainData, self.testData)
        matFactory.train(10, 11)
        answers, predicts = matFactory.predict()
        self.matFactoryClass = matFactory
        self.allPredicts[3, :] = predicts
        #print("MatFactory: %f" % evaluationRMSE(answers, predicts))

        with open(predictsFile, 'wb') as pickleFile:
            pickle.dump(self.allPredicts, pickleFile)
Example #13
def load_svd(file):
    """
    Load an SVD file.

    If the file name is a valid path, the function loads that file.
    Otherwise, it tries to get the SVD file from the package. If no SVD
    file can be found, a FileNotFoundError is raised.

    :param file: The name of the file to open to load the SVD
    """
    try:
        if os.path.exists(file):
            svd = SVD(file)
        else:
            svd = SVDText(open_resource(None, file).read())
        svd.parse()
        return svd
    except OSError:
        raise FileNotFoundError(file)
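A hedged usage sketch; the file name is a placeholder:

svd = load_svd('STM32F103.svd')  # falls back to a packaged copy when the path does not exist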
Example #14
        def comp_prob(d, k):
            # compute statistical leverage score
            c = int(np.round(k - k / 5.0))  # column count must be an integer

            svd_mdl = SVD(d, k=c)
            svd_mdl.factorize()

            if scipy.sparse.issparse(self.data):
                A = svd_mdl.V.multiply(svd_mdl.V)
                ## Rule 1
                pcol = np.array(A.sum(axis=0) / k)
            else:
                A = svd_mdl.V[:k, :]**2.0
                ## Rule 1
                pcol = A.sum(axis=0) / k

            #c = k * np.log(k/ (self._eps**2.0))
            #pcol = c * pcol.reshape((-1,1))
            pcol /= np.sum(pcol)
            return pcol
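The leverage scores can be sanity-checked with a plain NumPy SVD: per Rule 1 they are nonnegative and sum to 1 across columns (hypothetical data):

import numpy as np

A = np.random.rand(10, 6)
_, _, vt = np.linalg.svd(A, full_matrices=False)
k = 3
pcol = (vt[:k, :] ** 2).sum(axis=0) / k   # rows of vt are orthonormal, so pcol sums to 1
cols = np.random.choice(A.shape[1], size=2, replace=False, p=pcol)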
Example #15
        def comp_prob(d, k):
            # compute statistical leverage score
            c = int(np.round(k - k / 5.0))

            svd_mdl = SVD(d, k=c)
            svd_mdl.factorize()

            if scipy.sparse.issparse(self.data):
                A = svd_mdl.V.multiply(svd_mdl.V)
                ## Rule 1
                pcol = np.array(A.sum(axis=0) / k)
            else:
                A = svd_mdl.V[:k, :]**2.0
                ## Rule 1
                pcol = A.sum(axis=0) / k

            #c = k * np.log(k / (self._eps**2.0))
            #pcol = c * pcol.reshape((-1, 1))
            pcol /= np.sum(pcol)
            return pcol
Example #16
    def calculate(self):
        self.allPredicts = np.zeros((4, self.testSize))

        bias = Bias(self.trainData, self.testData)
        bias.calculateBias()
        answers, predicts = bias.predict()
        self.biasClass = bias
        self.allPredicts[0, :] = predicts
        #print("Bias: %f" % evaluationRMSE(answers, predicts))

        similarity = Similarity(self.trainData, self.testData)
        similarity.calculateBias()
        similarity.calcSimiMatrix()
        answers, predicts = similarity.predict()
        self.similarityClass = similarity
        self.allPredicts[1, :] = predicts
        #print("Similarity: %f" % evaluationRMSE(answers, predicts))

        svd = SVD(self.trainData, self.testData)
        svd.generaterMat()
        svd.calcSVD()
        answers, predicts = svd.predict()
        self.svdClass = svd
        self.allPredicts[2, :] = predicts
        #print("SVD: %f" % evaluationRMSE(answers, predicts))

        matFactory = MatFactory(self.trainData, self.testData)
        matFactory.train(10, 11)
        answers, predicts = matFactory.predict()
        self.matFactoryClass = matFactory
        self.allPredicts[3, :] = predicts
        #print("MatFactory: %f" % evaluationRMSE(answers, predicts))

        with open(predictsFile, 'wb') as pickleFile:
            pickle.dump(self.allPredicts, pickleFile)
Example #17
    def _update_w(self):
        svd_mdl = SVD(self.data)
        svd_mdl.factorize()

        U, S, V = svd_mdl.U, svd_mdl.S, svd_mdl.V

        # The first left singular vector is nonnegative
        # (abs is only used as values could be all negative)
        self.W[:, 0] = np.sqrt(S[0, 0]) * np.abs(U[:, 0])

        # The first right singular vector is nonnegative
        self.H[0, :] = np.sqrt(S[0, 0]) * np.abs(V[0, :].T)

        for i in range(1, self._num_bases):
            # Form the rank one factor
            Tmp = np.dot(U[:, i:i + 1] * S[i, i], V[i:i + 1, :])

            # zero out the negative elements
            Tmp = np.where(Tmp < 0, 0.0, Tmp)

            # Apply 2nd SVD
            svd_mdl_2 = SVD(Tmp)
            svd_mdl_2.factorize()
            u, s, v = svd_mdl_2.U, svd_mdl_2.S, svd_mdl_2.V

            # The first left singular vector is nonnegative
            self.W[:, i] = np.sqrt(s[0, 0]) * np.abs(u[:, 0])

            # The first right singular vector is nonnegative
            self.H[i, :] = np.sqrt(s[0, 0]) * np.abs(v[0, :].T)
Example #18
def main(movies_path, scores_path, db_config_path='../config.ini'):

    movies_reindexed_path = movies_path + "_reindexed"
    scores_reindexed_path = scores_path + "_reindexed"
    Utils.reindex_movies(movies_path, scores_path, movies_reindexed_path, scores_reindexed_path)

    scores = {}

    with open(scores_reindexed_path, 'r') as f:
        for line in f:
            line_split = line.split(',')
            user_id = int(line_split[0])
            movie_id = int(line_split[1])
            score = float(line_split[2])

            scores[user_id, movie_id] = score

    svd = SVD()
    svd.train(scores)
    similarities = svd.find_top_similar_items()

    db_config = ConfigParser()
    db_config.read(db_config_path)
    similarities_dao = SimilaritiesDao({
        "host": db_config.get("mysql", "host"),
        "user": db_config.get("mysql", "user"),
        "passwd": db_config.get("mysql", "passwd"),
        "db": db_config.get("mysql", "db")
    })

    movies = Utils.reindexed_movie_titles(movies_reindexed_path)
    normalized_movies = {}
    for (movie_id, title) in enumerate(movies):
        normalized_movies[Utils.normalize_title(title)] = movie_id

    similarities_dao.store_movies(movies)
    similarities_dao.store_normalized_movies(normalized_movies)
    similarities_dao.store_similarities(similarities)

    similarities_dao.close()
Example #19
def main(args):
    summarizer = {
        'tfidf': TfIdf(),
        'cluster': Cluster(),
        'svd': SVD(),
        'pagerank': PageRank()
    }[args['alg']]

    summarizer.initialize(args['tf'], args['df'])
    summary = summarizer.summarize(args['doc'])

    for s in summary:
        print(s, end=' ')
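A small variant: the table above instantiates every summarizer before one is chosen; storing the classes and constructing only the selected one avoids the extra work (a sketch assuming the same classes):

summarizer = {
    'tfidf': TfIdf,
    'cluster': Cluster,
    'svd': SVD,
    'pagerank': PageRank
}[args['alg']]()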
Example #20
    def calAll(self):
        self.errs = [0] * 5
        bias = Bias(self.data, self.test)
        bias.calculateBias()
        answers, predicts = bias.predict()
        err = evaluationRMSE(answers, predicts)
        self.errs[0] = err
        print("Bias: %f" % err)

        similarity = Similarity(self.data, self.test)
        similarity.calculateBias()
        similarity.calcSimiMatrix()
        answers, predicts = similarity.predict()
        err = evaluationRMSE(answers, predicts)
        self.errs[1] = err
        print("Similarity: %f" % err)

        svd = SVD(self.data, self.test)
        svd.generaterMat()
        svd.calcSVD()
        answers, predicts = svd.predict()
        err = evaluationRMSE(answers, predicts)
        self.errs[2] = err
        print("SVD: %f" % err)

        matFactory = MatFactory(self.data, self.test)
        matFactory.train(20, 35)
        answers, predicts = matFactory.predict()
        err = evaluationRMSE(answers, predicts)
        self.errs[3] = err
        print("MatFactory: %f" % evaluationRMSE(answers, predicts))

        combination = Combination(self.data)
        combination.separateData()
        combination.calculate()
        combination.train(alpha=0.01, iter=10000)
        answers, predicts = combination.predict(self.test)
        err = evaluationRMSE(answers, predicts)
        self.errs[4] = err
        print("Combination: %f" % err)
        return self.errs
Example #21
def run_single_comparison(df_dict):
    X = []
    GM_Y = []
    SVD_Y = []

    for df in df_dict:
        X.append(df_dict[df].columns.size)
        
        gm_start = time.time()
        Gaussian_Mixture(df_dict[df])
        gm_time = time.time() - gm_start
        GM_Y.append(gm_time)
        #print("\nTime to run GM: on size ", df_dict[df].columns.size, ": ", gm_time, " seconds")

        svd_start = time.time()
        SVD(0, 10, df_dict[df])
        svd_time = time.time() - svd_start
        SVD_Y.append(svd_time)
        #print("Time to run SVD on size ", df_dict[df].columns.size, ": ", svd_time, " seconds\n")

        print(df_dict[df].columns.size, " calculated.")
        
    return X, GM_Y, SVD_Y
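time.time() has limited resolution for short runs; time.perf_counter() is the usual choice for this kind of wall-clock timing. A minimal sketch:

import time

start = time.perf_counter()
sum(range(1_000_000))                  # stand-in for the call being timed
elapsed = time.perf_counter() - start
print('%.4f seconds' % elapsed)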
Example #22
    def on_enter(self):
        # Update user ratings
        global visited_p1, visited_p2
        visited_p2 = 1
        if visited_p1 == 0:
            for x in initializeRatings:
                userRatings.append(float(initializeRatings.get(x)))
            data.iloc[-1] = userRatings
        user_likes = get_likes(data)
        last_user = list(user_likes.keys())[-1]
        u_likes = user_likes[last_user]
        m_likes = []
        recommendations_l = []

        count = 0
        for m in u_likes:
            # stop at 10 movies
            if count == 10:
                break
            # find the movie index number
            m_index = data.columns.get_loc(m)
            print(m_index)
            start = time.time()
            similar_movies = SVD(m_index, 10, data)
            print("Time to calculate SVD: ", time.time() - start, " seconds.")

            m_likes.extend(similar_movies)
            #print(m_likes)
            count += 1
        recommendations_l = [
            movie for movie, count in Counter(m_likes).most_common(15)
        ]

        #return page layout to display
        layout = GridLayout(cols=4)
        new_layout = getlayout(layout, recommendations_l, 'SVD')
        self.add_widget(new_layout)
Example #23
def run_multi_comparison(dataFrames):
    interval = int(40/(len(dataFrames) - 1))
    X = []
    GM_Y = []
    SVD_Y = []
    GM_time = 0
    for i in range(len(dataFrames)):
        if i == 0:
            GM_start = time.time()
            Gaussian_Mixture(dataFrames[i])
            GM_time = time.time() - GM_start
        else:
            GM_Y.append(GM_time)
            X.append(i * interval)
            SVD_start = time.time()

            for j in range(i * interval):
                SVD(j, 10, dataFrames[i])
                
            SVD_time = time.time() - SVD_start
            SVD_Y.append(SVD_time)
        print(i * interval, " calculated.")

    return X, GM_Y, SVD_Y
Example #24
    def calAll(self):
        self.errs = [0] * 5
        bias = Bias(self.data, self.test)
        bias.calculateBias()
        answers, predicts = bias.predict()
        err = evaluationRMSE(answers, predicts)
        self.errs[0] = err
        print("Bias: %f" % err)

        similarity = Similarity(self.data, self.test)
        similarity.calculateBias()
        similarity.calcSimiMatrix()
        answers, predicts = similarity.predict()
        err = evaluationRMSE(answers, predicts)
        self.errs[1] = err
        print("Similarity: %f" % err)

        svd = SVD(self.data, self.test)
        svd.generaterMat()
        svd.calcSVD()
        answers, predicts = svd.predict()
        err = evaluationRMSE(answers, predicts)
        self.errs[2] = err
        print("SVD: %f" % err)

        matFactory = MatFactory(self.data, self.test)
        matFactory.train(20, 35)
        answers, predicts = matFactory.predict()
        err = evaluationRMSE(answers, predicts)
        self.errs[3] = err
        print("MatFactory: %f" % evaluationRMSE(answers, predicts))

        combination = Combination(self.data)
        combination.separateData()
        combination.calculate()
        combination.train(alpha=0.01, iter=10000)
        answers, predicts = combination.predict(self.test)
        err = evaluationRMSE(answers, predicts)
        self.errs[4] = err
        print("Combination: %f" % err)
        return self.errs
Example #25
    parser.add_argument("filename", nargs="?", default="data/news_dataset.csv")
    parser.add_argument("--threads", "-j", default=1, type=int)

    return parser.parse_args()


if __name__ == "__main__":
    getLogger().setLevel(DEBUG)

    try:
        args = parse_arguments()
        if args.threads == 1:
            process = BaseProcess()
            process.run(args.filename)
            decomposition = SVD(process.tfidf, 100)
        else:
            manager = ManagerProcess(args.threads)
            manager.run(args.filename)
            decomposition = SVD(manager.tfidf, 100)

        startconvert = time()
        a = decomposition.create_numpy_matrices()
        a_sparse = decomposition.turn_sparse(a)
        endconvert = time()

        start = time()
        decomposition.calculate_eigenvalues(a_sparse)
        print(decomposition.eigenvaluesMTM)
        print(decomposition.eigenvaluesMMT)
        end = time()
Example #26
import pandas as pd

import dataset as dp
from svd import SVD
from svdpp import SVDpp

print('--------------------Matrix factorization--------------------')
train_data, test_data, data = dp.load_dataset()

table = {}

f = 10
model = SVD(train_data, f)
model.train()
table[f] = [model.test(test_data)]

model = SVDpp(train_data, f)
model.train()
table[f].append(model.test(test_data))

f = 20
model = SVD(train_data, f)
model.train()
table[f] = [model.test(test_data)]

model = SVDpp(train_data, f)
model.train()
table[f].append(model.test(test_data))

f = 50
model = SVD(train_data, f)
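The repetition over factor sizes could be collapsed into a loop; a sketch assuming the same SVD/SVDpp interfaces as above:

table = {}
for f in (10, 20, 50):
    table[f] = []
    for Model in (SVD, SVDpp):
        model = Model(train_data, f)
        model.train()
        table[f].append(model.test(test_data))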
Example #27
    def generate_cur(self, mode):
        '''
        Generate the C, U and R matrices by selecting columns and rows
        based on their probabilities.

        Call this function when using the class for the first time to
        create a CUR decomposition. It automatically updates the C, U, R
        member elements of the CUR object.
        '''
        #Getting the data matrix
        data_matrix = (self.data_matrix).astype(np.float64)
        # data_matrix=np.array([[1,2,3],[4,5,6],[7,8,9],[10,11,12]],dtype=np.float64)

        #Calculating the probabilities for the columns.
        ColumnProb = []
        denominator = np.sum(np.square(data_matrix))
        for c in range(data_matrix.shape[1]):
            ColumnProb.append(
                np.sum(np.square(data_matrix[:, c])) / denominator)
        chosenColumns = np.random.choice(
            data_matrix.shape[1], int(math.floor(0.9 * data_matrix.shape[1])),
            False, ColumnProb)

        C_matrix = np.zeros(shape=(data_matrix.shape[0],
                                   chosenColumns.shape[0]))
        for i, col in enumerate(chosenColumns):
            C_matrix[:, i] = data_matrix[:, col] / math.sqrt(
                chosenColumns.shape[0] * ColumnProb[col])

        RowProb = []
        for r in range(data_matrix.shape[0]):
            RowProb.append(np.sum(np.square(data_matrix[r, :])) / denominator)
        chosenRows = np.random.choice(
            data_matrix.shape[0], int(math.floor(0.9 * data_matrix.shape[0])),
            False, RowProb)

        R_matrix = np.zeros(shape=(chosenRows.shape[0], data_matrix.shape[1]))
        for i, row in enumerate(chosenRows):
            R_matrix[i, :] = data_matrix[row, :] / math.sqrt(
                chosenRows.shape[0] * RowProb[row])

        W = np.zeros(shape=(chosenRows.shape[0], chosenColumns.shape[0]))
        for i in range(chosenRows.shape[0]):
            for j in range(chosenColumns.shape[0]):
                W[i][j] = data_matrix[chosenRows[i]][chosenColumns[j]]

        svd = SVD(None, None, 'no_normalize', 'CUR', W)
        if mode == '90-percent':
            svd._set_90percent_energy_mode()
        # invert only the significant singular values; very small values are
        # left as-is to avoid numerical blow-up
        sigma_inverse = []
        for i in range(svd.sigma_vector.shape[0]):
            if abs(svd.sigma_vector[i]) < 0.1:
                sigma_inverse.append(svd.sigma_vector[i])
            else:
                sigma_inverse.append(1 / svd.sigma_vector[i])
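        # W was factorized above as W = X Z Y^T; its Moore-Penrose
        # pseudo-inverse is W+ = Y (Z+)^2 X^T, which becomes the U matrix
        # of the CUR decomposition below.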
        Zplus = np.diag(sigma_inverse)**2
        Wplus = svd.V_matrix.dot(Zplus.dot(svd.U_matrix.T))

        self.C_matrix = C_matrix
        self.U_matrix = Wplus
        self.R_matrix = R_matrix

        self.reconstructed_matrix = C_matrix.dot(Wplus.dot(R_matrix))
        print("Renormalizing the rating-matrix")
        self.reconstructed_matrix = self.reconstructed_matrix * self.user_var_vec + self.user_mean_vec

        non_zero_mask = self.data_matrix != 0
        diff = (self.data_matrix - self.reconstructed_matrix) * non_zero_mask
        rmse_val = np.mean(np.square(diff))**(0.5)
        print(rmse_val)
Example #28
from load_data import read_csv
from svd import SVD
from sklearn.cluster import MiniBatchKMeans
import pandas as pd


if __name__ == '__main__':
    rec_data = read_csv('../input/user_item_cnt.csv')
    rec = SVD(rec_data)
    rec.fit()

    score = rec.get_score()
    tmp = [dict(v, user_id=user_id)
           for user_id, item_scores in score.items()
           for v in item_scores]
    df = pd.DataFrame(tmp)
    df.head()
    df.to_csv('svd2.csv', index=False)
    """
    model = MiniBatchKMeans(n_clusters=100, random_state=0)
    model.fit(rec.user_matrix)
    pred = model.predict(rec.user_matrix)

    users = [rec_data.map_idx2user[i] for i in range(len(rec_data.map_idx2user))]
    max(users)
    len(rec_data.map_idx2user)

    df = pd.DataFrame({'user_id': users, 'cluster': pred})
    df.to_csv('cluster.csv', index=False)
    """
Example #29
def reduce_dimension_function(option, X_train, new_dim):

    if option == 'pca':
        pca = PCA(n_components=new_dim)
        pca.fit(X_train)
        X_reduced = pca.transform(X_train)
        print(np.shape(X_reduced))
        return X_reduced

    elif option == 'autoencoder':
        autoe = AUTOE()
        autoe.set_data(X_train)
        autoe.shuffle_data()
        # autoe.normalize(-1.0, 1.0)
        autoe.divide_data(0.8)
        autoe.create_autoencoder(new_dim)
        # autoe.normalize() # best results of clustering for interval [0, 1]
        # autoe.standardize()
        autoe.train_autoencoder()
        # autoe.test_autoencoder()
        # autoe.get_activations()
        autoe.sort_activations()

        # autoe.plot_reconstruction(i+1)
        # autoe.save_activations('caract_autoe.csv')
        # autoe.save_activations(filename+'_'+str(i+1)+'.csv')
        # autoe.save_activations('caract_autoe.csv')
        return autoe.get_activations()

    elif option == 'svd':
        svd = SVD()
        svd.set_data(X_train)
        # svd.load_data('dataset.csv')
        svd.shuffle_data()
        # svd.normalize(-1.0,1.0)
        # svd.standardize()
        svd.run_svd(new_dim)
        svd.sort_coefficients()
        # svd.save_activations('caract_'+svd.__class__.__name__.lower()+'60.csv')
        # svd.save_activations(filename+'_'+str(i+1)+'.csv')
        return svd.get_coefficients()

    elif option == 'cp':
        cp = CP()
        cp.set_data(X_train)
        # cp.load_data('dataset.csv')
        cp.shuffle_data()
        # cp.normalize(-1.0, 1.0)
        # cp.standardize()
        cp.execute_cp(new_dim)
        cp.sort_coefficients()
        # cp.save_activations(filename+'_'+str(i+1)+'.csv')
        # cp.save_activations('caract_cp.csv')
        return cp.get_coefficients()

    elif option == 'dct':
        dcost = DCT()
        dcost.set_data(X_train)
        dcost.shuffle_data()
        # dcost.normalize(-1.0, 1.0)
        dcost.execute_dct(new_dim)
        dcost.sort_coefficients()
        # dcost.save_activations(filename+'_'+str(i+1)+'.csv')
        # dcost.save_activations('caract_dct.csv')
        return dcost.get_coefficients()

    elif option == 'dwt':
        dwt = DWT()
        dwt.set_data(X_train)
        dwt.shuffle_data()
        # dwt.normalize(-1,1)
        # dwt.standardize()
        dwt.execute_dwt(new_dim)
        dwt.sort_coefficients()
        return dwt.get_coefficients()

    elif option == 'ipla':
        paa = IPLA()
        paa.set_data(X_train)
        # paa.load_data('dataset.csv')
        paa.shuffle_data()
        # paa.normalize()
        # paa.standardize()
        paa.execute_ipla(new_dim)
        paa.sort_coefficients()
        return paa.get_coefficients()

    elif option == 'paa':
        paa = PAA()
        paa.set_data(X_train)
        # paa.load_data('dataset.csv')
        paa.shuffle_data()
        # paa.normalize(-1, 1)
        # paa.standardize()
        paa.execute_paa(new_dim)
        paa.sort_coefficients()
        return paa.get_coefficients()

    elif option == 'sax':
        sax = SAX()
        sax.set_data(X_train)
        sax.shuffle_data()
        # sax.normalize()
        # sax.standardize()
        sax.execute_sax(new_dim)
        sax.sort_coefficients()

        return sax.get_coefficients()

    else:
        raise ValueError('Invalid option: %s' % option)
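Since the coefficient-based branches all follow the same set_data / shuffle_data / execute / sort_coefficients pattern, the chain could also be driven by a table; a sketch assuming the same classes as above:

REDUCERS = {
    'cp': (CP, 'execute_cp'),
    'dct': (DCT, 'execute_dct'),
    'dwt': (DWT, 'execute_dwt'),
    'ipla': (IPLA, 'execute_ipla'),
    'paa': (PAA, 'execute_paa'),
    'sax': (SAX, 'execute_sax'),
}

def reduce_with_table(option, X_train, new_dim):
    cls, execute = REDUCERS[option]
    model = cls()
    model.set_data(X_train)
    model.shuffle_data()
    getattr(model, execute)(new_dim)   # e.g. execute_dct(new_dim)
    model.sort_coefficients()
    return model.get_coefficients()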
Example #30
#!/usr/bin/python

from clip import *
from svd import SVD
from optparse import OptionParser

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("-s", type="float", dest="start_time", default=0)
    parser.add_option("-f", type="float", dest="end_time", default=10)
    (options, args) = parser.parse_args()

    if len(args) != 2:
        print "Usage: script input.wav output.pickle"
        exit(0)

    c = Clip(args[0], int(options.start_time * 44100),
             int(44100 * options.end_time))
    s = Spectrogram(c)
    svd = SVD(spectrogram=s)
    svd.save(args[1])
Example #31
    def __init__(self, data, k=-1, rrank=0, crank=0):
        SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)
Example #32
    def __init__(self, data, k=-1, rrank=0, crank=0):
        SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)
Example #33
#!/usr/bin/python

from clip import *
from svd import SVD
from optparse import OptionParser

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("-s", type="float", dest="start_time", default=0)
    parser.add_option("-f", type="float", dest="end_time", default=10)
    (options, args) = parser.parse_args()

    if len(args) != 2:
        print("Usage: script input.wav output.pickle")
        exit(0)

    c = Clip(args[0], int(options.start_time * 44100), int(44100 * options.end_time))
    s = Spectrogram(c)
    svd = SVD(spectrogram=s)
    svd.save(args[1])
Example #34
#!/usr/bin/python

from clip import *
from svd import SVD
from optparse import OptionParser
import sys


if __name__ == '__main__':
    svd = SVD(filename=sys.argv[1])
    k = []

    for arg in sys.argv[2:]:
        if ':' in arg:
            i = arg.index(':')
            s = arg[:i]
            f = arg[i+1:]
            k += range(int(s), int(f))
        else:
            k.append(int(arg))

    svd.mask(k)
    s = svd.reconstruct()
    c2 = s.resynthesize()

    c2.write('reconstruction.wav')
Example #35
    if word not in stop_words
]
words_list = words_list.union(set(query))

words_list = list(words_list)
words_list.sort()
term_freq = []
query_freq = []

for word in words_list:
    doc_freq = []
    for i in range(len(docs)):
        doc_freq.append(docs[i].count(word))
    term_freq.append(doc_freq)
    query_freq.append(query.count(word))

U, S, V = SVD(term_freq)  # use the SVD function created in svd.py
# k-dimension approximation (here k = 2; change k for a different rank)
k = 2
Uk = U[:, 0:k]  # first k columns of U, all rows
Sk = S[0:k, 0:k]  # sub-matrix with the first k rows and columns
Vk = V[:, 0:k]  # first k columns of V; each row is the vector of doc(i)
SkI = linAlgs.inv(Sk)
Q = np.array(query_freq)
# project the query into the k-dimensional concept space: q_k = q^T U_k S_k^-1
Q = np.dot(Q.transpose(), Uk)
Q = np.dot(Q, SkI)
print(Q)
print(Vk)
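The usual next step in latent semantic indexing is to rank documents by cosine similarity between the projected query Q and each row of Vk; a minimal sketch:

scores = [np.dot(Q, Vk[i, :]) / (np.linalg.norm(Q) * np.linalg.norm(Vk[i, :]))
          for i in range(Vk.shape[0])]
ranking = np.argsort(scores)[::-1]  # most similar documents first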