Code Example #1
File: svd.py  Project: victor-axelsson/SimAPI
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.sparse import lil_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.utils.extmath import randomized_svd

# Project-local helpers from SimAPI; the module paths for Wordlist and Cache
# are assumptions, only the names used below are known from the code.
import textFile
import logger
from wordlist import Wordlist
from cache import Cache


class SVD:
    def __init__(self, dataFolder, sessionName):
        self.dataFolder = dataFolder
        self.textFileReader = textFile.TextFile()
        self.wordList = Wordlist()
        self.sessionName = sessionName
        self.logger = logger.Logger()
        self.cache = Cache()
        self.logger.log("Starting a new SVD Session: " + sessionName)

    def getOneHotMatrix(self, data, wordlist):
        nr_cols = len(wordlist)
        self.logger.startLongRunningLog(len(data), 10000)
        counter = 0
        matrix = lil_matrix((len(data), len(wordlist)))
        for session in data:
            row = np.zeros(nr_cols)
            for device in session:
                row[wordlist[device]] += 1

            matrix[counter] = row
            counter += 1
            self.logger.log("")
        self.logger.endLongRunningLog()
        print("matrix: {} r1: {} c1: {}".format(matrix.shape, matrix[0],
                                                matrix[0, 0]))
        return matrix

    def getOneHotSquareMatrixSession(self, data, wordlist):
        # Symmetric item-item co-occurrence matrix: count how often two
        # items appear together within the same session.
        matrix = lil_matrix((len(wordlist), len(wordlist)))

        for dataRow in data:
            sessionItems = []
            for item in dataRow:
                for sessionItem in sessionItems:
                    matrix[wordlist[sessionItem], wordlist[item]] += 1
                    matrix[wordlist[item], wordlist[sessionItem]] += 1

                sessionItems.append(item)

        return matrix

    def getOneHotMatrixFromGraph(self, data, keys):
        size = len(keys)
        matrix = lil_matrix((size, size))
        self.logger.startLongRunningLog(size, 100)

        counter = 0
        for i in range(size):
            row = np.zeros(size)
            for j in range(size):
                if keys[i] in data and keys[j] in data[keys[i]]:
                    row[j] = data[keys[i]][keys[j]]['weight']

            ## Instead of zero add avg as default value ##
            '''
			rowWithMean = np.full(size, np.mean(row))
			for j in range(size):
				if keys[i] in data and keys[j] in data[keys[i]]:
					rowWithMean[j] = data[keys[i]][keys[j]]['weight'] 

			matrix[counter] = rowWithMean
			'''
            matrix[counter] = row
            counter += 1
            self.logger.log("Building graph matrix...")

        self.logger.endLongRunningLog()
        return matrix

    def createOneHotFromName(self, items):
        row = np.zeros(len(self.wl))
        for item in items:
            row[self.wl[item]] += 1
        return row

    def getConceptFromOneHot(self, vector):
        print(self.vt.shape)
        print(self.v.shape)
        print(vector.reshape((1, -1)).shape)

        return np.inner(vector.reshape((1, -1)), self.vt)
        #return np.dot(vector.reshape((1, -1)), self.v)

    def cosineSimilarity(self, v1, v2, n_cols=None, applySigma=True):
        if applySigma:
            return cosine_similarity([np.multiply(v1, self.sigma)],
                                     [np.multiply(v2, self.sigma)])[0][0]
        else:
            return cosine_similarity([v1], [v2])[0][0]

    def exportSimilarityUnidirectionalSquareMatrix(self, filepath):
        size = len(self.u)

        self.logger("Exporting nodes...")
        with open(filepath + "/nodes.csv", "w") as text_file:
            text_file.write("Id,Label\n")
            for i in range(size):
                text_file.write("{},{}\n".format(i, self.reversedWl[i]))

        self.logger.startLongRunningLog(size * (size - 1) // 2, 10000)
        with open(filepath + "/edges.csv", "w") as text_file:
            text_file.write("Source,Target,Weight\n")
            for i in range(size):
                concepts_i = self.u[i]
                for j in range(i + 1, size):
                    concepts_j = self.u[j]
                    sim = self.cosineSimilarity(concepts_i, concepts_j)

                    if sim > 0.5:
                        text_file.write("{},{},{}\n".format(i, j, sim))
                    self.logger.log("Saving edges...")

    def getMostSimilarInU(self, concepts, threshold=0.9):
        best = 0
        bestRow = None
        session = -1
        counter = 0
        items = []

        uWithSigma = np.matmul(self.u, np.diag(self.sigma))
        similarities = cosine_similarity([np.multiply(concepts, self.sigma)],
                                         uWithSigma)[0]
        for similarity in similarities:
            if similarity > best:
                best = similarity
                bestRow = counter
                session = self.reversedWl[counter]

            if similarity > threshold:
                #print("{} SIM:{}".format(self.reversedWl[counter], similarity))

                items.append({
                    'item': self.reversedWl[counter],
                    'score': similarity
                })

            counter += 1
        return session, bestRow, best, items

    def getHighestIndex(self, vector, nr):
        highest = []
        for i in range(len(vector)):
            highest.append({'index': i, 'val': (vector[i] * self.sigma[i])})

        newlist = sorted(highest, key=lambda k: k['val'], reverse=True)
        indexes = []
        for i in range(nr):
            indexes.append(newlist[i]['index'])

        return indexes

    def selectMaxFromVFromColumns(self, columns):
        best = -1
        bestItem = None
        counter = 0

        for item in self.v:
            score = 0
            for c in columns:
                score += item[c] * self.sigma[c]

            if score > best:
                best = score
                bestItem = self.reversedWl[counter]

            counter += 1

        return bestItem, best

    def getWordlist(self):
        return self.wl

    def getSigma(self):
        return self.sigma.tolist()

    def visualize(self):

        x1 = self.u[:, 0]
        y1 = self.u[:, 1]
        z1 = self.u[:, 2]

        fig = plt.figure()
        ax = Axes3D(fig)
        ax.scatter(x1, y1, z1)
        plt.show()
        '''
		x1 = np.multiply(self.u[:,0], self.sigma[0])
		y1 = np.multiply(self.u[:,1], self.sigma[1])
		z1 = np.multiply(self.u[:,2], self.sigma[2])

		x2 = np.multiply(self.v[:,0], self.sigma[0])
		y2 = np.multiply(self.v[:,1], self.sigma[1])
		z2 = np.multiply(self.v[:,2], self.sigma[2])
		'''
        '''
		x1 = self.u[:,0]
		y1 = self.u[:,1]
		z1 = self.u[:,2]

		
		x2 = self.v[:,0]
		y2 = self.v[:,1]
		z2 = self.v[:,2]
		
		fig = plt.figure()
		ax = Axes3D(fig)
		ax.scatter(x1, y1, z1)
		ax.scatter(x2, y2, z2)
		plt.show()
		'''
        '''
		y = self.sigma
		x = range(len(self.sigma))
		plt.plot(x, y)
		plt.show()
		'''

    def getItemFromV(self, itemKey):
        if itemKey in self.wl:
            index = self.wl[itemKey]
            return np.multiply(self.v[index], self.sigma).tolist(), True
        else:
            return None, False

    def runSvdOnJsonGraph(self, n_components, n_iter):
        self.logger.log("Loading data...")
        self.data = self.textFileReader.read_json(self.dataFolder)
        self.logger.log("Generating wordlist...")
        items, wl, reversedWl = self.wordList.getWordlistFromGraph(self.data)
        self.wl = wl
        self.reversedWl = reversedWl
        matrix, wasCached = self.cache.lazyCache(
            self.sessionName + "device_device_adjecency.matrix",
            self.getOneHotMatrixFromGraph, {
                'data': self.data,
                'keys': items
            })
        if wasCached:
            self.logger.log("Loaded matrix from cache")

        self._runSvd(matrix, n_components, n_iter)

    def _runSvd(self, matrix, n_components, n_iter):

        #Load the factorization from disk
        self.u, uWasCached = self.cache.loadNPIfExists(self.sessionName +
                                                       "u.bin")
        self.sigma, sigmaWasCached = self.cache.loadNPIfExists(
            self.sessionName + "sigma.bin")
        self.vt, vtWasCached = self.cache.loadNPIfExists(self.sessionName +
                                                         "vt.bin")
        self.v, vWasCached = self.cache.loadNPIfExists(self.sessionName +
                                                       "v.bin")

        #If any of the cached matrices was missing
        if not (uWasCached and sigmaWasCached and vtWasCached and vWasCached):
            self.logger.log(
                "Factorization was not present, calculating... (Might take a while)"
            )
            self.logger.log(
                "Fitting the randomized_svd with {} iterations and {} components"
                .format(n_iter, n_components))
            U, Sigma, VT = randomized_svd(matrix,
                                          n_components=n_components,
                                          n_iter=n_iter)

            self.sigma = Sigma
            self.u = U
            self.vt = VT
            self.v = np.transpose(self.vt)

            self.u.dump(self.sessionName + "u.bin")
            self.sigma.dump(self.sessionName + "sigma.bin")
            self.vt.dump(self.sessionName + "vt.bin")
            self.v.dump(self.sessionName + "v.bin")
        else:
            self.logger("Loaded factorization from disk")

        self.logger.log("matrix: {}".format(matrix.shape))
        self.logger.log("vt: {}".format(self.vt.shape))
        self.logger.log("sigma: {}".format(self.sigma.shape))

        self.logger.log("u: {}".format(self.u.shape))
        self.logger.log("v: {}".format(self.v.shape))

    def runSvdOnCsv(self, n_components, n_iter):
        self.logger.log("Loading data...")
        self.data = self.textFileReader.read_folder(self.dataFolder)
        self.logger.log("Generating wordlist...")
        items, wl, reversedWl = self.wordList.getWordlist(self.data)
        self.wl = wl
        self.reversedWl = reversedWl
        matrix, wasCached = self.cache.lazyCache(
            self.sessionName + "session_device_adjecency.matrix",
            self.getOneHotSquareMatrixSession, {
                'data': self.data,
                'wordlist': wl
            })
        if wasCached:
            self.logger.log("Loaded matrix from cache")

        self._runSvd(matrix, n_components, n_iter)

    def runSvdOnCsvMatrix(self, n_components, n_iter):
        self.logger.log("Loading data...")
        data = self.textFileReader.read_folder(self.dataFolder, '*.csv')
        # Skip the header row, drop the first column and scale each
        # remaining row by its maximum value before factorizing.
        m = np.matrix(data[1:], dtype=np.float32)
        m = np.delete(m, [0], axis=1)
        matrix = normalize(m, axis=1, norm='max')
        U, Sigma, VT = randomized_svd(matrix,
                                      n_components=n_components,
                                      n_iter=n_iter)

        self.U = U
        self.Sigma = Sigma
        self.VT = VT
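
Below is a minimal usage sketch (not part of the project) showing how the SVD class above might be driven end to end. The data folder path, session name and item keys are placeholders, and the project-local helper modules are assumed to be importable.

# Hypothetical usage of the SVD class above; paths, session name and
# item keys are placeholders.
svd = SVD(dataFolder="data/graph", sessionName="demo_")
svd.runSvdOnJsonGraph(n_components=50, n_iter=10)

# Encode a set of items as a one-hot row, project it into concept space,
# and look up the most similar rows of U.
one_hot = svd.createOneHotFromName(["deviceA", "deviceB"])
concepts = svd.getConceptFromOneHot(one_hot)
session, bestRow, best, items = svd.getMostSimilarInU(concepts[0], threshold=0.9)
print(session, best, items)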
Code Example #2
# Project-local helper imports; module paths are assumed, as in the example above.
import textFile
from cache import Cache


class Distribution:
    dataFolder = None
    textFileReader = None
    cache = None
    wordlist = None

    def __init__(self, dataFolder):
        self.dataFolder = dataFolder
        self.textFileReader = textFile.TextFile()
        self.cache = Cache()
        self.calcDistribution()

    def _readDataFromDisk(self):
        return self.textFileReader.read_folder(self.dataFolder)

    def _readSortedList(self, data):
        items = {}
        for row in data:
            for col in row:
                if col not in items:
                    items[col] = 0
                items[col] += 1

        itemList = []
        for item in items.keys():
            itemList.append({'item': item, 'val': items[item]})

        return sorted(itemList, key=lambda k: k['val'], reverse=True)

    def _readWordList(self):
        wordlist = {}
        for i in range(len(self.sortedList)):
            item = self.sortedList[i]
            wordlist[item['item']] = i

        return wordlist

    def calcDistribution(self):
        data, wasCached = self.cache.lazyCache("distribution.pkl",
                                               self._readDataFromDisk, {})
        if wasCached:
            print("Loaded distribution from cache")

        self.sortedList, sortedWasCached = self.cache.lazyCache(
            "sortedDist.pkl", self._readSortedList, {'data': data})
        if sortedWasCached:
            print("Loaded sortedList from cache")
        self.wordlist, wordListWasCached = self.cache.lazyCache(
            "wordListDist.pkl", self._readWordList, {})
        if wordListWasCached:
            print("Loaded wordlist from cache")

    def getDistribution(self):
        return self.sortedList, self.wordlist
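
Below is a minimal usage sketch (not part of the project) for the Distribution class; the data folder path is a placeholder and the same project-local helpers are assumed.

# Hypothetical usage of the Distribution class above; the folder path is a placeholder.
dist = Distribution("data/sessions")
sortedList, wordlist = dist.getDistribution()

# sortedList is ordered by descending item frequency; wordlist maps each
# item key to its rank in that ordering.
print(sortedList[0])
print(len(wordlist))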