class Topics:
    """Assign clips to clusters by running k-means over per-clip vectors.

    The per-clip vectors come from a ``Metadata`` model; the fitted
    ``KMeans`` estimator and the resulting cluster labels can be persisted
    with :meth:`save` and restored with :meth:`load`.

    Attributes set lazily (not in ``__init__``):
        clips, model: set by :meth:`gatherData`.
        vectors, kmeans: set by :meth:`createClusters` / :meth:`clusterKMeans`.
        topics: set by :meth:`createTopics` or :meth:`load`.
    """

    def __init__(self):
        # NOTE(review): the meaning of the (True, 'lda') arguments is defined
        # by Metadata, which is not visible here -- presumably it selects an
        # LDA-based model; confirm against Metadata.
        self.metadata = Metadata(True, 'lda')
        self.dataProvider = DataProvider()
        # Locations used by save() / load().
        self.topicFile = 'tmp/topics.npy'
        self.kMeansFile = 'tmp/kmeans.pkl.npy'

    def gatherData(self, t):
        """Fetch all clips from the data provider and build the metadata model.

        Args:
            t: passed straight through to ``Metadata.createModel``
                (presumably the number of topics -- TODO confirm).
        """
        self.clips = self.dataProvider.getClips()
        self.model = self.metadata.createModel(self.clips, t)

    def getTopic(self, clip):
        """Return the k-means prediction for a single clip.

        The clip's vector is reshaped to a single-row 2-D array because
        ``predict`` expects one sample per row. Requires ``self.kmeans`` to
        have been set by :meth:`clusterKMeans` or :meth:`load`.

        Returns:
            Whatever ``self.kmeans.predict`` returns for the one-row input
            (for scikit-learn's ``KMeans``, a length-1 array of labels).
        """
        vector = self.metadata.getVectors(clip)
        return self.kmeans.predict(vector.reshape(1, -1))

    def createTopics(self, clips, t, k):
        """Build the model, cluster ``clips`` into ``k`` groups and label them.

        Args:
            clips: iterable of clips to cluster and label. Note that the
                metadata model itself is built from the data provider's
                clips (see :meth:`gatherData`), not from this argument.
            t: forwarded to :meth:`gatherData`.
            k: number of k-means clusters.

        Returns:
            The per-clip predictions concatenated with ``np.hstack``; also
            stored on ``self.topics``.
        """
        self.gatherData(t)
        self.createClusters(clips, k)
        self.topics = np.hstack([self.getTopic(clip) for clip in clips])
        return self.topics

    def save(self):
        """Persist the metadata, the topic labels and the fitted k-means.

        Requires :meth:`createTopics` (or :meth:`load`) to have run first,
        since it reads ``self.topics`` and ``self.kmeans``.
        """
        self.metadata.save()
        np.save(self.topicFile, self.topics)
        joblib.dump(self.kmeans, self.kMeansFile)

    def load(self):
        """Restore state written by :meth:`save`.

        Returns:
            True when the metadata, topic labels and k-means estimator were
            all loaded; False when ``metadata.load()`` reports failure or
            reading either file raises ``IOError``.
        """
        if not self.metadata.load():
            return False
        try:
            self.topics = np.load(self.topicFile)
            self.kmeans = joblib.load(self.kMeansFile)
        except IOError:
            return False
        return True

    # ------------------------------------------------
    # K Means on topic data...

    def createClusters(self, clips, k):
        """Stack the vectors of ``clips`` and fit k-means on them.

        Stores the stacked matrix on ``self.vectors`` and prints the
        inertia returned by :meth:`clusterKMeans`.
        """
        self.vectors = np.vstack(
            [self.metadata.getVectors(clip) for clip in clips]
        )
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print(self.clusterKMeans(self.vectors, k))

    def clusterKMeans(self, data, k):
        """Fit a ``KMeans`` estimator with ``k`` clusters on ``data``.

        The estimator is kept on ``self.kmeans`` for later predictions.

        Returns:
            The fitted estimator's ``inertia_``.
        """
        # `precompute_distances` and `n_jobs` are intentionally not passed:
        # both were removed from scikit-learn's KMeans in 1.0, and the values
        # previously given here ('auto' and 1) were the defaults anyway.
        self.kmeans = KMeans(
            n_clusters=k,
            init='k-means++',
            n_init=10,
            max_iter=300,
            tol=0.0001,
            verbose=1,
            random_state=None,
            copy_x=True,
        )
        self.kmeans.fit(data)
        return self.kmeans.inertia_