import os
import sys
import time

import sklearn.cluster
import sklearn.metrics.pairwise
import sklearn.neighbors

# Sibling modules of this project (not shown in this excerpt); the Cluster
# class is expected to be defined elsewhere in this file as well.
import extractor
import ncd

def predict(self, items):
    features = self.extractor.transform(items)

    # Distances from the new items (X) to the items we trained on (Y)
    Y = ncd.prepare(map(lambda x: x[extractor.FEATURE_LOG], self.features))
    X = ncd.prepare(map(lambda x: x[extractor.FEATURE_LOG], features))
    matrix = sklearn.metrics.pairwise.pairwise_distances(X, Y, metric=ncd.metric, n_jobs=-1)

    result = [ ]

    # TODO: The probability is currently bogus, we could use the distance measurements to fill it in
    for label in self.neighbors.predict(matrix):
        if label == -1:
            result.append((None, 0.0))
        else:
            # TODO: We don't classify noise properly here, we should use eps (above)
            result.append((label, 0.5))
    return result
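# The ncd module used above is not part of this excerpt. The sketch below is a
# rough idea of what its prepare() and metric() helpers could look like, so the
# pairwise_distances() calls make sense; the zlib compressor, the zero padding
# and the exact formula are assumptions, not the project's actual implementation.
import zlib

import numpy

def prepare(texts):
    # pairwise_distances() wants a numeric 2D array, so encode every text as
    # bytes and zero-pad all rows to the width of the longest one.
    encoded = [t.encode("utf-8") if isinstance(t, str) else bytes(t) for t in texts]
    width = max(len(e) for e in encoded)
    rows = numpy.zeros((len(encoded), width), dtype=numpy.uint8)
    for i, e in enumerate(encoded):
        rows[i, :len(e)] = numpy.frombuffer(e, dtype=numpy.uint8)
    return rows

def metric(a, b):
    # Normalized compression distance between two rows prepared above:
    #   NCD(x, y) = (C(x + y) - min(C(x), C(y))) / max(C(x), C(y))
    # where C() is the compressed length. Stripping trailing zero bytes undoes
    # the padding (and, in this simplified sketch, any real trailing NULs too).
    x = a.astype(numpy.uint8).tobytes().rstrip(b"\0")
    y = b.astype(numpy.uint8).tobytes().rstrip(b"\0")
    cx = len(zlib.compress(x))
    cy = len(zlib.compress(y))
    cxy = len(zlib.compress(x + y))
    return (cxy - min(cx, cy)) / max(cx, cy)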
def train(self, items):
    self.clusters = { }
    self.noise = [ ]

    items = list(items)
    if self.verbose:
        sys.stderr.write("{0}: Items to train\n".format(len(items)))

    # Extract the features we want to use for clustering from the items
    self.extractor = extractor.Extractor()
    self.features = self.extractor.fit_transform(items)

    jobs = os.cpu_count() or -1
    start = time.perf_counter()

    # Initialize the NCD code with our log feature. Currently only
    # one feature is used: the normalized log
    X = ncd.prepare(map(lambda features: features[extractor.FEATURE_LOG], self.features))

    # Calculate all the pairwise distances between the items in question.
    # The scikit-learn DBSCAN implementation does this anyway, poorly. So why not
    # do it ahead of time and parallelize it ... which we do here. Then we
    # hand the precomputed matrix straight to DBSCAN below.
    #
    # TODO: This takes forever and is an O(n^2) operation.
    # There is significant room for improvement both here and in the following
    # DBSCAN usage and implementation. Techniques such as feature/item selection,
    # BIRCH, ball trees, or many other things could make this better/faster.
    matrix = sklearn.metrics.pairwise.pairwise_distances(X, metric=ncd.metric, n_jobs=jobs)

    if self.verbose:
        sys.stderr.write("{0}: Computed distances in {1} seconds on {2} cores\n".format(
            int((len(self.features) * len(self.features)) / 2),
            int(time.perf_counter() - start), jobs
        ))

    # Actually perform the clustering. This is fast compared to the above.
    # DBSCAN expects an integer min_samples of at least one.
    min_samples = max(1, int(min(self.min_samples, len(self.features) / 10)))
    dbs = sklearn.cluster.DBSCAN(metric='precomputed', eps=self.eps, min_samples=min_samples)
    dbs.fit(matrix)
    labels = dbs.labels_

    # Create clusters of all the items
    clusters = { }
    noise = [ ]
    for i, label in enumerate(labels):
        if label == -1:
            noise.append(i)
        else:
            if label not in clusters:
                clusters[label] = [ ]
            clusters[label].append(i)
    self.clusters = { }
    for label, indexes in clusters.items():
        self.clusters[label] = Cluster(label, indexes)
    self.noise = Cluster(None, noise)

    # Print out a rough description of the clustering
    if self.verbose:
        sys.stderr.write("{0}: Clusters ({1} items, {2} noise)\n".format(
            len(self.clusters.keys()),
            len(self.features) - len(noise),
            len(noise)
        ))

    # Set up our neighbors classifier for predict()
    self.neighbors = sklearn.neighbors.KNeighborsClassifier(metric='precomputed', weights='distance')
    self.neighbors.fit(matrix, labels)
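# A rough usage sketch for train() and predict() above. The surrounding class
# is not shown in this excerpt, so the LogClassifier name and its eps,
# min_samples and verbose constructor arguments are assumptions; the items are
# assumed to be raw log texts that extractor.Extractor() knows how to transform.
if __name__ == '__main__':
    training_logs = [
        "error: connection refused",
        "error: connection refused by peer",
        "warning: disk is almost full",
        "warning: disk usage at 95%",
    ]
    classifier = LogClassifier(eps=0.3, min_samples=2, verbose=True)  # hypothetical class
    classifier.train(training_logs)

    # predict() returns one (label, probability) tuple per item; the label is
    # None for items that look like noise
    for label, probability in classifier.predict(["error: connection reset by peer"]):
        print("cluster:", label, "probability:", probability)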