import numpy as np
import pylab as pl

from itertools import combinations
from matplotlib.colors import LogNorm
from progressbar import ProgressBar, Bar, ETA
from scipy.stats import scoreatpercentile
from sklearn.cluster import DBSCAN, MiniBatchKMeans
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.mixture import GMM
from tsne import bh_sne  # Barnes-Hut t-SNE

# read_sah_h5(), copula_transform() and rdc() are project-local helpers
# defined elsewhere in this repository.


def main(datafile, normalize, ndims, copula, clusteroutput, subsample):
    X, features = read_sah_h5(datafile)
    I, all_features = read_sah_h5(datafile, just_good=False)
    if 'id' in all_features:
        # The ids live in the unfiltered matrix, not in X
        ids = I[:, all_features.index('id')]
    else:
        ids = np.arange(len(X)).astype(int)

    Xorig = X  # unnormalized copy (unused below)
    if normalize:
        mean = np.average(X, axis=0)
        std = np.std(X, axis=0)
        std[np.nonzero(std == 0.0)] = 1.0  # Avoid NaNs
        X = (X - mean) / std

    # Subsample (with replacement) to keep t-SNE tractable
    idx = np.random.randint(len(X), size=subsample)
    X = X[idx]
    ids = ids[idx]

    if copula:
        X = np.column_stack([copula_transform(x) for x in X.T])

    # I added this for the time/freq clustering
    # to emphasize the frequency feature
    # X[:, 1] *= 1e-3

    Y = bh_sne(X, d=ndims)

    dbscan = DBSCAN(eps=1.75, min_samples=5)
    C = dbscan.fit_predict(Y)

    tree = ExtraTreesClassifier(n_estimators=100)
    tree.fit(X, C)
    for f, i in zip(features, tree.feature_importances_):
        print '%s: %f' % (f, i)

    with open(clusteroutput, 'w+') as f:
        for c, i in zip(C, ids):
            f.write('%d,%d\n' % (i, c))

    pl.scatter(Y[:, 0], Y[:, 1],
               color=pl.cm.spectral(C.astype(float) / np.max(C)))
    for c in np.unique(C):
        pl.bar(0, 0, lw=0, ec='none',
               fc=pl.cm.spectral(float(c) / np.max(C)), label='Cluster %d' % c)
    pl.legend()
    pl.show()
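# copula_transform() is not shown in this listing. Below is a minimal sketch
# of what it plausibly does, assuming the standard one-dimensional empirical
# copula (rank) transform; the body is an assumption inferred from the call
# sites, not the project's actual helper.

from scipy.stats import rankdata

def copula_transform(x):
    # Map each value to its empirical CDF rank in (0, 1]; ties share their
    # average rank, so the output is approximately uniform.
    return rankdata(x) / float(len(x))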
def main(datafile, feature1, feature2, bins, percentile, copula, logscale):
    X, features = read_sah_h5(datafile, just_good=False)
    x = X[:, features.index(feature1)]
    y = X[:, features.index(feature2)]

    if percentile > 0 and not copula:
        # Clip the histogram range to the central (100 - 2*percentile)%
        bx = np.linspace(scoreatpercentile(x, percentile),
                         scoreatpercentile(x, 100 - percentile), bins)
        by = np.linspace(scoreatpercentile(y, percentile),
                         scoreatpercentile(y, 100 - percentile), bins)
        bins = (bx, by)

    if copula:
        x = copula_transform(x)
        y = copula_transform(y)

    if logscale:
        pl.hist2d(x, y, bins=bins, norm=LogNorm())
    else:
        pl.hist2d(x, y, bins=bins)
    pl.xlabel(feature1)
    pl.ylabel(feature2)
    pl.show()
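# read_sah_h5() is also project-local. A rough sketch of the interface the
# scripts above rely on, assuming an HDF5 layout with a 2-D sample matrix
# and a list of column names; the dataset names 'data' and 'features' are
# hypothetical, as is the exact meaning of just_good. The sketch only
# mirrors the observable behavior: just_good=True drops bookkeeping columns
# such as 'id', which is why callers re-read with just_good=False.

import h5py

def read_sah_h5(datafile, just_good=True):
    # Returns (X, features): the sample matrix and its column names.
    with h5py.File(datafile, 'r') as f:
        X = f['data'][:]                   # hypothetical dataset name
        features = list(f['features'][:])  # hypothetical dataset name
    if just_good:
        keep = [i for i, name in enumerate(features) if name != 'id']
        X = X[:, keep]
        features = [features[i] for i in keep]
    return X, features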
def main(datafile, feature1, feature2, normalize, clusteroutput, percentile,
         copula):
    X, features = read_sah_h5(datafile, just_good=False)
    if 'id' not in features:
        ids = np.arange(len(X))
    else:
        ids = X[:, features.index('id')]

    x = X[:, features.index(feature1)]
    y = X[:, features.index(feature2)]
    D = np.column_stack([x, y])

    # Subsample (with replacement) to keep clustering tractable
    idx = np.random.randint(len(X), size=10000)
    D = D[idx]
    ids = ids[idx]

    if normalize:
        mean = np.average(D, axis=0)
        std = np.std(D, axis=0)
        std[np.nonzero(std == 0.0)] = 1.0  # Avoid NaNs
        Dnorm = (D - mean) / std
    elif copula:
        Dnorm = np.column_stack([copula_transform(f) for f in D.T])
    else:
        Dnorm = D

    # Alternative clusterers, kept for experimentation:
    kmeans = MiniBatchKMeans(n_clusters=50)
    gmm = GMM(n_components=200, covariance_type='full', verbose=True)
    # C = gmm.fit_predict(Dnorm)
    dbscan = DBSCAN(eps=100.0, min_samples=1)
    C = dbscan.fit_predict(Dnorm)
    print C

    with open(clusteroutput, 'w+') as f:
        for c, i in zip(C, ids):
            f.write('%d,%d\n' % (i, c))

    pl.scatter(D[:, 0], D[:, 1],
               color=pl.cm.spectral(C.astype(float) / np.max(C)))
    # for c in np.unique(C):
    #     pl.bar(0, 0, lw=0, ec='none',
    #            fc=pl.cm.spectral(float(c) / np.max(C)),
    #            label='Cluster %d' % c)
    # pl.legend(loc='upper left')

    if percentile > 0:
        pl.xlim(scoreatpercentile(x, percentile),
                scoreatpercentile(x, 100 - percentile))
        pl.ylim(scoreatpercentile(y, percentile),
                scoreatpercentile(y, 100 - percentile))

    pl.xlabel(feature1)
    pl.ylabel(feature2)
    pl.show()
def main(datafile, outputfile):
    X, features = read_sah_h5(datafile, just_good=False)

    result = []
    progress = ProgressBar(
        widgets=['Computing dependencies: ', Bar('='), ETA()])
    for f1, f2 in progress(list(combinations(features, 2))):
        x = X[:, features.index(f1)]
        y = X[:, features.index(f2)]
        result.append('%s,%s,%f' % (f1, f2, rdc(x, y, n=5)))

    with open(outputfile, 'w+') as f:
        f.write('\n'.join(result))
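# rdc() is imported from a project-local module. A sketch of the standard
# Randomized Dependence Coefficient (Lopez-Paz et al., 2013), which matches
# the call signature above; the default values for f, k and s are
# assumptions taken from the reference formulation, not this project's code.

from scipy.stats import rankdata

def rdc(x, y, f=np.sin, k=20, s=1/6., n=1):
    # Largest canonical correlation between random nonlinear projections
    # of the copula-transformed inputs. With n > 1, take the median over
    # n repetitions to damp the randomness of the projections.
    if n > 1:
        return np.median([rdc(x, y, f, k, s, 1) for _ in range(n)])

    # Empirical copula transform, plus a constant column for the bias term
    cx = np.column_stack([rankdata(x) / float(len(x)), np.ones(len(x))])
    cy = np.column_stack([rankdata(y) / float(len(y)), np.ones(len(y))])

    # Random linear projections followed by a fixed nonlinearity
    wx = (s / cx.shape[1]) * np.random.randn(cx.shape[1], k)
    wy = (s / cy.shape[1]) * np.random.randn(cy.shape[1], k)
    fx = f(cx.dot(wx))
    fy = f(cy.dot(wy))

    # Canonical correlation analysis on the projected features
    C = np.cov(np.hstack([fx, fy]).T)
    Cxx, Cyy = C[:k, :k], C[k:, k:]
    Cxy, Cyx = C[:k, k:], C[k:, :k]
    eigs = np.linalg.eigvals(
        np.linalg.pinv(Cxx).dot(Cxy).dot(np.linalg.pinv(Cyy)).dot(Cyx))
    eigs = eigs[np.logical_and(eigs.real >= 0, eigs.real <= 1)].real
    return np.sqrt(np.max(eigs)) if len(eigs) else 0.0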
def main(datafile, feature1, bins, percentile, logscale):
    X, features = read_sah_h5(datafile, just_good=False)
    x = X[:, features.index(feature1)]

    if percentile > 0:
        # Clip the histogram range to the central (100 - 2*percentile)%
        bins = np.linspace(scoreatpercentile(x, percentile),
                           scoreatpercentile(x, 100 - percentile), bins)

    pl.hist(x, bins=bins, histtype='step', color='k')
    if logscale:
        pl.yscale('log')
    pl.xlabel(feature1)
    pl.show()
def main(datafile, feature1, feature2, clusterfile, clusterid, bins,
         percentile, copula, logscale):
    X, features = read_sah_h5(datafile, just_good=False)
    x = X[:, features.index(feature1)]
    y = X[:, features.index(feature2)]

    if 'id' in features:
        ids = X[:, features.index('id')]
    else:
        ids = np.arange(len(X)).astype(int)

    # Keep only the rows assigned to the requested cluster
    include = {}
    with open(clusterfile, 'r') as f:
        for line in f:
            i, c = map(float, line.strip().split(','))
            include[i] = (c == clusterid)
    include_mask = np.array([include.get(i, False) for i in ids])
    x = x[include_mask]
    y = y[include_mask]

    if percentile > 0 and not copula:
        bx = np.linspace(scoreatpercentile(x, percentile),
                         scoreatpercentile(x, 100 - percentile), bins)
        by = np.linspace(scoreatpercentile(y, percentile),
                         scoreatpercentile(y, 100 - percentile), bins)
        bins = (bx, by)

    if copula:
        x = copula_transform(x)
        y = copula_transform(y)

    if logscale:
        pl.hist2d(x, y, bins=bins, norm=LogNorm())
    else:
        pl.hist2d(x, y, bins=bins)
    pl.xlabel(feature1)
    pl.ylabel(feature2)
    pl.colorbar()
    pl.show()
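# The main() functions above expect already-parsed arguments. A sketch of
# how the cluster-filtered 2-D histogram above might be wired up as a
# command-line script; the flag names and defaults are assumptions, not the
# project's actual CLI.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='2-D histogram of two features within one cluster')
    parser.add_argument('datafile')
    parser.add_argument('feature1')
    parser.add_argument('feature2')
    parser.add_argument('clusterfile')
    parser.add_argument('clusterid', type=int)
    parser.add_argument('--bins', type=int, default=50)
    parser.add_argument('--percentile', type=float, default=0)
    parser.add_argument('--copula', action='store_true')
    parser.add_argument('--logscale', action='store_true')
    args = parser.parse_args()
    main(args.datafile, args.feature1, args.feature2, args.clusterfile,
         args.clusterid, args.bins, args.percentile, args.copula,
         args.logscale)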