from itertools import combinations

import numpy as np
import pylab as pl
from matplotlib.colors import LogNorm
from progressbar import Bar, ETA, ProgressBar
from scipy.stats import scoreatpercentile
from sklearn.cluster import DBSCAN, MiniBatchKMeans
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.mixture import GMM
from tsne import bh_sne  # Barnes-Hut t-SNE

# read_sah_h5 is the project's local HDF5 reader and is not defined in this
# collection; it returns a data matrix and the list of its column names.


def main(datafile, normalize, ndims, copula, clusteroutput, subsample):
    X, features = read_sah_h5(datafile)
    I, all_features = read_sah_h5(datafile, just_good=False)
    if 'id' in all_features:
        # the 'id' column lives in the full (unfiltered) table
        ids = I[:, all_features.index('id')]
    else:
        ids = np.arange(len(X)).astype(int)

    Xorig = X  # keep a reference to the raw (un-normalized) values
    if normalize:
        mean = np.average(X, axis=0)
        std = np.std(X, axis=0)
        std[np.nonzero(std == 0.0)] = 1.0  # Avoid NaNs
        X = (X - mean) / std

    # random subsample (with replacement) so t-SNE and DBSCAN stay tractable
    idx = np.random.randint(len(X), size=subsample)

    X = X[idx]
    ids = ids[idx]

    if copula:
        X = np.column_stack([copula_transform(x) for x in X.T])

    # I added this for the time/freq clustering
    # to emphasize the frequency feature
    # X[:, 1] *= 1e-3

    Y = bh_sne(X, d=ndims)

    dbscan = DBSCAN(eps=1.75, min_samples=5)
    C = dbscan.fit_predict(Y)

    # Fit a throwaway classifier to rank which input features explain the clusters
    tree = ExtraTreesClassifier(n_estimators=100)
    tree.fit(X, C)
    for f, i in zip(features, tree.feature_importances_):
        print('%s: %f' % (f, i))

    with open(clusteroutput, 'w+') as f:
        for c, i in zip(C, ids):
            f.write('%d,%d\n' % (i, c))

    pl.scatter(Y[:, 0],
               Y[:, 1],
               color=pl.cm.spectral(C.astype(float) / np.max(C)))

    for c in np.unique(C):
        pl.bar(0,
               0,
               lw=0,
               ec='none',
               fc=pl.cm.spectral(float(c) / np.max(C)),
               label='Cluster %d' % c)
    pl.legend()

    pl.show()
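# `copula_transform`, used by several of these scripts, is not included in
# this collection. A minimal sketch, assuming it is the usual empirical-CDF
# (rank) transform that maps a 1-d feature onto roughly uniform marginals:
def copula_transform(x):
    """Rank-transform a 1-d array onto (0, 1]."""
    ranks = np.argsort(np.argsort(x)) + 1.0  # 1-based ranks; ties broken by position
    return ranks / len(x)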
def main(datafile, feature1, feature2, bins, percentile, copula, logscale):
    X, features = read_sah_h5(datafile, just_good=False)
    x = X[:, features.index(feature1)]
    y = X[:, features.index(feature2)]

    if percentile > 0 and not copula:
        bx = np.linspace(
            scoreatpercentile(x, percentile),
            scoreatpercentile(x, 100-percentile),
            bins)
        by = np.linspace(
            scoreatpercentile(y, percentile),
            scoreatpercentile(y, 100-percentile),
            bins)
        bins = (bx, by)

    if copula:
        x = copula_transform(x)
        y = copula_transform(y)

    if logscale:
        pl.hist2d(x, y, bins=bins, norm=LogNorm())
    else:
        pl.hist2d(x, y, bins=bins)
    pl.xlabel(feature1)
    pl.ylabel(feature2)
    pl.show()
def main(datafile, feature1, feature2, normalize, clusteroutput, percentile,
         copula):
    X, features = read_sah_h5(datafile, just_good=False)
    if 'id' not in features:
        ids = np.arange(len(X))
    else:
        ids = X[:, features.index('id')]
    x = X[:, features.index(feature1)]
    y = X[:, features.index(feature2)]
    D = np.column_stack([x, y])

    idx = np.random.randint(len(X), size=10000)

    D = D[idx]
    ids = ids[idx]

    if normalize:
        mean = np.average(D, axis=0)
        std = np.std(D, axis=0)
        std[np.nonzero(std == 0.0)] = 1.0  # Avoid NaNs
        Dnorm = (D - mean) / std
    elif copula:
        Dnorm = np.column_stack([copula_transform(f) for f in D.T])
    else:
        Dnorm = D

    # Alternative clusterers kept from earlier experiments; only DBSCAN runs below.
    kmeans = MiniBatchKMeans(n_clusters=50)
    gmm = GMM(n_components=200, covariance_type='full', verbose=True)
    # C = gmm.fit_predict(Dnorm)
    dbscan = DBSCAN(eps=100.0, min_samples=1)
    C = dbscan.fit_predict(Dnorm)
    print(C)

    with open(clusteroutput, 'w+') as f:
        for c, i in zip(C, ids):
            f.write('%d,%d\n' % (i, c))

    pl.scatter(D[:, 0],
               D[:, 1],
               color=pl.cm.spectral(C.astype(float) / np.max(C)))

    # for c in np.unique(C):
    #     pl.bar(0, 0, lw=0, ec='none',
    #            fc=pl.cm.spectral(float(c) / np.max(C)), label='Cluster %d' % c)
    # pl.legend(loc='upper left')

    if percentile > 0:
        pl.xlim(scoreatpercentile(x, percentile),
                scoreatpercentile(x, 100 - percentile))
        pl.ylim(scoreatpercentile(y, percentile),
                scoreatpercentile(y, 100 - percentile))

    pl.xlabel(feature1)
    pl.ylabel(feature2)
    pl.show()
def main(datafile, outputfile):
    X, features = read_sah_h5(datafile, just_good=False)

    result = []
    progress = ProgressBar(widgets=['Computing dependencies: ', Bar('='), ETA()])
    for f1, f2 in progress(list(combinations(features, 2))):
        x = X[:, features.index(f1)]
        y = X[:, features.index(f2)]
        result.append('%s,%s,%f' % (f1, f2, rdc(x, y, n=5)))

    with open(outputfile, 'w+') as f:
        f.write('\n'.join(result))
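# `rdc` as called above is also not part of this collection. A compact sketch
# of the randomized dependence coefficient (Lopez-Paz et al., 2013), assuming
# the `n` argument means "repeat with fresh random projections and take the
# median"; the scaling constants here are illustrative, not the project's own.
from scipy.stats import rankdata


def rdc(x, y, f=np.sin, k=20, s=1.0 / 6.0, n=1):
    """Randomized dependence coefficient between two 1-d samples."""
    if n > 1:
        return np.median([rdc(x, y, f, k, s, 1) for _ in range(n)])

    # copula (rank) transform of each variable, plus a bias column
    cx = np.column_stack([rankdata(x) / len(x), np.ones(len(x))])
    cy = np.column_stack([rankdata(y) / len(y), np.ones(len(y))])

    # random linear projections followed by a sine nonlinearity
    fx = f(cx.dot((s / 2.0) * np.random.randn(2, k)))
    fy = f(cy.dot((s / 2.0) * np.random.randn(2, k)))

    # largest canonical correlation between the two sets of projected features
    C = np.cov(np.hstack([fx, fy]).T)
    Cxx, Cyy = C[:k, :k], C[k:, k:]
    Cxy, Cyx = C[:k, k:], C[k:, :k]
    eigs = np.linalg.eigvals(
        np.linalg.pinv(Cxx).dot(Cxy).dot(np.linalg.pinv(Cyy)).dot(Cyx))
    return np.sqrt(np.max(np.clip(np.real(eigs), 0.0, 1.0)))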
def main(datafile, feature1, bins, percentile, logscale):
    X, features = read_sah_h5(datafile, just_good=False)
    x = X[:, features.index(feature1)]

    if percentile > 0:
        bins = np.linspace(
            scoreatpercentile(x, percentile),
            scoreatpercentile(x, 100-percentile),
            bins)

    pl.hist(x, bins=bins, histtype='step', color='k')

    if logscale:
        pl.yscale('log')
    pl.xlabel(feature1)
    pl.show()
def main(datafile, feature1, feature2, clusterfile, clusterid,
         bins, percentile, copula, logscale):
    X, features = read_sah_h5(datafile, just_good=False)
    x = X[:, features.index(feature1)]
    y = X[:, features.index(feature2)]
    if 'id' in features:
        ids = X[:, features.index('id')]
    else:
        ids = np.arange(len(X)).astype(int)

    include = {}
    with open(clusterfile, 'r') as f:
        for line in f:
            i, c = map(float, line.strip().split(','))
            include[i] = (c == clusterid)

    include_mask = np.array([include.get(i, False) for i in ids])
    x = x[include_mask]
    y = y[include_mask]

    if percentile > 0 and not copula:
        bx = np.linspace(
            scoreatpercentile(x, percentile),
            scoreatpercentile(x, 100-percentile),
            bins)
        by = np.linspace(
            scoreatpercentile(y, percentile),
            scoreatpercentile(y, 100-percentile),
            bins)
        bins = (bx, by)

    if copula:
        x = copula_transform(x)
        y = copula_transform(y)

    if logscale:
        pl.hist2d(x, y, bins=bins, norm=LogNorm())
    else:
        pl.hist2d(x, y, bins=bins)
    pl.xlabel(feature1)
    pl.ylabel(feature2)
    pl.colorbar()
    pl.show()
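# For reference, the cluster file consumed by the function above is the
# "id,cluster" CSV written by the clustering scripts earlier in this
# collection, one row per sampled point, e.g.:
#
#     1234,0
#     1235,2
#     1236,-1
#
# (DBSCAN labels noise points as -1, so a clusterid of -1 selects the noise.)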