Ejemplo n.º 1
0
def kmeans_doit(self, csvFilename, csvPathname, timeoutSecs=30):
    print "\nStarting KMeans of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10)
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    cols = ",".join(map(str,range(11)))
    kwargs = {
        'k': 1, 
        'epsilon': 1e-6,
        'cols': cols, 
        'destination_key': 'KMeansModel.hex'
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
    h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)


    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
    KMeansModel = inspect['KMeansModel']
    clusters = KMeansModel['clusters'][0]
    print "clusters:", h2o.dump_json(clusters)
    
    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
Ejemplo n.º 2
0
def kmeans_doit(self, csvFilename, csvPathname, timeoutSecs=30):
    print "\nStarting KMeans of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                 key2=csvFilename + ".hex",
                                 timeoutSecs=10)
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    cols = ",".join(map(str, range(11)))
    kwargs = {
        'k': 1,
        'epsilon': 1e-6,
        'cols': cols,
        'destination_key': 'KMeansModel.hex'
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
    h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
    inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
    ### print h2o.dump_json(inspect)

    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    KMeansModel = inspect['KMeansModel']
    clusters = KMeansModel['clusters'][0]
    print "clusters:", h2o.dump_json(clusters)

    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
Ejemplo n.º 3
0
def kmeans_doit(self, csvFilename, bucket, csvPathname, num_rows, timeoutSecs=30):
    print "\nStarting KMeans of", csvFilename
    parseResult = h2i.import_parse(
        bucket=bucket, path=csvPathname, schema="put", hex_key=csvFilename + ".hex", timeoutSecs=10
    )
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    kwargs = {
        "k": 1,
        "initialization": "Furthest",
        "destination_key": "KMeansModel.hex",
        "max_iter": 25,
        # reuse the same seed, to get deterministic results (otherwise sometimes fails
        "seed": 265211114317615310,
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeans(
        parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs
    )
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
        (elapsed / timeoutSecs) * 100
    )

    (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, "d", **kwargs)

    expected = [
        (
            [
                -0.0006628900000000158,
                -0.0004671200060434639,
                0.0009330300069879741,
                0.0007883800000000272,
                0.0007548200000000111,
                0.0005617899864856153,
                0.0013246499999999897,
                0.0004036299999999859,
                -0.0014307100000000314,
                0.0021324000161308796,
                0.00154,
            ],
            num_rows,
            None,
        )
    ]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)

    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    inspect = h2o_cmd.runInspect(None, key=kmeans["destination_key"])
    KMeansModel = inspect["KMeansModel"]
    clusters = KMeansModel["centers"][0]
    print "clusters:", h2o.dump_json(clusters)

    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
Ejemplo n.º 4
0
def kmeans_doit(self, csvFilename, csvPathname, num_rows, timeoutSecs=30):
    print "\nStarting KMeans of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                 key2=csvFilename + ".hex",
                                 timeoutSecs=10)
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    cols = ",".join(map(str, range(11)))
    kwargs = {
        'k': 1,
        'epsilon': 1e-6,
        'cols': cols,
        'destination_key': 'KMeansModel.hex',
        # reuse the same seed, to get deterministic results (otherwise sometimes fails
        'seed': 265211114317615310,
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

    (centers,
     tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname,
                                                   parseKey, 'd', **kwargs)

    expected = [([
        -0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741,
        0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153,
        0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314,
        0.0021324000161308796, 0.00154
    ], num_rows, None)]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    h2o_kmeans.compareResultsToExpected(self,
                                        tupleResultList,
                                        expected,
                                        allowedDelta,
                                        trial=0)

    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
    KMeansModel = inspect['KMeansModel']
    clusters = KMeansModel['clusters'][0]
    print "clusters:", h2o.dump_json(clusters)

    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
Ejemplo n.º 5
0
def kmeans_doit(self, csvFilename, bucket, csvPathname, numRows, timeoutSecs=30):
    print "\nStarting KMeans of", csvFilename
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=csvFilename + ".hex", timeoutSecs=20)
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    kwargs = {
        'k': 1, 
        'initialization': 'Furthest',
        'destination_key': 'KMeansModel.hex',
        'max_iter': 25,
        # reuse the same seed, to get deterministic results (otherwise sometimes fails
        'seed': 265211114317615310,
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

    (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)

    expected = [
        ([-0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741, 0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153, 0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314, 0.0021324000161308796, 0.00154], numRows, None)
    ]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)

    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    # inspect doesn't work
    # inspect = h2o_cmd.runInspect(None, key=kmeans['model']['_key'])
    # KMeansModel = inspect['KMeansModel']
    modelView = h2o.nodes[0].kmeans_view(model='KMeansModel.hex')
    h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))
    model = modelView['model']
    clusters = model['centers']
    within_cluster_variances = model['within_cluster_variances']
    total_within_SS = model['total_within_SS']
    print "within_cluster_variances:", within_cluster_variances
    print "total_within_SS:", total_within_SS
    
    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
Ejemplo n.º 6
0
def kmeans_doit(self, csvFilename, csvPathname, num_rows, timeoutSecs=30):
    print "\nStarting KMeans of", csvFilename
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=csvFilename + ".hex", timeoutSecs=10)
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    cols = ",".join(map(str,range(11)))
    kwargs = {
        'k': 1, 
        'epsilon': 1e-6,
        'cols': cols, 
        'destination_key': 'KMeansModel.hex',
        # reuse the same seed, to get deterministic results (otherwise sometimes fails
        'seed': 265211114317615310,
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

    (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseKey, 'd', **kwargs)

    expected = [
        ([-0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741, 0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153, 0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314, 0.0021324000161308796, 0.00154], num_rows, None)
    ]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=0)



    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    inspect = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
    KMeansModel = inspect['KMeansModel']
    clusters = KMeansModel['clusters'][0]
    print "clusters:", h2o.dump_json(clusters)
    
    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)
Ejemplo n.º 7
0
def kmeans_doit(self,
                csvFilename,
                bucket,
                csvPathname,
                numRows,
                timeoutSecs=30):
    print "\nStarting KMeans of", csvFilename
    parseResult = h2i.import_parse(bucket=bucket,
                                   path=csvPathname,
                                   schema='put',
                                   hex_key=csvFilename + ".hex",
                                   timeoutSecs=20)
    # hastie has two values, 1 and -1.
    # we could not specify cols, but this is more fun
    kwargs = {
        'k': 1,
        'initialization': 'Furthest',
        'destination_key': 'KMeansModel.hex',
        'max_iter': 25,
        # reuse the same seed, to get deterministic results (otherwise sometimes fails
        'seed': 265211114317615310,
    }
    start = time.time()
    kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
        timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
    elapsed = time.time() - start
    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

    (centers,
     tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname,
                                                   parseResult, 'd', **kwargs)

    expected = [([
        -0.0006628900000000158, -0.0004671200060434639, 0.0009330300069879741,
        0.0007883800000000272, 0.0007548200000000111, 0.0005617899864856153,
        0.0013246499999999897, 0.0004036299999999859, -0.0014307100000000314,
        0.0021324000161308796, 0.00154
    ], numRows, None)]
    # all are multipliers of expected tuple value
    allowedDelta = (0.01, 0.01, 0.01)
    h2o_kmeans.compareResultsToExpected(self,
                                        tupleResultList,
                                        expected,
                                        allowedDelta,
                                        trial=0)

    # compare this kmeans to the first one. since the files are replications, the results
    # should be similar?
    # inspect doesn't work
    # inspect = h2o_cmd.runInspect(None, key=kmeans['model']['_key'])
    # KMeansModel = inspect['KMeansModel']
    modelView = h2o.nodes[0].kmeans_view(model='KMeansModel.hex')
    h2o.verboseprint("KMeans2ModelView:", h2o.dump_json(modelView))
    model = modelView['model']
    clusters = model['centers']
    within_cluster_variances = model['within_cluster_variances']
    total_within_SS = model['total_within_SS']
    print "within_cluster_variances:", within_cluster_variances
    print "total_within_SS:", total_within_SS

    if self.clusters1:
        h2o_kmeans.compareToFirstKMeans(self, clusters, self.clusters1)
    else:
        self.clusters1 = copy.deepcopy(clusters)