Example #1
    def test_GLM_hastie(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)
        bucket = 'home-0xdiag-datasets'
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = 'standard' + '/' + csvFilename
        glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=75)
        fullPathname = h2i.find_folder_and_filename(bucket,
                                                    csvPathname,
                                                    returnFullPath=True)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(fullPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
        glm_doit(self, filename2x, None, pathname2x, timeoutSecs=75)

        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2x, pathname2x, pathname4x)

        print "Iterating 3 times on this last one for perf compare"
        for i in range(3):
            print "\nTrial #", i, "of", filename4x
            glm_doit(self, filename4x, None, pathname4x, timeoutSecs=150)
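The h2o_util file helpers these examples lean on are defined elsewhere in the test harness. A minimal sketch of what file_gunzip and file_cat are assumed to do, with signatures inferred from the calls above (bodies are illustrative, not the library's actual implementation):

import gzip
import shutil

def file_gunzip(gzPathname, outPathname):
    # decompress gzPathname into outPathname
    inf = gzip.open(gzPathname, 'rb')
    outf = open(outPathname, 'wb')
    shutil.copyfileobj(inf, outf)
    inf.close()
    outf.close()

def file_cat(srcPathnameA, srcPathnameB, dstPathname):
    # concatenate the two source files into dstPathname
    dst = open(dstPathname, 'wb')
    for src in (srcPathnameA, srcPathnameB):
        f = open(src, 'rb')
        shutil.copyfileobj(f, dst)
        f.close()
    dst.close()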
    def test_1mx10_hastie_10_2_cat_and_shuffle(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)

        # This test also adds file shuffling, to see that row order doesn't matter
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
        glm_doit(self, csvFilename, csvPathname, timeoutSecs=30)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(csvPathname, pathname1x)

        filename1xShuf = "hastie_1x.data_shuf"
        pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
        h2o_util.file_shuffle(pathname1x, pathname1xShuf)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

        filename2xShuf = "hastie_2x.data_shuf"
        pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
        h2o_util.file_shuffle(pathname2x, pathname2xShuf)
        glm_doit(self, filename2xShuf, pathname2xShuf, timeoutSecs=45)

        # too big to shuffle?
        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
        glm_doit(self, filename4x, pathname4x, timeoutSecs=120)
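h2o_util.file_shuffle is assumed to permute a file's rows; a minimal in-memory sketch (workable only while the file fits in RAM, which the "too big to shuffle?" comment above hints at):

import random

def file_shuffle(srcPathname, dstPathname):
    # read all rows, permute their order, write them back out
    f = open(srcPathname, 'r')
    lines = f.readlines()
    f.close()
    random.shuffle(lines)
    out = open(dstPathname, 'w')
    out.writelines(lines)
    out.close()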
    def test_1mx10_hastie_10_2_cat_and_shuffle(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)

        # This test also adds file shuffling, to see that row order doesn't matter
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
        kmeans_doit(self, csvFilename, csvPathname, num_rows=1000000, timeoutSecs=60)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(csvPathname, pathname1x)
        
        filename1xShuf = "hastie_1x.data_shuf"
        pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
        h2o_util.file_shuffle(pathname1x, pathname1xShuf)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

        filename2xShuf = "hastie_2x.data_shuf"
        pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
        h2o_util.file_shuffle(pathname2x, pathname2xShuf)
        kmeans_doit(self, filename2xShuf, pathname2xShuf, num_rows=2000000, timeoutSecs=90)

        # too big to shuffle?
        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
        kmeans_doit(self, filename4x, pathname4x, num_rows=4000000, timeoutSecs=120)
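Why replication is a meaningful KMeans check: duplicating every row doubles each cluster's membership but leaves its centroid unchanged, so the 1x, 2x, and 4x runs should report (near-)identical centers. A toy illustration of the invariant:

def centroid(points):
    # mean of a list of (x, y) points
    n = float(len(points))
    return (sum(p[0] for p in points) / n, sum(p[1] for p in points) / n)

pts = [(0.0, 0.0), (2.0, 0.0), (1.0, 3.0)]
# 2x replication leaves the centroid exactly where it was
assert centroid(pts) == centroid(pts + pts)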
    def test_1mx10_hastie_10_2_cat_and_shuffle(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)

        # This test also adds file shuffling, to see that row order doesn't matter
        csvFilename = "1mx10_hastie_10_2.data.gz"
        bucket = 'datasets'
        csvPathname = 'logreg' + '/' + csvFilename
        fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

        glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(fullPathname, pathname1x)
        
        filename1xShuf = "hastie_1x.data_shuf"
        pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
        h2o_util.file_shuffle(pathname1x, pathname1xShuf)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

        filename2xShuf = "hastie_2x.data_shuf"
        pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
        h2o_util.file_shuffle(pathname2x, pathname2xShuf)
        glm_doit(self, filename2xShuf, None, pathname2xShuf, timeoutSecs=45)

        # too big to shuffle?
        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
        glm_doit(self, filename4x, None, pathname4x, timeoutSecs=120)
Example #5
    def test_A_1mx10_hastie_10_2(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
        glm_doit(self, csvFilename, csvPathname, timeoutSecs=30)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(csvPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
        glm_doit(self, filename2x, pathname2x, timeoutSecs=45)

        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2x, pathname2x, pathname4x)

        print "Iterating 3 times on this last one for perf compare"
        for i in range(3):
            print "\nTrial #", i, "of", filename4x
            glm_doit(self, filename4x, pathname4x, timeoutSecs=60)
    def test_GLM_hastie_shuffle(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)

        # This test also adds file shuffling, to see that row order doesn't matter
        csvFilename = "1mx10_hastie_10_2.data.gz"
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard' + '/' + csvFilename
        fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

        glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(fullPathname, pathname1x)

        filename1xShuf = "hastie_1x.data_shuf"
        pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
        h2o_util.file_shuffle(pathname1x, pathname1xShuf)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

        filename2xShuf = "hastie_2x.data_shuf"
        pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
        h2o_util.file_shuffle(pathname2x, pathname2xShuf)
        glm_doit(self, filename2xShuf, None, pathname2xShuf, timeoutSecs=45)

        # too big to shuffle?
        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
        glm_doit(self, filename4x, None, pathname4x, timeoutSecs=120)
Example #7
    def test_GLM_hastie(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)
        bucket = 'home-0xdiag-datasets'
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = 'standard' + '/' + csvFilename
        glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=75)
        fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(fullPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
        glm_doit(self, filename2x, None, pathname2x, timeoutSecs=75)

        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
        
        print "Iterating 3 times on this last one for perf compare"
        for i in range(3):
            print "\nTrial #", i, "of", filename4x
            glm_doit(self, filename4x, None, pathname4x, timeoutSecs=150)
Example #8
    def test_A_1mx10_hastie_10_2(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
        glm_doit(self, csvFilename, csvPathname, timeoutSecs=75)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(csvPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
        glm_doit(self, filename2x, pathname2x, timeoutSecs=75)

        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2x, pathname2x, pathname4x)

        print "Iterating 3 times on this last one for perf compare"
        for i in range(3):
            print "\nTrial #", i, "of", filename4x
            glm_doit(self, filename4x, pathname4x, timeoutSecs=150)
Example #9
    def test_A_1mx10_hastie_10_2(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)

        y = "10"
        x = ""
        kwargs = {'x': x, 'y':  y, 'case': -1, 'thresholds': 0.5}
        (modelKey, validations1) = glm_doit(self, csvFilename, csvPathname, 
            timeoutSecs=60, pollTimeoutSecs=60, **kwargs)

        print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x"

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(csvPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
        glm_score(self, filename2x, pathname2x, modelKey, thresholds="0.5",
            timeoutSecs=60, pollTimeoutSecs=60)

        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2x, pathname2x, pathname4x)

        print "Iterating 3 times on this last one"
        for i in range(3):
            print "\nTrial #", i, "of", filename4x
            glm_score(self, filename4x, pathname4x, modelKey, thresholds="0.5",
                timeoutSecs=60, pollTimeoutSecs=60)
Example #10
    def test_GLM2_score_same(self):
        h2o.beta_features = True
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        bucket = 'home-0xdiag-datasets'
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = 'standard' + '/' + csvFilename

        y = "10"
        kwargs = {'response':  y, 'alpha': 0, 'family': 'gaussian'}
        (modelKey, validation1, parseResult) = glm_doit(self, csvFilename, bucket, csvPathname, 
            timeoutSecs=60, pollTimeoutSecs=60, **kwargs) 
        print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x"

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x

        fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
        h2o_util.file_gunzip(fullPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        modelPathname = SYNDATASETS_DIR + '/model_' + filename2x
        bucket = None
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
        glm_score(self, filename2x, bucket, pathname2x, modelKey, modelPathname, timeoutSecs=60, pollTimeoutSecs=60)

        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        modelPathname = SYNDATASETS_DIR + '/model_' + filename4x
        h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
        
        print "Iterating 3 times on this last one"
        for i in range(3):
            print "\nTrial #", i, "of", filename4x
            glm_score(self, filename4x, bucket, pathname4x, modelKey, modelPathname, timeoutSecs=60, pollTimeoutSecs=60)
    def test_KMeans_hastie_shuffle_fvec(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)

        # This test also adds file shuffling, to see that row order doesn't matter
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = 'standard/' + csvFilename
        bucket = 'home-0xdiag-datasets'
        kmeans_doit(self, csvFilename, bucket, csvPathname, numRows=1000000, timeoutSecs=60)
        fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(fullPathname, pathname1x)
        
        filename1xShuf = "hastie_1x.data_shuf"
        pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
        h2o_util.file_shuffle(pathname1x, pathname1xShuf)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

        filename2xShuf = "hastie_2x.data_shuf"
        pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
        h2o_util.file_shuffle(pathname2x, pathname2xShuf)
        kmeans_doit(self, filename2xShuf, None, pathname2xShuf, numRows=2000000, timeoutSecs=90)

        # too big to shuffle?
        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
        kmeans_doit(self, filename4x, None, pathname4x, numRows=4000000, timeoutSecs=120)
Example #12
    def test_GLM2_score_same(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        bucket = 'home-0xdiag-datasets'
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = 'standard' + '/' + csvFilename

        y = "10"
        kwargs = {'response':  y, 'alpha': 0, 'family': 'gaussian'}
        (modelKey, validation1, parseResult) = glm_doit(self, csvFilename, bucket, csvPathname, 
            timeoutSecs=60, pollTimeoutSecs=60, **kwargs) 
        print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x"

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x

        fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
        h2o_util.file_gunzip(fullPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        modelPathname = SYNDATASETS_DIR + '/model_' + filename2x
        bucket = None
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
        glm_score(self, filename2x, bucket, pathname2x, modelKey, modelPathname, timeoutSecs=60, pollTimeoutSecs=60)

        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        modelPathname = SYNDATASETS_DIR + '/model_' + filename4x
        h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
        
        print "Iterating 3 times on this last one"
        for i in range(3):
            print "\nTrial #", i, "of", filename4x
            glm_score(self, filename4x, bucket, pathname4x, modelKey, modelPathname, timeoutSecs=60, pollTimeoutSecs=60)
def write_syn_dataset(csvPathname,
                      rowCount,
                      inCount=1,
                      outCount=1,
                      SEED='12345678',
                      colSepChar=",",
                      rowSepChar="\n",
                      quoteChars="",
                      colEnumList=None):
    r1 = random.Random(SEED)

    if CAT_ITERATE == 0:
        dsf = open(csvPathname, "w+")
    else:
        tmpFd, tmpPathname = h2o.tmp_file("cat", ".csv", tmp_dir="/tmp")
        dsf = open(tmpPathname, "w+")

    global WRITE_REPEAT
    if not (WRITE_REPEAT >= 1 and WRITE_REPEAT <= 100):
        print "Forcing WRITE_REPEAT to 1"
        WRITE_REPEAT = 1
    for row in range(rowCount):
        if ((WRITE_REPEAT * row) % 100000) == 0:
            print "Wrote", WRITE_REPEAT * row, "lines"

        # doesn't guarantee that 10000 rows have 10000 unique enums in a column
        # essentially sampling with replacement
        rowData = []
        for iCol in range(inCount):
            # FIX! we should add some random NA?
            ri = r1.choice(colEnumList[iCol])  # use the seeded generator so SEED takes effect
            rowData.append(ri)

        # output columns. always 0-10e6 with 2 digits of fp precision
        for oCol in range(outCount):
            ri = "%.2f" % random.uniform(0, 10e6)
            rowData.append(ri)

        # use the new Hive separator
        rowDataCsv = colSepChar.join(map(str, rowData)) + rowSepChar
        ### sys.stdout.write(rowDataCsv)
        # faster for creating big files? doesn't need to be fully random
        for i in range(WRITE_REPEAT):
            dsf.write(rowDataCsv)
    dsf.close()

    if CAT_ITERATE > 0:
        for c in range(CAT_ITERATE + 1):
            if c == CAT_ITERATE:
                print "Doubling", tmpPathname, "into", csvPathname
                h2o_util.file_cat(tmpPathname, tmpPathname, csvPathname)
            else:
                tmp2Fd, tmp2Pathname = h2o.tmp_file()
                print "Doubling", tmpPathname, "into", tmp2Pathname
                h2o_util.file_cat(tmpPathname, tmpPathname, tmp2Pathname)
                tmpPathname = tmp2Pathname

    return colEnumList
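A hedged usage sketch for write_syn_dataset: colEnumList must supply one list of enum levels per input column, and CAT_ITERATE / WRITE_REPEAT are module globals the function expects. make_col_enum_list is a hypothetical helper, not part of the original test file:

import random

CAT_ITERATE = 0     # module globals write_syn_dataset reads (illustrative values)
WRITE_REPEAT = 1

def make_col_enum_list(inCount, levels=4):
    # hypothetical: one list of enum strings per input column
    return [["c%s_l%s" % (c, l) for l in range(levels)] for c in range(inCount)]

colEnumList = make_col_enum_list(3)
write_syn_dataset('/tmp/syn_enum.csv', rowCount=10000, inCount=3, outCount=1,
    colEnumList=colEnumList)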
Example #14
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, key2, rowCount, colCount, FILEREPL, SEEDPERFILE, timeoutSecs):
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    print "Creating random", csvPathname
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

    csvFilenamegz = csvFilename + ".gz"
    csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
    h2o_util.file_gzip(csvPathname, csvPathnamegz)

    csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
    csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz
    print "Replicating", csvFilenamegz, "into", csvFilenameReplgz

    start = time.time()
    h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz)
    # no header? should we add a header? would have to be a separate gz?
    totalRows = 2 * rowCount
    for i in range(FILEREPL - 2):
        h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
        totalRows += rowCount
    print "Replication took:", time.time() - start, "seconds"

    start = time.time()
    print "Parse start:", csvPathnameReplgz
    doSummary = False
    parseKey = h2o_cmd.parseFile(None, csvPathnameReplgz, key2=key2, timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=doSummary)
    print csvFilenameReplgz, 'parse time:', parseKey['response']['time']
    if doSummary:
        algo = "Parse and Summary:"
    else:
        algo = "Parse:"
    print algo, parseKey['destination_key'], "took", time.time() - start, "seconds"

    print "Inspecting.."
    start = time.time()
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs)
    print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathname)
    print "\n" + csvPathname, \
        "    num_rows:", "{:,}".format(inspect['num_rows']), \
        "    num_cols:", "{:,}".format(inspect['num_cols'])

    # there is an extra response variable
    if inspect['num_cols'] != (colCount + 1):
        raise Exception("parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount))
    if inspect['num_rows'] != totalRows:
        raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
        (inspect['num_rows'], totalRows))

    # hack it in! for test purposes only
    parseKey['python_source_key'] = csvFilenameReplgz
    parseKey['num_rows'] = inspect['num_rows']
    parseKey['num_cols'] = inspect['num_cols']
    parseKey['value_size_bytes'] = inspect['value_size_bytes']
    return parseKey
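Usage is a single call per dataset shape; an illustrative invocation with the same kinds of arguments the surrounding tests pass (the key and sizes are made up, and a running H2O cloud plus an existing SYNDATASETS_DIR are assumed):

import random
import sys

SEEDPERFILE = random.randint(0, sys.maxint)
parseKey = make_datasetgz_and_parse(SYNDATASETS_DIR, 'syn_10000x10.csv', 'syn_repl.hex',
    rowCount=10000, colCount=10, FILEREPL=200, SEEDPERFILE=SEEDPERFILE, timeoutSecs=600)
print parseKey['python_source_key'], parseKey['num_rows'], parseKey['num_cols']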
Example #15
    def test_exec2_sum(self):
        h2o.beta_features = True
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets',
                                                  'standard/covtype.data',
                                                  returnFullPath=True)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

        csvAll = [
            (pathname1x, "cA", 5, 1),
            (pathname2x, "cB", 5, 2),
            (pathname2x, "cC", 5, 2),
        ]

        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=2000)
            print "Parse result['Key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvPathname
            h2o_exec.exec_zero_list(zeroList)
            colResultList = h2o_exec.exec_expr_list_across_cols(
                lenNodes,
                exprList,
                hex_key,
                maxCol=54,
                timeoutSecs=timeoutSecs)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0]
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x) / resultMult for x in colResultList]
                print "\n", good, "\n", compare
                self.assertEqual(
                    good, compare,
                    'compare is not equal to good (first try * resultMult)')
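The arithmetic behind resultMult: concatenating a file with itself doubles every column sum, so dividing the 2x results by 2 must recover the 1x results. A standalone sanity check of the invariant:

col = [1.5, 2.0, 3.25]
sum1x = sum(col)
sum2x = sum(col + col)       # 2x replication of the column
assert sum2x / 2 == sum1x    # exact here; arbitrary floats could differ in the last ulp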
def write_syn_dataset(csvPathname, rowCount, inCount=1, outCount=1, SEED='12345678', 
        colSepChar=",", rowSepChar="\n", quoteChars="", colEnumList=None):
    r1 = random.Random(SEED)

    if CAT_ITERATE == 0:
        dsf = open(csvPathname, "w+")
    else:
        tmpFd, tmpPathname = h2o.tmp_file("cat",".csv", tmp_dir="/tmp")
        dsf = open(tmpPathname, "w+")

    global WRITE_REPEAT
    if not (WRITE_REPEAT >= 1 and WRITE_REPEAT <= 100):
        print "Forcing WRITE_REPEAT to 1"
        WRITE_REPEAT = 1
    for row in range(rowCount):
        if ((WRITE_REPEAT * row) % 100000) == 0:
            print "Wrote", WRITE_REPEAT * row, "lines"

        # doesn't guarantee that 10000 rows have 10000 unique enums in a column
        # essentially sampling with replacement
        rowData = []
        for iCol in range(inCount):
            # FIX! we should add some random NA?
            ri = r1.choice(colEnumList[iCol])  # use the seeded generator so SEED takes effect
            rowData.append(ri)

        # output columns. always 0-10e6 with 2 digits of fp precision
        for oCol in range(outCount):
            ri = "%.2f" % random.uniform(0, 10e6)
            rowData.append(ri)

        # use the new Hive separator
        rowDataCsv = colSepChar.join(map(str, rowData)) + rowSepChar
        ### sys.stdout.write(rowDataCsv)
        # faster for creating big files? doesn't need to be fully random
        for i in range(WRITE_REPEAT):
            dsf.write(rowDataCsv)
    dsf.close()

    if CAT_ITERATE > 0:
        for c in range(CAT_ITERATE + 1):
            if c == CAT_ITERATE:
                print "Doubling", tmpPathname, "into", csvPathname
                h2o_util.file_cat(tmpPathname, tmpPathname, csvPathname)
            else:
                tmp2Fd, tmp2Pathname = h2o.tmp_file()
                print "Doubling", tmpPathname, "into", tmp2Pathname
                h2o_util.file_cat(tmpPathname, tmpPathname, tmp2Pathname)
                tmpPathname = tmp2Pathname

    return colEnumList
Example #17
    def test_exec2_sum(self):
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets',
                                                  'standard/covtype.data',
                                                  returnFullPath=True)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

        csvAll = [
            (pathname1x, "cA", 5, 1),
            (pathname2x, "cB", 5, 2),
            (pathname2x, "cC", 5, 2),
        ]

        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
            parseResultA = h2i.import_parse(path=csvPathname,
                                            schema='put',
                                            hex_key=hex_key)
            pA = h2o_cmd.ParseObj(parseResultA)
            print pA.numRows
            print pA.numCols
            print pA.parse_key
            iA = h2o_cmd.InspectObj(pA.parse_key)

            k = Key(hex_key)
            colResultList = []
            for i in range(pA.numCols):
                result = Expr(Fcn('sum', k[:, i], True)).result
                colResultList.append(result)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0]
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x) / resultMult for x in colResultList]
                print "\n", good, "\n", compare
                self.assertEqual(
                    good, compare,
                    'compare is not equal to good (first try * resultMult)')
Example #18
    def test_A_1mx10_hastie_10_2(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
        glm_doit(self, csvFilename, csvPathname, timeoutSecs=300)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(csvPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
        glm_doit(self, filename2x, pathname2x, timeoutSecs=300)
Example #19
    def test_A_1mx10_hastie_10_2(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
        glm_doit(self, csvFilename, csvPathname, timeoutSecs=300)

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(csvPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
        glm_doit(self, filename2x, pathname2x, timeoutSecs=300)
Example #20
    def test_sum(self):
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2o.find_dataset('UCI/UCI-large/covtype' + '/' +
                                      filename1x)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

        csvAll = [
            (pathname1x, "cA", 5, 1),
            (pathname2x, "cB", 5, 2),
            (pathname2x, "cC", 5, 2),
        ]

        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, key2, timeoutSecs, resultMult) in csvAll:
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                         key2=key2,
                                         timeoutSecs=2000)
            print "Parse result['Key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

            print "\n" + csvPathname
            h2o_exec.exec_zero_list(zeroList)
            colResultList = h2o_exec.exec_expr_list_across_cols(
                lenNodes, exprList, key2, maxCol=54, timeoutSecs=timeoutSecs)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0]
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x) / resultMult for x in colResultList]
                print "\n", good, "\n", compare
                self.assertEqual(
                    good, compare,
                    'compare is not equal to good (first try * resultMult)')
Example #21
    def test_A_1mx10_hastie_10_2(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)
        bucket = 'home-0xdiag-datasets'
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = 'standard' + '/' + csvFilename
        glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=300)

        fullPathname = h2i.find_folder_and_filename('home-0xdiag-datasets', csvPathname, returnFullPath=True)
        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(fullPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
        glm_doit(self, filename2x, None, pathname2x, timeoutSecs=300)
Example #22
    def test_A_1mx10_hastie_10_2(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
        # in other tests. (catdata?)
        bucket = 'datasets'
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = 'logreg' + '/' + csvFilename
        glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=300)

        fullPathname = h2i.find_folder_and_filename('datasets', csvPathname, returnFullPath=True)
        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x
        h2o_util.file_gunzip(fullPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
        glm_doit(self, filename2x, None, pathname2x, timeoutSecs=300)
    def test_exec2_sum(self):
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets', 'standard/covtype.data', returnFullPath=True)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

        csvAll = [
            (pathname1x, "cA", 5,  1),
            (pathname2x, "cB", 5,  2),
            (pathname2x, "cC", 5,  2),
        ]

        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
            parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
            pA = h2o_cmd.ParseObj(parseResultA)
            print pA.numRows
            print pA.numCols
            print pA.parse_key
            iA = h2o_cmd.InspectObj(pA.parse_key)

            k = Key(hex_key)
            colResultList = []
            for i in range(pA.numCols):
                result = Expr(Fcn('sum', k[:, i], True)).result
                colResultList.append(result)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0]
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x) / resultMult for x in colResultList]
                print "\n", good, "\n", compare
                self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
Example #24
    def test_exec2_sum(self):
        h2o.beta_features = True
        print "Replicating covtype.data by 2x for results comparison to 1x"
        filename1x = 'covtype.data'
        pathname1x = h2i.find_folder_and_filename('datasets', 'UCI/UCI-large/covtype/covtype.data', returnFullPath=True)
        filename2x = "covtype_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

        csvAll = [
            (pathname1x, "cA", 5,  1),
            (pathname2x, "cB", 5,  2),
            (pathname2x, "cC", 5,  2),
        ]

        h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000)
            print "Parse result['Key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])

            print "\n" + csvPathname
            h2o_exec.exec_zero_list(zeroList)
            colResultList = h2o_exec.exec_expr_list_across_cols(lenNodes, exprList, hex_key, maxCol=54, 
                timeoutSecs=timeoutSecs)
            print "\ncolResultList", colResultList

            if not firstDone:
                colResultList0 = list(colResultList)
                good = [float(x) for x in colResultList0]
                firstDone = True
            else:
                print "\n", colResultList0, "\n", colResultList
                # create the expected answer...i.e. N * first
                compare = [float(x) / resultMult for x in colResultList]
                print "\n", good, "\n", compare
                self.assertEqual(good, compare, 'compare is not equal to good (first try * resultMult)')
    def test_GLM_score_same(self):
        # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
        bucket = 'datasets'
        csvFilename = "1mx10_hastie_10_2.data.gz"
        csvPathname = 'logreg' + '/' + csvFilename

        y = "10"
        x = ""
        kwargs = {'x': x, 'y':  y, 'case': -1, 'thresholds': 0.5}
        (modelKey, validations1, parseResult) = glm_doit(self, csvFilename, bucket, csvPathname, 
            timeoutSecs=60, pollTimeoutSecs=60, **kwargs)

        print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x"

        filename1x = "hastie_1x.data"
        pathname1x = SYNDATASETS_DIR + '/' + filename1x

        fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
        h2o_util.file_gunzip(fullPathname, pathname1x)

        filename2x = "hastie_2x.data"
        pathname2x = SYNDATASETS_DIR + '/' + filename2x
        bucket = None
        h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
        glm_score(self, filename2x, bucket, pathname2x, modelKey, thresholds="0.5",
            timeoutSecs=60, pollTimeoutSecs=60)

        filename4x = "hastie_4x.data"
        pathname4x = SYNDATASETS_DIR + '/' + filename4x
        h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
        
        print "Iterating 3 times on this last one"
        for i in range(3):
            print "\nTrial #", i, "of", filename4x
            glm_score(self, filename4x, bucket, pathname4x, modelKey, thresholds="0.5",
                timeoutSecs=60, pollTimeoutSecs=60)
Example #26
    def test_parse_syn_gz_cat(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # summary fails with 100000 cols
            (10, 5000, 'cE', 600),
            (10, 10000, 'cF', 600),
            (10, 50000, 'cF', 600),
        ]

        FILEREPL = 200
        DOSUMMARY = True
        # h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            csvFilenamegz = csvFilename + ".gz"
            csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
            h2o_util.file_gzip(csvPathname, csvPathnamegz)

            csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
            csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz

            start = time.time()
            print "Replicating", csvFilenamegz, "into", csvFilenameReplgz
            h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz)
            # no header? should we add a header? would have to be a separate gz?
            totalRows = 2 * rowCount
            for i in range(FILEREPL - 2):
                h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
                totalRows += rowCount
            print "Replication took:", time.time() - start, "seconds"

            start = time.time()
            print "Parse start:", csvPathnameReplgz
            parseKey = h2o_cmd.parseFile(None,
                                         csvPathnameReplgz,
                                         key2=key2,
                                         timeoutSecs=timeoutSecs,
                                         doSummary=DOSUMMARY)
            print csvFilenameReplgz, 'parse time:', parseKey['response']['time']
            if DOSUMMARY:
                algo = "Parse and Summary:"
            else:
                algo = "Parse:"
            print algo, parseKey['destination_key'], "took", time.time() - start, "seconds"

            print "Inspecting.."
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseKey['destination_key'],
                                         timeoutSecs=timeoutSecs)
            print "Inspect:", parseKey['destination_key'], "took", time.time(
            ) - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            # should match # of cols in header or ??
            self.assertEqual(
                inspect['num_cols'], colCount,
                "parse created result with the wrong number of cols %s %s" %
                (inspect['num_cols'], colCount))
            self.assertEqual(inspect['num_rows'], totalRows,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['num_rows'], totalRows))
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, hex_key, rowCount,
                             colCount, FILEREPL, SEEDPERFILE, timeoutSecs):
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    print "Creating random", csvPathname
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

    csvFilenamegz = csvFilename + ".gz"
    csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
    h2o_util.file_gzip(csvPathname, csvPathnamegz)

    csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
    csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz
    print "Replicating", csvFilenamegz, "into", csvFilenameReplgz

    start = time.time()
    h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz)
    # no header? should we add a header? would have to be a separate gz?
    totalRows = 2 * rowCount
    for i in range(FILEREPL - 2):
        h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
        totalRows += rowCount
    print "Replication took:", time.time() - start, "seconds"

    start = time.time()
    print "Parse start:", csvPathnameReplgz

    # experiment to see if the gz is causing it to fail
    if NO_GZ:
        csvPathnameReplgz = csvPathname
        totalRows = rowCount
    # hack experiment
    if NO_REPL:
        h2o_util.file_gzip(csvPathname, csvPathnameReplgz)
        totalRows = rowCount

    parseResult = h2i.import_parse(path=csvPathnameReplgz,
                                   schema='put',
                                   hex_key=hex_key,
                                   timeoutSecs=timeoutSecs,
                                   pollTimeoutSecs=120,
                                   doSummary=DO_SUMMARY,
                                   blocking=DO_BLOCKING)

    if DO_SUMMARY:
        algo = "Parse and Summary:"
    else:
        algo = "Parse:"
    print algo, parseResult['destination_key'], "took", time.time() - start, "seconds"

    print "Inspecting.."
    time.sleep(5)
    start = time.time()
    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'],
                                 timeoutSecs=timeoutSecs)
    print "Inspect:", parseResult['destination_key'], "took", time.time(
    ) - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathnameReplgz)
    print "\n" + csvPathnameReplgz, \
        "    numRows:", "{:,}".format(inspect['numRows']), \
        "    numCols:", "{:,}".format(inspect['numCols'])

    # there is an extra response variable
    if inspect['numCols'] != (colCount + 1):
        raise Exception(
            "parse created result with the wrong number of cols %s %s" %
            (inspect['numCols'], colCount))
    if inspect['numRows'] != totalRows:
        raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
        (inspect['numRows'], totalRows))

    # hack it in! for test purposes only
    parseResult['numRows'] = inspect['numRows']
    parseResult['numCols'] = inspect['numCols']
    parseResult['byteSize'] = inspect['byteSize']
    return parseResult
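This fvec variant reads several module-level experiment flags that the excerpt doesn't show; a plausible configuration block (the flag names come from the code above, the values and comments are assumptions):

NO_GZ = False        # if True, parse the uncompressed 1x file to rule out gz trouble
NO_REPL = False      # if True, skip replication and gzip the 1x file directly
DO_SUMMARY = False   # if True, have the parse also run Summary
DO_BLOCKING = False  # if True, block on the parse instead of polling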
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, hex_key, rowCount, colCount, FILEREPL, SEEDPERFILE, timeoutSecs):
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    print "Creating random", csvPathname
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

    csvFilenamegz = csvFilename + ".gz"
    csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
    h2o_util.file_gzip(csvPathname, csvPathnamegz)

    csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
    csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz
    print "Replicating", csvFilenamegz, "into", csvFilenameReplgz

    start = time.time()
    h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz)
    # no header? should we add a header? would have to be a separate gz?
    totalRows = 2 * rowCount
    for i in range(FILEREPL - 2):
        h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
        totalRows += rowCount
    print "Replication took:", time.time() - start, "seconds"

    start = time.time()
    print "Parse start:", csvPathnameReplgz

    # experiment to see if the gz is causing it to fail 
    if NO_GZ:
        csvPathnameReplgz = csvPathname
        totalRows = rowCount
    # hack experiment
    if NO_REPL:
        h2o_util.file_gzip(csvPathname, csvPathnameReplgz)
        totalRows = rowCount

    parseResult = h2i.import_parse(path=csvPathnameReplgz, schema='put', hex_key=hex_key, 
        timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=DO_SUMMARY, blocking=DO_BLOCKING)

    if DO_SUMMARY:
        algo = "Parse and Summary:"
    else:
        algo = "Parse:"
    print algo, parseResult['destination_key'], "took", time.time() - start, "seconds"

    print "Inspecting.."
    time.sleep(5)
    start = time.time()
    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
    print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathnameReplgz)
    print "\n" + csvPathnameReplgz, \
        "    numRows:", "{:,}".format(inspect['numRows']), \
        "    numCols:", "{:,}".format(inspect['numCols'])

    # there is an extra response variable
    if inspect['numCols'] != (colCount + 1):
        raise Exception("parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount))
    if inspect['numRows'] != totalRows:
        raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
        (inspect['numRows'], totalRows))

    # hack it in! for test purposes only
    parseResult['numRows'] = inspect['numRows']
    parseResult['numCols'] = inspect['numCols']
    parseResult['byteSize'] = inspect['byteSize']
    return parseResult
    def test_parse_syn_gz_cat(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # summary fails with 100000 cols
            # overwrite the key each time to save space?
            (100, 40000, 'cF', 600),
            (100, 20000, 'cF', 600),
            (100, 10000, 'cF', 600),
            (100, 5000, 'cF', 600),
            ]

        FILEREPL = 200
        DOSUMMARY = True
        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            csvFilenamegz = csvFilename + ".gz"
            csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
            h2o_util.file_gzip(csvPathname, csvPathnamegz)

            csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
            csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz

            start = time.time()
            print "Replicating", csvFilenamegz, "into", csvFilenameReplgz
            h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz)
            # no header? should we add a header? would have to be a separate gz?
            totalRows = 2 * rowCount
            for i in range(FILEREPL - 2):
                h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
                totalRows += rowCount
            print "Replication took:", time.time() - start, "seconds"

            start = time.time()
            print "Parse start:", csvPathnameReplgz
            parseResult = h2i.import_parse(path=csvPathnameReplgz, schema='put', hex_key=hex_key, 
                timeoutSecs=timeoutSecs, doSummary=DOSUMMARY)
            print csvFilenameReplgz, 'parse time:', parseResult['response']['time']
            if DOSUMMARY:
                algo = "Parse and Summary:"
            else:
                algo = "Parse:"
            print algo, parseResult['destination_key'], "took", time.time() - start, "seconds"

            print "Inspecting.."
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']
            value_size_bytes = inspect['value_size_bytes']
            h2o_cmd.infoFromInspect(inspect, csvPathnameReplgz)
            print "\n" + csvPathnameReplgz, \
                "\n    num_rows:", "{:,}".format(num_rows), \
                "\n    num_cols:", "{:,}".format(num_cols), \
                "\n    value_size_bytes:", "{:,}".format(value_size_bytes)

            # should match # of cols in header or ??
            self.assertEqual(inspect['num_cols'], colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount))
            self.assertEqual(inspect['num_rows'], totalRows,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['num_rows'], totalRows))
Example #30
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, key2, rowCount,
                             colCount, FILEREPL, SEEDPERFILE, timeoutSecs):
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    print "Creating random", csvPathname
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

    csvFilenamegz = csvFilename + ".gz"
    csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
    h2o_util.file_gzip(csvPathname, csvPathnamegz)

    csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
    csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz
    print "Replicating", csvFilenamegz, "into", csvFilenameReplgz

    start = time.time()
    h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz)
    # no header? should we add a header? would have to be a separate gz?
    totalRows = 2 * rowCount
    for i in range(FILEREPL - 2):
        h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
        totalRows += rowCount
    print "Replication took:", time.time() - start, "seconds"

    start = time.time()
    print "Parse start:", csvPathnameReplgz
    doSummary = False
    parseKey = h2o_cmd.parseFile(None,
                                 csvPathnameReplgz,
                                 key2=key2,
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=120,
                                 doSummary=doSummary)
    print csvFilenameReplgz, 'parse time:', parseKey['response']['time']
    if doSummary:
        algo = "Parse and Summary:"
    else:
        algo = "Parse:"
    print algo, parseKey['destination_key'], "took", time.time() - start, "seconds"

    print "Inspecting.."
    start = time.time()
    inspect = h2o_cmd.runInspect(None,
                                 parseKey['destination_key'],
                                 timeoutSecs=timeoutSecs)
    print "Inspect:", parseKey['destination_key'], "took", time.time(
    ) - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathname)
    print "\n" + csvPathname, \
        "    num_rows:", "{:,}".format(inspect['num_rows']), \
        "    num_cols:", "{:,}".format(inspect['num_cols'])

    # there is an extra response variable
    if inspect['num_cols'] != (colCount + 1):
        raise Exception(
            "parse created result with the wrong number of cols %s %s" %
            (inspect['num_cols'], colCount))
    if inspect['num_rows'] != totalRows:
        raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
        (inspect['num_rows'], totalRows))

    # hack it in! for test purposes only
    parseKey['python_source_key'] = csvFilenameReplgz
    parseKey['num_rows'] = inspect['num_rows']
    parseKey['num_cols'] = inspect['num_cols']
    parseKey['value_size_bytes'] = inspect['value_size_bytes']
    return parseKey