Exemple #1
0
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, key2, rowCount, colCount, FILEREPL, SEEDPERFILE, timeoutSecs):
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    print "Creating random", csvPathname
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

    csvFilenamegz = csvFilename + ".gz"
    csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
    h2o_util.file_gzip(csvPathname, csvPathnamegz)

    csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
    csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz
    print "Replicating", csvFilenamegz, "into", csvFilenameReplgz

    start = time.time()
    h2o_util.file_cat(csvPathnamegz, csvPathnamegz , csvPathnameReplgz)
    # no header? should we add a header? would have to be a separate gz?
    totalRows = 2 * rowCount
    for i in range(FILEREPL-2):
        h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
        totalRows += rowCount
    print "Replication took:", time.time() - start, "seconds"

    start = time.time()
    print "Parse start:", csvPathnameReplgz
    doSummary = False
    parseKey = h2o_cmd.parseFile(None, csvPathnameReplgz, key2=key2, timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=doSummary)
    print csvFilenameReplgz, 'parse time:', parseKey['response']['time']
    if doSummary:
        algo = "Parse and Summary:"
    else:
        algo = "Parse:"
    print algo , parseKey['destination_key'], "took", time.time() - start, "seconds"

    print "Inspecting.."
    start = time.time()
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs)
    print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathname)
    print "\n" + csvPathname, \
        "    num_rows:", "{:,}".format(inspect['num_rows']), \
        "    num_cols:", "{:,}".format(inspect['num_cols'])

    # there is an extra response variable
    if inspect['num_cols'] != (colCount + 1):
        raise Exception("parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount))
    if inspect['num_rows'] != totalRows:
        raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
        (inspect['num_rows'], rowCount))

    # hack it in! for test purposees only
    parseKey['python_source_key'] = csvFilenameReplgz
    parseKey['num_rows'] = inspect['num_rows']
    parseKey['num_cols'] = inspect['num_cols']
    parseKey['value_size_bytes'] = inspect['value_size_bytes']
    return parseKey
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, hex_key, rowCount, colCount, FILEREPL, SEEDPERFILE, timeoutSecs):
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    print "Creating random", csvPathname
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

    csvFilenamegz = csvFilename + ".gz"
    csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
    h2o_util.file_gzip(csvPathname, csvPathnamegz)

    csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
    csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz
    print "Replicating", csvFilenamegz, "into", csvFilenameReplgz

    start = time.time()
    h2o_util.file_cat(csvPathnamegz, csvPathnamegz , csvPathnameReplgz)
    # no header? should we add a header? would have to be a separate gz?
    totalRows = 2 * rowCount
    for i in range(FILEREPL-2):
        h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
        totalRows += rowCount
    print "Replication took:", time.time() - start, "seconds"

    start = time.time()
    print "Parse start:", csvPathnameReplgz

    # experiment to see if the gz is causing it to fail 
    if NO_GZ:
        csvPathnameReplgz = csvPathname
        totalRows = rowCount
    # hack experiment
    if NO_REPL:
        h2o_util.file_gzip(csvPathname, csvPathnameReplgz)
        totalRows = rowCount

    parseResult = h2i.import_parse(path=csvPathnameReplgz, schema='put', hex_key=hex_key, 
        timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=DO_SUMMARY, blocking=DO_BLOCKING)

    if DO_SUMMARY:
        algo = "Parse and Summary:"
    else:
        algo = "Parse:"
    print algo , parseResult['destination_key'], "took", time.time() - start, "seconds"

    print "Inspecting.."
    time.sleep(5)
    start = time.time()
    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
    print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathnameReplgz)
    print "\n" + csvPathnameReplgz, \
        "    numRows:", "{:,}".format(inspect['numRows']), \
        "    numCols:", "{:,}".format(inspect['numCols'])

    # there is an extra response variable
    if inspect['numCols'] != (colCount + 1):
        raise Exception("parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount))
    if inspect['numRows'] != totalRows:
        raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
        (inspect['numRows'], totalRows))

    # hack it in! for test purposees only
    parseResult['numRows'] = inspect['numRows']
    parseResult['numCols'] = inspect['numCols']
    parseResult['byteSize'] = inspect['byteSize']
    return parseResult
Exemple #3
0
    def test_parse_syn_gz_cat(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # summary fails with 100000 cols
            (10, 5000, 'cE', 600),
            (10, 10000, 'cF', 600),
            (10, 50000, 'cF', 600),
        ]

        FILEREPL = 200
        DOSUMMARY = True
        # h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            csvFilenamegz = csvFilename + ".gz"
            csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
            h2o_util.file_gzip(csvPathname, csvPathnamegz)

            csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
            csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz

            start = time.time()
            print "Replicating", csvFilenamegz, "into", csvFilenameReplgz
            h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz)
            # no header? should we add a header? would have to be a separate gz?
            totalRows = 2 * rowCount
            for i in range(FILEREPL - 2):
                h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
                totalRows += rowCount
            print "Replication took:", time.time() - start, "seconds"

            start = time.time()
            print "Parse start:", csvPathnameReplgz
            parseKey = h2o_cmd.parseFile(None,
                                         csvPathnameReplgz,
                                         key2=key2,
                                         timeoutSecs=timeoutSecs,
                                         doSummary=DOSUMMARY)
            print csvFilenameReplgz, 'parse time:', parseKey['response'][
                'time']
            if DOSUMMARY:
                algo = "Parse and Summary:"
            else:
                algo = "Parse:"
            print algo, parseKey['destination_key'], "took", time.time(
            ) - start, "seconds"

            print "Inspecting.."
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseKey['destination_key'],
                                         timeoutSecs=timeoutSecs)
            print "Inspect:", parseKey['destination_key'], "took", time.time(
            ) - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            # should match # of cols in header or ??
            self.assertEqual(
                inspect['num_cols'], colCount,
                "parse created result with the wrong number of cols %s %s" %
                (inspect['num_cols'], colCount))
            self.assertEqual(inspect['num_rows'], totalRows,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['num_rows'], rowCount))
    def test_parse_syn_gz_cat(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # summary fails with 100000 cols
            # overwrite the key each time to save space?
            (100, 40000, 'cF', 600),
            (100, 20000, 'cF', 600),
            (100, 10000, 'cF', 600),
            (100, 5000, 'cF', 600),
            ]

        FILEREPL = 200
        DOSUMMARY = True
        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            csvFilenamegz = csvFilename + ".gz"
            csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
            h2o_util.file_gzip(csvPathname, csvPathnamegz)

            csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
            csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz

            start = time.time()
            print "Replicating", csvFilenamegz, "into", csvFilenameReplgz
            h2o_util.file_cat(csvPathnamegz, csvPathnamegz , csvPathnameReplgz)
            # no header? should we add a header? would have to be a separate gz?
            totalRows = 2 * rowCount
            for i in range(FILEREPL-2):
                h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
                totalRows += rowCount
            print "Replication took:", time.time() - start, "seconds"

            start = time.time()
            print "Parse start:", csvPathnameReplgz
            parseResult = h2i.import_parse(path=csvPathnameReplgz, schema='put', hex_key=hex_key, 
                timeoutSecs=timeoutSecs, doSummary=DOSUMMARY)
            print csvFilenameReplgz, 'parse time:', parseResult['response']['time']
            if DOSUMMARY:
                algo = "Parse and Summary:"
            else:
                algo = "Parse:"
            print algo , parseResult['destination_key'], "took", time.time() - start, "seconds"

            print "Inspecting.."
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']
            value_size_bytes = inspect['value_size_bytes']
            h2o_cmd.infoFromInspect(inspect, csvPathnameReplgz)
            print "\n" + csvPathnameReplgz, \
                "\n    num_rows:", "{:,}".format(num_rows), \
                "\n    num_cols:", "{:,}".format(num_cols), \
                "\n    value_size_bytes:", "{:,}".format(value_size_bytes)

            # should match # of cols in header or ??
            self.assertEqual(inspect['num_cols'], colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount))
            self.assertEqual(inspect['num_rows'], totalRows,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['num_rows'], rowCount))
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, hex_key, rowCount,
                             colCount, FILEREPL, SEEDPERFILE, timeoutSecs):
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    print "Creating random", csvPathname
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

    csvFilenamegz = csvFilename + ".gz"
    csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
    h2o_util.file_gzip(csvPathname, csvPathnamegz)

    csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
    csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz
    print "Replicating", csvFilenamegz, "into", csvFilenameReplgz

    start = time.time()
    h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz)
    # no header? should we add a header? would have to be a separate gz?
    totalRows = 2 * rowCount
    for i in range(FILEREPL - 2):
        h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
        totalRows += rowCount
    print "Replication took:", time.time() - start, "seconds"

    start = time.time()
    print "Parse start:", csvPathnameReplgz

    # experiment to see if the gz is causing it to fail
    if NO_GZ:
        csvPathnameReplgz = csvPathname
        totalRows = rowCount
    # hack experiment
    if NO_REPL:
        h2o_util.file_gzip(csvPathname, csvPathnameReplgz)
        totalRows = rowCount

    parseResult = h2i.import_parse(path=csvPathnameReplgz,
                                   schema='put',
                                   hex_key=hex_key,
                                   timeoutSecs=timeoutSecs,
                                   pollTimeoutSecs=120,
                                   doSummary=DO_SUMMARY,
                                   blocking=DO_BLOCKING)

    if DO_SUMMARY:
        algo = "Parse and Summary:"
    else:
        algo = "Parse:"
    print algo, parseResult['destination_key'], "took", time.time(
    ) - start, "seconds"

    print "Inspecting.."
    time.sleep(5)
    start = time.time()
    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'],
                                 timeoutSecs=timeoutSecs)
    print "Inspect:", parseResult['destination_key'], "took", time.time(
    ) - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathnameReplgz)
    print "\n" + csvPathnameReplgz, \
        "    numRows:", "{:,}".format(inspect['numRows']), \
        "    numCols:", "{:,}".format(inspect['numCols'])

    # there is an extra response variable
    if inspect['numCols'] != (colCount + 1):
        raise Exception(
            "parse created result with the wrong number of cols %s %s" %
            (inspect['numCols'], colCount))
    if inspect['numRows'] != totalRows:
        raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
        (inspect['numRows'], totalRows))

    # hack it in! for test purposees only
    parseResult['numRows'] = inspect['numRows']
    parseResult['numCols'] = inspect['numCols']
    parseResult['byteSize'] = inspect['byteSize']
    return parseResult
Exemple #6
0
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, key2, rowCount,
                             colCount, FILEREPL, SEEDPERFILE, timeoutSecs):
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    print "Creating random", csvPathname
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

    csvFilenamegz = csvFilename + ".gz"
    csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
    h2o_util.file_gzip(csvPathname, csvPathnamegz)

    csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
    csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz
    print "Replicating", csvFilenamegz, "into", csvFilenameReplgz

    start = time.time()
    h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz)
    # no header? should we add a header? would have to be a separate gz?
    totalRows = 2 * rowCount
    for i in range(FILEREPL - 2):
        h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
        totalRows += rowCount
    print "Replication took:", time.time() - start, "seconds"

    start = time.time()
    print "Parse start:", csvPathnameReplgz
    doSummary = False
    parseKey = h2o_cmd.parseFile(None,
                                 csvPathnameReplgz,
                                 key2=key2,
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=120,
                                 doSummary=doSummary)
    print csvFilenameReplgz, 'parse time:', parseKey['response']['time']
    if doSummary:
        algo = "Parse and Summary:"
    else:
        algo = "Parse:"
    print algo, parseKey['destination_key'], "took", time.time(
    ) - start, "seconds"

    print "Inspecting.."
    start = time.time()
    inspect = h2o_cmd.runInspect(None,
                                 parseKey['destination_key'],
                                 timeoutSecs=timeoutSecs)
    print "Inspect:", parseKey['destination_key'], "took", time.time(
    ) - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathname)
    print "\n" + csvPathname, \
        "    num_rows:", "{:,}".format(inspect['num_rows']), \
        "    num_cols:", "{:,}".format(inspect['num_cols'])

    # there is an extra response variable
    if inspect['num_cols'] != (colCount + 1):
        raise Exception(
            "parse created result with the wrong number of cols %s %s" %
            (inspect['num_cols'], colCount))
    if inspect['num_rows'] != totalRows:
        raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
        (inspect['num_rows'], rowCount))

    # hack it in! for test purposees only
    parseKey['python_source_key'] = csvFilenameReplgz
    parseKey['num_rows'] = inspect['num_rows']
    parseKey['num_cols'] = inspect['num_cols']
    parseKey['value_size_bytes'] = inspect['value_size_bytes']
    return parseKey