Code example #1
    def test_nulls_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # We insert <NUL> (0x00) bytes into the file and then parse it.
        # Eventually this should move to a large file, and we could compare
        # the results to a non-munged file run through the same algo.
        # The <NUL>s are presumably thrown away by parse, so they don't change
        # chunk boundary behavior (i.e. not an interesting test for RF).
        csvFilename = 'poker1000'
        csvPathname = 'poker/' + csvFilename
        fullPathname = h2i.find_folder_and_filename('smalldata',
                                                    csvPathname,
                                                    returnFullPath=True)

        nulFilename = "syn_nul.data"
        nulPathname = SYNDATASETS_DIR + '/' + nulFilename

        piece_size = 4096  # 4 KiB

        with open(fullPathname, "rb") as in_file:
            with open(nulPathname, "wb") as out_file:
                while True:
                    piece = in_file.read(piece_size)
                    if not piece:
                        break  # end of file

                    # copy the piece, then append a single <NUL> at the end
                    withNuls = bytearray(piece)
                    # FIX! we'll eventually stick a <NUL> after every byte!
                    # (a sketch of that follows this example)
                    withNuls.extend(bytearray.fromhex('00'))
                    out_file.write(withNuls)

        for trials in xrange(1, 2):
            trees = 6
            for x in xrange(161, 240, 40):
                y = 10000 * x
                print "\nTrial:", trials, ", y:", y

                timeoutSecs = 20 + 5 * (len(h2o.nodes))
                model_key = csvFilename + "_" + str(trials)

                parseResult = h2i.import_parse(path=nulPathname, schema='put')
                h2o_cmd.runRF(parseResult=parseResult,
                              trees=trees,
                              destination_key=model_key,
                              timeoutSecs=timeoutSecs,
                              retryDelaySecs=1)
                sys.stdout.write('.')
                sys.stdout.flush()

                # partial clean, so we can look at tree builds from this run if hang
                h2o.clean_sandbox_stdout_stderr()
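
A minimal sketch (not part of the original test) of what the FIX! comment above aims for: interleaving a <NUL> after every byte of each piece, instead of appending a single <NUL> per piece. The helper name interleave_nuls is hypothetical.

    def interleave_nuls(piece):
        # iterating a bytearray yields ints in both Python 2 and 3
        withNuls = bytearray()
        for b in bytearray(piece):
            withNuls.append(b)
            withNuls.append(0x00)  # stick a <NUL> after every byte
        return withNuls

With that helper, the loop above would call out_file.write(interleave_nuls(piece)) instead of appending one <NUL> per piece.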
Code example #2
File: test_w_hosts_fail1.py Project: segahm/h2o
    def test_B_GenParity1(self):
        # Create a directory for the created dataset files. ok if already exists
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        print "\nGenerating some large row count parity datasets in", SYNDATASETS_DIR,
        print "\nmay be a minute.........."
        for x in xrange(161, 240, 20):
            # more rows!
            y = 10000 * x
            # Have to split the string out to list for pipe
            shCmdString = "perl " + h2o.find_file(
                "syn_scripts/parity.pl") + " 128 4 " + str(y) + " quad"
            # FIX! since we generate several datasets, we shouldn't have to wait
            # for each one to finish generating here before starting the runs below.
            # UPDATE: maybe EC2 takes a long time to spawn a process?
            h2o.spawn_cmd_and_wait('parity.pl',
                                   shCmdString.split(),
                                   timeout=90)
            # the algorithm for creating the path and filename is hardwired in
            # parity.pl, i.e. it uses the row count (y), so match that here
            csvFilename = "parity_128_4_" + str(y) + "_quad.data"
            sys.stdout.write('.')
            sys.stdout.flush()
        print "\nDatasets generated. Using."

        # always match the gen above!
        # Let's try it six times!
        for trials in xrange(1, 7):
            # prime
            trees = 6

            for x in xrange(161, 240, 20):
                y = 10000 * x
                print "\nTrial:", trials, ", y:", y

                csvFilename = "parity_128_4_" + str(y) + "_quad.data"
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                # FIX! TBD do we always have to kick off the run from node 0?
                # rough guess at run time; it varies with the number of hosts/nodes
                timeoutSecs = 20 + 5 * (len(h2o.nodes))

                # change the model key each iteration, so earlier models stay in h2o
                model_key = csvFilename + "_" + str(trials)
                h2o_cmd.runRF(trees=trees,
                              model_key=model_key,
                              timeoutSecs=timeoutSecs,
                              retryDelaySecs=1,
                              csvPathname=csvPathname)
                sys.stdout.write('.')
                sys.stdout.flush()

                # partial clean, so we can look at tree builds from this run if hang
                h2o.clean_sandbox_stdout_stderr()
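
The "Have to split the string out to list for pipe" comment above exists because the command is handed to the spawner as an argv list rather than a shell string. A hedged sketch of roughly what h2o.spawn_cmd_and_wait presumably does, built on the standard subprocess module (a hypothetical stand-in, not the actual h2o implementation):

    import subprocess

    def spawn_cmd_and_wait_sketch(name, cmd_list):
        # run the command as an argv list (no shell) and block until it exits;
        # Python 2's subprocess.call has no timeout parameter, so no timeout
        # handling is attempted in this sketch
        rc = subprocess.call(cmd_list)
        assert rc == 0, name + " exited with " + str(rc)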
Code example #3
File: test_w_hosts_fail1.py Project: Jfeng3/h2o
    def test_B_GenParity1(self):
        # Create a directory for the created dataset files. ok if already exists
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # always match the run below!
        print "\nGenerating some large row count parity datasets in", SYNDATASETS_DIR,
        print "\nmay be a minute.........."
        for x in xrange(161, 240, 20):
            # more rows!
            y = 10000 * x
            # Have to split the string out to list for pipe
            shCmdString = (
                "perl " + h2o.find_file("syn_scripts/parity.pl") + " 128 4 " + str(y) + " quad " + SYNDATASETS_DIR
            )
            # FIX! since we generate several datasets, we shouldn't have to wait
            # for each one to finish generating here before starting the runs below.
            # UPDATE: maybe EC2 takes a long time to spawn a process?
            h2o.spawn_cmd_and_wait("parity.pl", shCmdString.split(), timeout=90)
            # the algorithm for creating the path and filename is hardwired in
            # parity.pl, i.e. it uses the row count (y), so match that here
            csvFilename = "parity_128_4_" + str(y) + "_quad.data"
            sys.stdout.write(".")
            sys.stdout.flush()
        print "\nDatasets generated. Using."

        # always match the gen above!
        # Let's try it six times!
        for trials in xrange(1, 7):
            # prime
            trees = 6

            for x in xrange(161, 240, 20):
                y = 10000 * x
                print "\nTrial:", trials, ", y:", y

                csvFilename = "parity_128_4_" + str(y) + "_quad.data"
                csvPathname = SYNDATASETS_DIR + "/" + csvFilename
                # FIX! TBD do we always have to kick off the run from node 0?
                # rough guess at run time; it varies with the number of hosts/nodes
                timeoutSecs = 20 + 5 * (len(h2o.nodes))

                # change the model key each iteration, so earlier models stay in h2o
                model_key = csvFilename + "_" + str(trials)
                parseResult = h2i.import_parse(path=csvPathname, schema="put")
                h2o_cmd.runRF(
                    parseResult=parseResult, trees=trees, model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1
                )
                sys.stdout.write(".")
                sys.stdout.flush()

                # partial clean, so we can look at tree builds from this run if hang
                h2o.clean_sandbox_stdout_stderr()
Code example #4
File: test_nulls.py Project: Jfeng3/h2o
    def test_file_with_nul_chars_inserted(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # We insert <NUL> (0x00) bytes into the file and then parse it.
        # Eventually this should move to a large file, and we could compare
        # the results to a non-munged file run through the same algo.
        # The <NUL>s are presumably thrown away by parse, so they don't change
        # chunk boundary behavior (i.e. not an interesting test for RF).
        csvFilename = "poker1000"
        csvPathname = "poker/" + csvFilename
        fullPathname = h2i.find_folder_and_filename("smalldata", csvPathname, returnFullPath=True)

        nulFilename = "syn_nul.data"
        nulPathname = SYNDATASETS_DIR + "/" + nulFilename

        piece_size = 4096  # 4 KiB

        with open(fullPathname, "rb") as in_file:
            with open(nulPathname, "wb") as out_file:
                while True:
                    piece = in_file.read(piece_size)
                    if not piece:
                        break  # end of file

                    # copy the piece, then append a single <NUL> at the end
                    withNuls = bytearray(piece)
                    # FIX! we'll eventually stick a <NUL> after every byte!
                    withNuls.extend(bytearray.fromhex("00"))
                    out_file.write(withNuls)

        for trials in xrange(1, 2):
            trees = 6
            for x in xrange(161, 240, 40):
                y = 10000 * x
                print "\nTrial:", trials, ", y:", y

                timeoutSecs = 20 + 5 * (len(h2o.nodes))
                model_key = csvFilename + "_" + str(trials)

                parseResult = h2i.import_parse(path=nulPathname, schema="put")
                h2o_cmd.runRF(
                    parseResult=parseResult, trees=trees, model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1
                )
                sys.stdout.write(".")
                sys.stdout.flush()

                # partial clean, so we can look at tree builds from this run if hang
                h2o.clean_sandbox_stdout_stderr()
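
The comment at the top of this test suggests comparing results to a non-munged file run through the same algo. A minimal sketch of the first step of that comparison, assuming the source CSV contains no <NUL> bytes of its own: strip the inserted <NUL>s back out, after which the output should be byte-identical to the original. strip_nuls is a hypothetical helper.

    def strip_nuls(path_in, path_out):
        with open(path_in, "rb") as f_in, open(path_out, "wb") as f_out:
            # drop every 0x00 byte; safe only under the assumption that the
            # original file contained no <NUL>s of its own
            f_out.write(bytearray(f_in.read()).replace(b"\x00", b""))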