Example #1
    def test_simple2(self):
        # h2o-dev doesn't seem to take ../.. style paths, so make find_file return an absolute path
        a_node = h2o.nodes[0]

        # import_result = a_node.import_files(path=find_file("smalldata/logreg/prostate.csv"))
        import_result = a_node.import_files(path=find_file("smalldata/poker/poker-hand-testing.data"))
        # print dump_json(import_result)

        k = import_result['keys'][0]
        # frames_result = a_node.frames(key=k[0], len=5)

        frames_result = a_node.frames(key=k)

        frame = frames_result['frames'][0]
        byteSize = frame['byteSize']
        rows = frame['rows']
        columns = frame['columns']
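        # touch each column's summary fields as a smoke check that they exist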
        for c in columns:
            label = c['label']
            missing = c['missing']
            stype = c['type']
            zeros = c['zeros']
            domain = c['domain']

        # print dump_json(frame)

        # TODO: how do you parse multiple files?
        parse_result = a_node.parse(key=k)

        frame = parse_result['frames'][0]
        hex_key = frame['key']['name']

        verboseprint(hex_key, ":", dump_json(parse_result))
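
On the recurring "make find_file return absolute path" comment: a minimal sketch of a find_file that resolves relative dataset paths to absolute ones, assuming it probes the working directory and its parents (the search roots here are an assumption, not the harness's actual implementation):

import os

def find_file(base):
    # probe the path as given, then relative to the parent directories, and
    # hand back an absolute path so h2o-dev never sees ../.. segments
    for prefix in ('.', '..', '../..'):
        candidate = os.path.abspath(os.path.join(prefix, base))
        if os.path.exists(candidate):
            return candidate
    raise Exception("%s not found under . , .., or ../.." % base)
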
Example #2
    def test_simple2(self):
        # h2o-dev doesn't seem to take ../.. style paths, so make find_file return an absolute path
        # csvPathname = find_file("bigdata/laptop/poker-hand-testing.data")
        csvPathname = find_file("smalldata/logreg/prostate.csv")
        import_result = h2o.n0.import_files(path=csvPathname)
        # print dump_json(import_result)

        k = import_result['keys'][0]
        frames_result = h2o.n0.frames(key=k)

        frame = frames_result['frames'][0]
        rows = frame['rows']
        columns = frame['columns']
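        # touch each column's summary fields as a smoke check that they exist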
        for c in columns:
            label = c['label']
            missing = c['missing_count']
            stype = c['type']
            domain = c['domain']

        # print dump_json(frame)

        # let's see what ray's util does
        frames = h2o.n0.frames()['frames']
        # list_to_dict indexes the frames list by the nested 'key/name' field
        # (see the sketch after this example)
        frames_dict = h2o_util.list_to_dict(frames, 'key/name')
        # print "frames:", dump_json(frames)
        # print "frames_dict:", dump_json(frames_dict)
        # note: don't loop with 'k' here. in python 2 the loop variable leaks,
        # and would clobber the import key that parse() uses below
        for frame_key in frames_dict:
            print "frames_dict key:", frame_key

        # interesting. we can do dictionary comprehensions
        # { k:v for k,v in my_dict.items() if 'Peter' in k }
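        # a concrete (illustrative) use: keep only frames whose key mentions
        # 'prostate'. the substring filter is an assumption for this sketch
        prostate_frames = {fk: fv for fk, fv in frames_dict.items() if 'prostate' in fk}
        print "prostate frames:", prostate_frames.keys()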

        # TODO: how do you parse multiple files?
        parse_result = h2o.n0.parse(key=k, intermediateResults=DO_INTERMEDIATE_RESULTS)

        frame = parse_result['frames'][0]
        hex_key = frame['key']['name']

        colCount = 9
        rowCount = 380
        # colCount = 11
        # rowCount = 1000000
        start = time.time()
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "Inspect:", hex_key, "took", time.time() - start, "seconds"
        numCols = len(inspect['frames'][0]['columns'])
        numRows = inspect['frames'][0]['rows']
        print "\n" + csvPathname, \
            "    rows:", "{:,}".format(numRows), \
            "    len(columns):", "{:,}".format(numCols)

        # should match the number of cols in the header
        self.assertEqual(numCols, colCount,
            "parse created result with the wrong number of cols (got %s, expected %s)" % (numCols, colCount))
        self.assertEqual(numRows, rowCount,
            "parse created result with the wrong number of rows (header shouldn't count; got %s, expected %s)" % \
            (numRows, rowCount))

        verboseprint(hex_key, ":", dump_json(parse_result))
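
h2o_util.list_to_dict above is only seen from its call site; a hedged sketch of what it plausibly does, indexing a list of dicts by a '/'-separated nested key path (an assumption from the call site, not h2o_util's actual source):

def list_to_dict(items, key_path):
    # walk the '/'-separated path into each dict (e.g. 'key/name' ->
    # item['key']['name']) and use the value found there as the index key
    result = {}
    for item in items:
        value = item
        for part in key_path.split('/'):
            value = value[part]
        result[value] = item
    return result
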
Example #3
    def upload_file(self, f, progress=None):
        # FIX! we won't find it here if it's an hdfs://172.16.2.151/ style file
        f = find_file(f)
        if f not in self.uploaded:
            start = time.time()
            import hashlib  # the md5 module is deprecated; hashlib.md5 is a drop-in equivalent

            m = hashlib.md5()
            m.update(open(f).read())
            m.update(getpass.getuser())
            dest = '/tmp/' + m.hexdigest() + "-" + os.path.basename(f)

            # sigh. we rm/create the sandbox in build_cloud now
            # (because nosetests doesn't exec h2o_main and we don't want
            # to code "clean_sandbox()" in every test).
            # So: we don't have a sandbox here, or if we do, we're going to delete it.
            # Just don't log anything until build_cloud()? That should be okay.
            # We were just logging this upload message..not needed.
            # log('Uploading to %s: %s -> %s' % (self.http_addr, f, dest))
            sftp = self.ssh.open_sftp()
            # check if file exists on remote side
            # does paramiko have issues with big files? (>1GB, or 650MB?). maybe we don't care.
            # This would arise (as mentioned in the paramiko source, line 667,
            # http://www.lag.net/paramiko/docs/paramiko.sftp_client-pysrc.html) when there is
            # any error reading the packet, or on EOFError

            # but I'm getting sftp close here randomly at sm.
            # http://stackoverflow.com/questions/22708942/python-paramiko-module-error-with-callback
            # http://stackoverflow.com/questions/15010540/paramiko-sftp-server-connection-dropped
            # http://stackoverflow.com/questions/12322210/handling-paramiko-sshexception-server-connection-dropped
            try:
                # note we don't do an md5 compare, so if a corrupted file was uploaded
                # we won't re-upload it until we do another build.
                sftp.stat(dest)
                print "{0} Skipping upload of file {1}. File {2} exists on remote side!".format(
                    self, f, dest)
            except IOError as e:
                # if self.channel.closed or self.channel.exit_status_ready():
                #     raise Exception("something bad happened to our %s being used for sftp. keepalive? %s %s" % \
                #         (self, self.channel.closed, self.channel.exit_status_ready()))

                if e.errno == errno.ENOENT:  # no such file or directory
                    verboseprint("{0} uploading file {1}".format(self, f))
                    sftp.put(f, dest, callback=progress)
                    # if you want to track upload times
                    ### print "\n{0:.3f} seconds".format(time.time() - start)
                elif e.errno == errno.EEXIST:  # File Exists
                    pass
                else:
                    print "Got unexpected errno: %s on paramiko sftp." % e.errno
                    print "Lookup here: https://docs.python.org/2/library/errno.html"
                    # throw the exception again, if not what we expected
                    exc_info = sys.exc_info()
                    raise exc_info[1], None, exc_info[2]
            finally:
                # the snippet is truncated at this finally block; closing the sftp
                # session and recording the upload are the assumed completion
                sftp.close()
            self.uploaded[f] = dest
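
The dest name above is content-plus-user addressed: the same file uploaded by the same user always hashes to the same /tmp name, which is what lets the sftp.stat check skip the re-upload. A self-contained sketch of just the naming scheme (hashlib stands in for the deprecated md5 module; the digests are identical):

import getpass
import hashlib
import os

def upload_dest(path):
    # digest the file contents plus the uploading user, so different users
    # (or different file versions) never collide on the remote /tmp name
    m = hashlib.md5()
    m.update(open(path).read())
    m.update(getpass.getuser())
    return '/tmp/' + m.hexdigest() + "-" + os.path.basename(path)
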
Example #4
    def test_simple2(self):
        # h2o-dev doesn't seem to take ../.. style paths, so make find_file return an absolute path
        a_node = h2o.nodes[0]

        import_result = a_node.import_files(path=find_file("smalldata/logreg/prostate.csv"))
        print dump_json(import_result)

        frames = a_node.frames(key=import_result['keys'][0], len=5)['frames']
        print dump_json(frames)

        parse_result = a_node.parse(key=import_result['keys'][0])
        hex_key = parse_result['frames'][0]['key']['name']
        verboseprint(hex_key, ":", dump_json(parse_result))
Example #5
def upload_jar_to_remote_hosts(hosts, slow_connection=False):
    def prog(sofar, total):
        # the \r progress output clutters jenkins logs, so skip it for the jenkins user
        username = getpass.getuser()
        if username != 'jenkins':
            p = int((10.0*sofar)/total)
            sys.stdout.write('\rUploading jar [%s%s] %02d%%' % ('#'*p, ' ' * (10-p), (100*sofar)/total))
            sys.stdout.flush()

    if not slow_connection:
        for h in hosts:
            f = find_file('target/h2o.jar')
            h.upload_file(f, progress=prog)
            # skipping progress indicator for the flatfile
            h.upload_file(flatfile_pathname())
    else:
        f = find_file('target/h2o.jar')
        hosts[0].upload_file(f, progress=prog)
        hosts[0].push_file_to_remotes(f, hosts[1:])

        f = find_file(flatfile_pathname())
        hosts[0].upload_file(f, progress=prog)
        hosts[0].push_file_to_remotes(f, hosts[1:])
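
paramiko's SFTPClient.put calls its callback with (bytes transferred so far, total bytes), which is the contract prog is written against. A quick standalone check of the bar rendering at the halfway mark of a 10 MB transfer:

sofar, total = 5 * 1024 * 1024, 10 * 1024 * 1024
p = int((10.0 * sofar) / total)  # 5 of 10 bar cells filled
print '\rUploading jar [%s%s] %02d%%' % ('#' * p, ' ' * (10 - p), (100 * sofar) / total)
# -> Uploading jar [#####     ] 50%
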
Example #6
def upload_jar_to_remote_hosts(hosts, slow_connection=False):
    def prog(sofar, total):
        # the \r progress output clutters jenkins logs, so skip it for the jenkins user
        username = getpass.getuser()
        if username != 'jenkins':
            p = int((10.0 * sofar) / total)
            sys.stdout.write('\rUploading jar [%s%s] %02d%%' %
                             ('#' * p, ' ' * (10 - p), (100 * sofar) / total))
            sys.stdout.flush()

    if not slow_connection:
        for h in hosts:
            f = find_file('build/h2o.jar')
            h.upload_file(f, progress=prog)
            # skipping progress indicator for the flatfile
            h.upload_file(h2o_bc.flatfile_pathname())
    else:
        f = find_file('build/h2o.jar')
        hosts[0].upload_file(f, progress=prog)
        hosts[0].push_file_to_remotes(f, hosts[1:])

        f = find_file(h2o_bc.flatfile_pathname())
        hosts[0].upload_file(f, progress=prog)
        hosts[0].push_file_to_remotes(f, hosts[1:])
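
push_file_to_remotes only appears here as a call site; a hedged sketch of the fan-out it implies, copying remote-to-remote from the first host so the jar crosses the slow link only once (the scp command and the username/addr attributes are assumptions, not the harness source):

def push_file_to_remotes(self, f, hosts):
    # f was already uploaded to self at the md5-addressed dest;
    # copy host-to-host instead of re-uploading over the slow link
    dest = self.uploaded[f]
    for h in hosts:
        self.ssh.exec_command('scp %s %s@%s:%s' % (dest, h.username, h.addr, dest))
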
Example #7
    def get_h2o_jar(self):
        return find_file('build/h2o.jar')
Example #8
    def get_args(self):
        args = ['java']

        # I guess it doesn't matter if we use flatfile for both now
        # defaults to not specifying
        # FIX! we should check that the heap isn't bigger than the dram of the machine it's running on
        if self.java_heap_GB is not None:
            if not (1 <= self.java_heap_GB <= 256):
                raise Exception('java_heap_GB <1 or >256  (GB): %s' % (self.java_heap_GB))
            args += ['-Xms%dG' % self.java_heap_GB]
            args += ['-Xmx%dG' % self.java_heap_GB]

        if self.java_heap_MB is not None:
            if not (1 <= self.java_heap_MB <= 256000):
                raise Exception('java_heap_MB <1 or >256000  (MB): %s' % (self.java_heap_MB))
            args += ['-Xms%dm' % self.java_heap_MB]
            args += ['-Xmx%dm' % self.java_heap_MB]

        if self.java_extra_args is not None:
            args += ['%s' % self.java_extra_args]

        if self.use_debugger:
            # currently hardwire the base port for the debugger to 8000
            # and increment by one for every node we add.
            # since this order is different than the h2o cluster order, print out the ip and port for the user.
            # we could save debugger_port state per node, but it's not really necessary (just more consistent)
            debuggerBasePort = 8000
            if self.node_id is None:
                debuggerPort = debuggerBasePort
            else:
                debuggerPort = debuggerBasePort + self.node_id

            if self.http_addr:
                a = self.http_addr
            else:
                a = "localhost"

            if self.port:
                b = str(self.port)
            else:
                b = "h2o determined"

            # I guess we always specify port?
            print "You can attach debugger at port %s for jvm at %s:%s" % (debuggerPort, a, b)
            args += ['-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=%s' % debuggerPort]

        if self.disable_assertions:
            print "WARNING: h2o is running with assertions disabled"
        else:
            args += ["-ea"]
            

        if self.use_maprfs:
            args += ["-Djava.library.path=/opt/mapr/lib"]

        if self.classpath:
            entries = [find_file('build/classes'), find_file('lib/javassist.jar')]
            entries += glob.glob(find_file('lib') + '/*/*.jar')
            entries += glob.glob(find_file('lib') + '/*/*/*.jar')
            args += ['-classpath', os.pathsep.join(entries), 'water.Boot']
        else:
            args += ["-jar", self.get_h2o_jar()]

        if 1==1:
            if self.hdfs_config:
                args += [
                    '-hdfs_config ' + self.hdfs_config
                ]

        if h2o_args.beta_features:
            # no -beta 
            # args += ["-beta"]
            pass

        if self.network:
            args += ["-network " + self.network]

        # H2O should figure it out, if not specified.
        # DON'T EVER USE on multi-machine; h2o should always get it right, to be able to run on hadoop
        # where it's not told.
        # new 10/22/14. Allow forcing the ip when we do remote, for networks with bridges, where
        # h2o can't self-identify (does -network work?)
        if self.force_ip and self.h2o_addr: # should always have an addr if force_ip...but..
            args += [
                '-ip %s' % self.h2o_addr,
            ]

        # Need to specify port, since there can be multiple ports for an ip in the flatfile
        if self.port is not None:
            args += [
                "-port %d" % self.port,
            ]

        if self.use_flatfile:
            args += [
                '-flatfile ' + self.flatfile,
            ]

        args += [
            '-ice_root %s' % self.get_ice_dir(),
            # if I have multiple jenkins projects doing different h2o clouds, I need
            # different ports and a different cloud name.
            # does a different cloud name prevent them from joining up
            # (even on the same multicast ports?)
            # I suppose I can force a base address, or run on another machine?
        ]
        args += [
            '-name ' + self.cloud_name
        ]

        # ignore the other -hdfs args if the config is used?
        if 1==0:
            if self.hdfs_config:
                args += [
                    '-hdfs_config ' + self.hdfs_config
                ]

        # UPDATE: no longer valid to h2o?
        if 1==0 and self.use_hdfs:
            args += [
                # it's fine if hdfs_name has a ":9000" port or something too
                '-hdfs hdfs://' + self.hdfs_name_node,
                '-hdfs_version ' + self.hdfs_version,
            ]

        # UPDATE: no longer valid to h2o?
        if 1==0 and self.use_maprfs:
            args += [
                # 3 slashes?
                '-hdfs maprfs:///' + self.hdfs_name_node,
                '-hdfs_version ' + self.hdfs_version,
            ]

        if self.aws_credentials:
            args += ['-aws_credentials ' + self.aws_credentials]

        # passed thru build_cloud in test, or global from commandline arg
        if self.random_udp_drop or h2o_args.random_udp_drop:
            args += ['-random_udp_drop']

        if self.force_tcp:
            args += ['-force_tcp']

        if self.disable_h2o_log:
            args += ['-nolog']

        # psutil psopen needs param/value in different arg elements,
        # otherwise we'd need to pass one joined string and run /bin/sh.
        # this joins everything with spaces, then splits on whitespace,
        # which works as long as no pathnames have embedded spaces. That should be true
        # for unix, maybe not windows. For windows we join them into a string before use in psopen
        argsSplitByWhiteSpace = " ".join(args).split()
        return argsSplitByWhiteSpace
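
The join-then-split at the end deserves a worked example: elements like '-port 54321' are built above as single strings, while Popen wants each token as its own argv element. Illustrative, with made-up values:

args = ['java', '-Xmx4G', '-jar h2o.jar', '-port 54321']
print " ".join(args).split()
# -> ['java', '-Xmx4G', '-jar', 'h2o.jar', '-port', '54321']
# correct as long as no pathname contains an embedded space
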
Example #9
    def get_h2o_jar(self):
        return find_file('target/h2o.jar')