Example #1
0
def run_daemon(script, kwargs, executable=sys.executable):
    """Launch ``script`` as a detached background process.

    The command line is ``nohup <executable> <script>`` followed by the
    arguments produced by ``cmd_args(**kwargs)``.

    :param script: path of the script to run
    :param kwargs: mapping expanded into ``cmd_args`` to build the arguments
    :param executable: interpreter used to run the script
    :return: the ``subprocess.Popen`` handle of the started process
    """
    args = ['nohup', executable, script] + cmd_args(**kwargs)
    logger.debug(' '.join(args))
    # subprocess.DEVNULL discards the output without leaving two open
    # /dev/null file objects behind in the calling process.
    return subprocess.Popen(args,
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.DEVNULL,
                            # run the child in its own process group
                            preexec_fn=os.setpgrp)
Example #2
0
  def custom_getter(self, getter, name, *args, **kwargs):
    """Return the variable for ``name``, preferring an entry of ``self.reuse_vars``.

    The lookup key is ``name + ':0'`` made relative to this module's scope
    name. On a miss the variable is created through ``getter`` and must be
    registered in the VARIABLES or LOCAL_VARIABLES collection.

    :raises Exception: if the created variable is in neither collection
    """
    key = relpath(name + ':0', self._scope.name)
    existing = self.reuse_vars.get(key)

    if existing:
      logger.debug("reuse " + key + " - " + existing.name)
      return existing

    created = getter(name, *args, **kwargs)
    logger.debug("create {} - {}".format(key, created.name))

    collections = in_collections(created)
    registered = (tf.GraphKeys.VARIABLES in collections,
                  tf.GraphKeys.LOCAL_VARIABLES in collections)
    if not any(registered):
      raise Exception("Error: collections have to contain tf.GraphKeys.VARIABLES or tf.GraphKeys.LOCAL_VARIABLES")

    return created
Example #3
0
File: server.py Project: rmst/chi
 def on_found(self, is_dir, path):
   """Register ``path`` as an experiment if it is a directory containing ``CONFIG_NAME``."""
   if not is_dir:
     return
   if not os.path.exists(os.path.join(path, CONFIG_NAME)):
     return

   self.exps[path] = Experiment(path, self.host, self.port, self)
   if self.socketio:
     logger.debug(f'{len(self.exps)} experiments')
Example #4
0
File: server.py Project: rmst/chi
  def on_created(self, event):
    """Log a creation event and hand the created path to ``on_found``."""
    # presumably gives the creator time to write the config file -- TODO confirm
    sleep(3)
    config_path = os.path.join(event.src_path, CONFIG_NAME)
    has_config = os.path.exists(config_path)
    logger.debug('Folder created ' + str(event))
    logger.debug('is exp ' + str(has_config))

    self.on_found(event.is_directory, event.src_path)
Example #5
0
    def __init__(self, path):
        """Load the config stored at ``path`` and take an exclusive lock on the file.

        The previous content is read first. The file is then reopened in
        ``'w+'`` mode (which truncates it) and locked with a non-blocking
        ``flock``. While the lock is held elsewhere, the ``pid`` found in
        the previous content is sent SIGTERM and locking is retried, up to
        10 times with a short pause between attempts.

        :param path: location of the config file
        :raises AssertionError: if the lock could not be obtained
        """
        import json
        import signal
        self._path = path
        # Defined up front so that a failure to lock ends in the intended
        # AssertionError instead of an AttributeError on a missing attribute.
        self._locked = False
        try:
            with open(path) as f:
                old_data = json.load(f)
        except json.JSONDecodeError:
            logger.warning('Could not decode config')
            old_data = {}
        except OSError:
            logger.debug('No config file')
            old_data = {}

        for i in range(10):
            # NOTE(review): 'w+' truncates the file before the lock is held.
            self._f = open(path, 'w+')
            try:
                fcntl.flock(self._f, fcntl.LOCK_EX | fcntl.LOCK_NB)
            except BlockingIOError:
                # Close the handle of the failed attempt; otherwise every
                # retry leaks one open file.
                self._f.close()
                pid = old_data.get('pid')
                if pid:
                    logger.info(
                        f'Config file is locked (try {i}). Killing previous instance {pid}'
                    )
                    os.kill(pid, signal.SIGTERM)
                else:
                    logger.error('Config file is locked and no pid to kill')
                # Pause in both cases so the current holder gets a chance
                # to release the lock before the next attempt.
                time.sleep(.05)
            else:
                self._locked = True
                break

        # Explicit raise: a plain assert would be stripped under ``python -O``.
        if not self._locked:
            raise AssertionError('Could not lock config file ' + str(path))
Example #6
0
    def command(self, cmd):
        """Execute the command ``cmd`` for this experiment.

        ``"kill"`` cancels the slurm job recorded in ``self.data`` via
        ``scancel``, or otherwise sends SIGTERM to the recorded pid.
        ``"run"`` restarts the script through ``run_daemon`` using the
        recorded executable, argv and args.

        :return: an empty dict for ``"kill"`` and ``"run"``; ``None`` for
            any other command
        """
        import signal

        if cmd == "kill":
            slurm_info = self.data.get('slurm')
            if not slurm_info:
                pid = self.data.get('pid')
                if pid:
                    logger.debug('send kill to ' + str(pid))
                    os.kill(pid, signal.SIGTERM)
                return {}
            job_id = slurm_info.get('SLURM_JOB_ID') or slurm_info.get('SLURM_JOBID')
            status = subprocess.check_call(('scancel', str(job_id)))
            assert status == 0
            return {}

        if cmd == "run":
            from chi.util import run_daemon
            executable = self.data.get('sys_executable')
            argv = self.data.get('sys_argv')
            script_args = self.data.get('args')

            if not (executable and argv and script_args):
                logger.debug('run failed because of exec ' + str(executable) +
                             ' argv ' + str(argv) + ' args ' + str(script_args))
                return {}
            run_daemon(argv[0], script_args, executable)
            return {}
Example #7
0
 def on_moved(self, event):
     """Log a move event.

     Attributes of ``event`` as noted by the original author:

     event.event_type
         'modified' | 'created' | 'moved' | 'deleted'
     event.is_directory
         True | False
     event.src_path
         path/to/observed/file
     """
     description = str(event)
     logger.debug(description)
Example #8
0
  def __init__(self, variables_dict, tau=0.001):
    """Create exponential moving averages for a dictionary of variables.

    :param variables_dict: mapping from a name to the variable to average
    :param tau: update rate; the decay passed to TensorFlow is ``1 - tau``
    """
    vs = tf.get_variable_scope().name
    with tf.name_scope(""):
      self.vars = variables_dict
      name = "{}/ExponentialMovingAverage".format(vs) if vs else "ExponentialMovingAverage"
      self.ema = tf.train.ExponentialMovingAverage(decay=1 - tau, name=name)

      # Hand over a real list rather than a dict view.
      self.upd = self.ema.apply(list(self.vars.values()))  # also creates shadow vars

      self.averages = {n: self.ema.average(v) for n, v in variables_dict.items()}
      # Dict views cannot be indexed on Python 3 (``values()[0]`` raises
      # TypeError), so the first average is taken through an iterator.
      # Nothing is logged for an empty dictionary.
      first = next(iter(self.averages.values()), None)
      if first is not None:
        logger.debug(first.name)
Example #9
0
File: util.py Project: rmst/chi
    def __init__(self, paths):
        """Schedule a watch for every path and report the entries already present.

        :param paths: directories to observe; ``~`` is expanded
        """
        super().__init__()
        self.watches = set()

        for p in paths:
            p = os.path.expanduser(p)
            logger.debug('watch ' + p)
            self.watches.add(Repo.observer.schedule(self, p))
            # The context manager closes the scandir iterator promptly.
            with os.scandir(p) as entries:
                for entry in entries:
                    # is_dir is a method and has to be called; passing the
                    # bound method itself is always truthy.
                    self.on_found(entry.is_dir(), entry.path)
Example #10
0
 def tb_killer(self):
     """Terminate tensorboard once the heartbeat ``self.tb_t`` is older than 60 seconds.

     Checks every 5 seconds for as long as the process captured at the
     start of the call is still running.
     """
     tb = self.tb
     # poll() returns None only while the process is alive. The previous
     # ``not tb.poll()`` also treated exit code 0 as "running", so the loop
     # never ended after a clean exit.
     while tb is not None and tb.poll() is None:
         if time() - self.tb_t > 60:
             assert isinstance(tb, subprocess.Popen)
             tb.terminate()
             logger.debug('tensorboard for {} kill because timeout'.format(
                 self.path))
         sleep(5)
     logger.debug('killer finish')
Example #11
0
 def check_tb():
     """Return True if tensorboard answers an HTTP GET with status 200, False on a connection error."""
     address = "http://{}:{}".format(self.host, self.tb_port)
     try:
         # requests.head not supported by tensorboard
         response = requests.get(address)
     except requests.ConnectionError:
         return False
     is_up = response.status_code == 200
     sleep(.3)
     logger.debug('tb on {} status {}, {}'.format(
         address, response.status_code, response.reason))
     return is_up
Example #12
0
 def tb_watcher(self):
     """Wait for the tensorboard process to end, clear ``self.tb`` and log the outcome."""
     assert isinstance(self.tb, subprocess.Popen)
     outs, errs = self.tb.communicate()
     exit_code = self.tb.returncode
     self.tb = None
     msg = 'tensorboard on {} for {} returned with code {}'.format(
         self.tb_port, self.path, exit_code)
     # a non-zero exit code is reported together with the captured output
     if exit_code != 0:
         logger.warning(f'{msg}\n out: {outs}\n err: {errs}')
     else:
         logger.debug(msg)
     logger.debug('tb watcher finished')
Example #13
0
File: util.py Project: rmst/chi
def get_free(pool):
    """Return the first port of ``pool`` for which ``check_free`` succeeds.

    The pool is scanned up to 20 times with a pause of 0.1 s after each
    unsuccessful scan.

    :param pool: iterable of candidate ports
    :return: a free port, or ``None`` if none was found
    """
    for _ in range(20):
        free_ports = [port for port in pool if check_free(port)]
        logger.debug('Free ports' + str(free_ports))
        if free_ports:
            return free_ports[0]
        sleep(.1)

    logger.error('No ports available')
    return None
Example #14
0
    def tensorboard(self):
        """Start tensorboard for this experiment, or refresh its heartbeat if already running.

        :return: ``dict(no_event_files=True)`` when no ``*.tfevents*`` file
            exists below ``self.path``; otherwise a dict with the keys
            ``host``, ``port``, ``new``, ``available`` and ``no_event_files``
        """
        import os  # local import: only needed to build the glob pattern

        # Join with a real path separator. Plain concatenation produced
        # "<path>**/..." whenever self.path had no trailing slash; "**" is
        # then not a path component of its own, so the search was not
        # recursive and also matched sibling directories sharing the prefix.
        pattern = os.path.join(self.path, '**', '*.tfevents*')
        has_event_files = glob.glob(pattern, recursive=True)
        if not has_event_files:
            return dict(no_event_files=True)

        elif not self.tb:
            self.tb_port = get_free(self.server.port_pool)
            cmds = [
                'tensorboard', '--logdir', "{}".format(self.path), '--host',
                '0.0.0.0', '--port',
                str(self.tb_port)
            ]
            logger.debug('Start tensorboard with: ' + ' '.join(cmds))
            self.tb = subprocess.Popen(cmds,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       universal_newlines=True)
            # tb_watcher collects the output and logs the exit code
            Thread(target=self.tb_watcher, daemon=True).start()

            # NOTE(review): after decoration ``check_tb`` is whatever
            # repeat_until returns -- presumably the final polling result;
            # confirm against repeat_until.
            @repeat_until(timeout=6.)
            def check_tb():
                try:
                    url = "http://{}:{}".format(self.host, self.tb_port)
                    r = requests.get(
                        url)  # requests.head not supported by tensorboard
                    available = r.status_code == 200
                    sleep(.3)
                    logger.debug('tb on {} status {}, {}'.format(
                        url, r.status_code, r.reason))
                    return available
                except requests.ConnectionError:
                    return False

            if not check_tb:
                logger.warning('tb could not be started')

            self.tb_t = time()
            Thread(target=self.tb_killer, daemon=True).start()
            return dict(host=self.host,
                        port=self.tb_port,
                        new=True,
                        available=check_tb,
                        no_event_files=False)

        else:
            self.tb_t = time()  # heartbeat
            return dict(host=self.host,
                        port=self.tb_port,
                        new=False,
                        available=True,
                        no_event_files=False)
Example #15
0
File: server.py Project: rmst/chi
  def on_connect(self):
    """Send the ``info`` message to a new client, poll the experiments and count the connection."""
    payload = dict(jupyter_port=self.jupyter_port,
                   user=os.environ.get('USER'),
                   bashrc=self.bashrc)
    self.emit('info', payload)

    self.experiments()  # poll file system

    self.connections += 1
    logger.debug(f'connect ({self.connections})')
Example #16
0
File: resource.py Project: rmst/chi
 def wrap(*args, **kwargs):
     """Return the stored value for ``args``, creating it with ``f`` when missing.

     Only the positional arguments form the lookup key; keyword arguments
     are passed to ``f`` but are not part of the key. Every call refreshes
     the resource's timestamp ``t``.
     """
     key = args
     resource = store.get(key)
     if not resource:
         resource = Resource()
         resource.key = key
         resource.v = f(*args, **kwargs)
         if hasattr(resource.v, 'release'):
             releaser = resource.v.release
             resource.release = releaser
             resource._release = releaser
         resource.to = timeout
         logger.debug('resource stored')
         store[key] = resource
     resource.t = time()
     return resource.v
Example #17
0
File: server.py Project: rmst/chi
  def __init__(self, host, port, rootdir, port_pool, polling_interval=20):
    """Start jupyter (if installed), prepare the ``~/.chi`` layout and watch the roots.

    :param host: stored as ``self.host``
    :param port: port of the board; also put into the Content-Security-Policy passed to jupyter
    :param rootdir: first of the root directories that are created and watched
    :param port_pool: ports from which a free one is picked for jupyter
    :param polling_interval: not used by this constructor
    """
    self.port_pool = port_pool
    self.rootdir = rootdir
    self.host = host
    self.port = port
    self.exps = {}

    # Start jupyter
    jpt = shutil.which('jupyter')
    # -1 marks "no jupyter available"
    self.jupyter_port = p = get_free(self.port_pool) if jpt else -1
    if jpt:
      csp = str(dict(headers={'Content-Security-Policy':
                              f"frame-ancestors 'self' http://localhost:{self.port}/"}))

      logger.debug(f'Start jupyter ({jpt}) on port {p}')
      self.jupyter = subprocess.Popen([jpt, 'notebook', '--port='+str(p),
                                       '--no-browser', '/',
                                       "--NotebookApp.token=''",
                                       f"--NotebookApp.tornado_settings={csp}",
                                       f"--FileContentsManager.hide_globs=['']"],
                                      stdout=subprocess.DEVNULL,
                                      stderr=subprocess.DEVNULL,
                                      )

    Namespace.__init__(self, '/experiments')

    # Init Repo
    alt = '/tmp/chi_' + getpass.getuser()
    # lexists also reports a dangling symlink. exists() follows the link,
    # misses it, and the os.symlink below then fails with FileExistsError.
    if os.path.lexists(alt):
      os.remove(alt)

    root = os.path.expanduser('~/.chi')
    os.symlink(root, alt, target_is_directory=True)
    roots = [rootdir,
             join(root, 'experiments'),
             join(root, 'board'),
             join(root, 'apps')]
    for p in roots:
      mkdirs(os.path.expanduser(p))

    bashrc = os.path.expanduser('~/.chi') + '/bashrc.sh'
    # same reasoning as above: the link dangles when ~/.bashrc is missing
    if os.path.lexists(bashrc):
      os.remove(bashrc)
    os.symlink(os.path.expanduser('~/.bashrc'), bashrc)
    self.bashrc = bashrc

    self.connections = 0
    Repo.__init__(self, roots)

    Repo.observer.start()
Example #18
0
 def tb_watcher(self):
     """Block until tensorboard exits, clear ``self.tb`` and log exit code and output."""
     assert isinstance(self.tb, subprocess.Popen)
     outs, errs = self.tb.communicate()
     exit_code = self.tb.returncode
     self.tb = None
     msg = 'tensorboard on {} for {} returned with code {}'.format(
         self.tb_port, self.path, exit_code)
     if exit_code != 0:
         warn = logger.warning
         warn(msg)
         warn('out: ' + outs)
         warn('err: ' + errs)
     else:
         logger.debug(msg)
     print('watcher finish')
Example #19
0
def remote_install_dependency(address, module):
    """Upload the repository containing ``module`` to a remote host and pip-install it there.

    The repository path is ``module.__file__`` without its last two
    components. It is copied below ``/home/<user>/.chi/cache`` on the remote
    side and installed with ``pip3 install --user -e``.

    :param address: ssh destination of the form ``user@host``
    :param module: imported module whose repository is uploaded
    :raises subprocess.CalledProcessError: if the remote command fails
    """
    import shlex  # local import: only needed to quote the remote command

    user, host = address.split('@')
    rem = f"/home/{user}/.chi/cache"
    repo = join('/', *module.__file__.split('/')[:-2])
    target = address + ':' + rem + repo
    logger.debug(f"Uploading {repo} to {target}")
    copydir(repo, target, with_src=False)
    # Quote the path so that spaces or shell metacharacters in it survive
    # the remote shell.
    cmd = f'pip3 install --user -e {shlex.quote(rem + repo)}'

    try:
        # check_output (not check_call) so that e.output is available below
        subprocess.check_output(['ssh', address, f'echo {shlex.quote(cmd)}; {cmd}'],
                                universal_newlines=True)
    except subprocess.CalledProcessError as e:
        logger.error(
            f'Install failed with code {e.returncode} and output:\n{e.output}')
        raise
Example #20
0
  def __init__(self, name, reuse_vars=None, session=None, logging_path=None, **kwargs):
    """Create the module's variable scope and install ``custom_getter`` on it.

    :param name: name of the variable scope
    :param reuse_vars: mapping consulted by ``custom_getter``; empty if omitted
    :param session: session to use; falls back to ``tf.get_default_session()``
    :param logging_path: stored as ``self._logging_path``
    """
    self._finalized = False
    self._writer = None
    self._logging_path = logging_path
    self.output = None
    self.inputs = None
    self.reuse_vars = reuse_vars if reuse_vars else {}

    # The scope is entered and left immediately; only the scope object is kept.
    with tf.variable_scope(name, reuse=False) as scope:
      self._scope = scope

    logger.debug("module: " + self._scope.name)

    self._scope.set_custom_getter(self.custom_getter)
    self._session = session if session else tf.get_default_session()
Example #21
0
File: server.py Project: rmst/chi
 def on_deleted(self, event):
   """Log the event and, for a directory, delete and forget the experiment stored for its path."""
   logger.debug(str(event))
   if not event.is_directory:
     return

   path = event.src_path
   experiment = self.exps.get(path)
   if experiment:
     experiment.delete()
     del self.exps[path]
     logger.debug('actually deleted exp')
   if self.socketio:
     logger.debug(f'{len(self.exps)} experiments')
Example #22
0
File: server.py Project: rmst/chi
  def on_disconnect(self):
    """Decrease the connection counter by one and log its new value."""
    remaining = self.connections - 1
    self.connections = remaining
    logger.debug(f'disconnect ({self.connections})')
Example #23
0
 def on_modified(self, event):
     """Call ``self.server.upd()`` when the modified file is this experiment's config file."""
     config_path = os.path.join(self.path, CONFIG_NAME)
     if event.src_path != config_path:
         return
     logger.debug(f'{event.src_path} modified')
     self.server.upd()
Example #24
0
File: util.py Project: rmst/chi
 def kill():
     """Close ``f`` and kill process ``p`` if it has not exited yet."""
     f.close()
     # poll() is None only while the process is alive; ``not p.poll()`` was
     # also true for a process that had already exited with code 0.
     if p.poll() is None:
         logger.debug('Killing process started with ' + str(cmd))
         p.kill()
Example #25
0
    def __init__(self, f, logdir=None, *args, **kwargs):
        """
        Build the graph of ``f`` with one placeholder per parameter of its signature.

        :param f: function
        :param logdir: string; when given, a summary FileWriter is created
            for it. A path not starting with '/' is resolved against the
            current app's logdir, or against the working directory when
            no app logdir is available.
        :param args: forwarded to ``Model.__init__``
        :param kwargs: forwarded to ``Model.__init__``
        """
        self._step = 0
        # Init model parent class
        Model.__init__(self, f, *args, **kwargs)

        # process inputs
        import collections

        # for each parameter, inputs maps the parameter name to its placeholder
        self.inputs = collections.OrderedDict()

        # for each parameter, auto_wrap maps the parameter name to whether its shape is a list
        self.auto_wrap = collections.OrderedDict()
        for name, dtype, shape, default in parse_signature(f):
            # If a default value has been provided, the placeholder does not
            # have to be fed: the default is its output.
            # NOTE(review): a falsy default (0, False, ...) is treated like
            # "no default" here -- confirm this is intended.
            if default:
                p = tf.placeholder_with_default(default, shape)
            else:
                p = tf.placeholder(dtype, shape, name)
            self.auto_wrap.update({name: isinstance(shape, list)})
            self.inputs.update({name: p})

        # True if the shape of any parameter was given as a list
        self.use_wrap = any(self.auto_wrap.values())

        # build graph
        out = super().__call__(**self.inputs)  # build Model
        # make SubGraph properties available in self
        self.__dict__.update(self._last_graph.__dict__)

        # process outputs
        if out is None:
            self.output = tf.no_op()

        # If a log dir is specified, then create a FileWriter, passing the graph just defined
        if logdir:
            current_app = chi.App.current_app
            if not logdir.startswith('/'):
                # Read the app's logdir only when an app exists; reading it
                # unconditionally raised AttributeError on ``None``.
                app_logdir = current_app.logdir if current_app else None
                logger.debug(
                    'logdir path relative to app: {}, app logdir: {}'.format(
                        current_app, app_logdir))
                if app_logdir:
                    logdir = app_logdir + '/' + logdir
                else:
                    logger.debug(
                        'fall back to logdir path relative to working dir')
                    # The result has to be assigned; it used to be discarded.
                    logdir = os.path.abspath('./' + logdir)
            # Write the session graph
            self.writer = tf.summary.FileWriter(
                logdir, graph=chi.chi.get_session().graph)
        else:
            self.writer = None

        # collect activations tensor (currently not used further)
        activations = self.get_tensors_by_optype(
            'Relu')  # TODO: generalize to non-Relu
        summaries = self.summaries()

        # If there are summaries, and a writer to write to it
        if summaries and self.writer:
            self._summary_op = tf.summary.merge(summaries)

        super().initialize()
Example #26
0
File: main.py Project: rmst/chi
def chiboard(self: chi.Experiment, host='localhost', port=MAGIC_PORT, rootdir='',
             loglevel='debug', timeout=24*60*60, port_pool=""):
  """Run the chi board web server (Flask + Socket.IO) until it stops.

  :param host: interface the server is bound to
  :param port: port to listen on; ``0`` picks one via ``get_free_port`` and prints it
  :param rootdir: experiment root passed to ``Server``; when empty,
      ``$CHI_EXPERIMENTS`` or ``'~'`` is used
  :param loglevel: level passed to ``chi.set_loglevel``; ``'debug'`` also
      turns on ``log_output`` of socketio
  :param timeout: seconds without any HTTP request after which the process
      sends SIGINT to itself
  :param port_pool: comma-separated ports passed to ``Server``; when empty,
      the 29 ports following ``port`` are used
  """
  from flask import Flask, jsonify, send_from_directory, send_file
  from chi.board.server import Server
  from chi.board.util import rcollect
  from chi.board.util import get_free_port
  from chi.logger import logger

  import os
  import signal
  from time import time, sleep
  from threading import Thread
  from os.path import expanduser as expandu
  from flask_socketio import SocketIO

  def expanduser(p):
    # URL path parameters arrive without the leading slash; restore it
    # unless the expansion already produced an absolute path.
    pa = expandu(p)
    return pa if pa.startswith('/') else '/' + pa

  chi.set_loglevel(loglevel)

  if port == 0:
    port = get_free_port(host)
    print(f'{port}')

  self.config.port = port

  p = os.path.dirname(os.path.realpath(__file__))
  app = Flask(__name__, root_path=p, static_url_path='/')

  socketio = SocketIO(app)

  if rootdir == '':
    rootdir = os.environ.get('CHI_EXPERIMENTS') or '~'
    logger.debug('Rootdir: ' + rootdir)

  if port_pool:
    port_pool = [int(p) for p in port_pool.split(',')]
  else:
    port_pool = range(port + 1, port + 30)

  server = Server(host, port, rootdir, port_pool)

  remotes = []
  p = expanduser('~/.chi/board/remotes.json')
  if os.path.exists(p):
    with open(p) as f:
      remotes = json.load(f)

  # timestamp of the most recent request, shared with the killer thread
  state = dict(last_request=time())

  def killer():
    # Stop the whole process once no request has arrived for `timeout` seconds.
    while time() - state['last_request'] < timeout:
      sleep(2)
    logger.error('timeout')
    os.kill(os.getpid(), signal.SIGINT)  # kill self

  Thread(target=killer, daemon=True).start()

  @app.before_request
  def tick():
    state.update(last_request=time())

  @app.route("/")
  def index():
    return send_file("components/index.html")

  @app.route("/favicon")
  def favicon():
    return send_file("components/favicon.png")

  @app.route('/bower_components/<path:path>')
  def bower(path):
    return send_from_directory('bower_components', path)

  @app.route('/components/<path:path>')
  def comp(path):
    return send_from_directory('components', path)

  @app.route("/exp/")
  def exp():
    return send_file("components/experiment.html")

  @app.route("/info/<string:host>/<path:path>")  # experiment page
  def info(host, path):
    if host == 'local':
      return jsonify(server.info(expanduser(path)))
    else:
      raise Exception('Remote not yet supported')
      # request scripts info
      # update urls

  @app.route("/logs/<path:path>")
  def logs(path):
    data = []

    def key(x):
      # 'stdout' is sorted under the key '_'
      k = '_' if x == 'stdout' else x
      return k

    path = expanduser(path) + '/logs'

    for p in sorted(os.listdir(path), key=key):
      # errors='replace': the seek below may land inside a multi-byte
      # character, which would otherwise abort the request with a
      # UnicodeDecodeError.
      with open(path + '/' + p, 'r', errors='replace') as f:
        f.seek(0, os.SEEK_END)
        size = f.tell()
        # only the tail of the file (about 50000 bytes) is returned
        f.seek(max(0, size - 50000), 0)
        # rstrip removes all trailing newlines in a single pass
        c = f.read().rstrip('\n')
        data.append({'name': os.path.basename(p), 'content': c})

    return jsonify(data)

  @app.route("/tb/<string:host>/<path:path>")
  def tb(host, path):
    if host == 'local':
      return jsonify(server.tensorboard(expanduser(path)))
    else:
      raise Exception('Remote not yet supported')
      # make local port forward
      # request scripts tensorboard
      # update urls

  @app.route("/delete/<path:path>")
  def delete(path):
    return jsonify(server.delete(expanduser(path)))

  @app.route("/trend/<path:path>")
  def trend(path):
    sio = server.trend('/' + path)
    return send_file(sio, attachment_filename='trend.png', mimetype='image/png')

  @app.route("/<string:cmd>/<path:path>")
  def command(cmd, path):
    return jsonify(server.command(cmd, expanduser(path)))

  try:
    socketio.on_namespace(server)
    socketio.run(app, host=host, port=port, log_output=loglevel == 'debug')
  finally:
    server.shutdown()
Example #27
0
File: resource.py Project: rmst/chi
 def release(self):
     """Call the stored ``_release`` callback, if there is one, and remove this resource from ``store``."""
     logger.debug('release rsrc')
     try:
         callback = self._release
     except AttributeError:
         pass  # nothing to release
     else:
         callback()
     del store[self.key]