Exemple #1
0
 def file_current(self, fname, md5):
     """Report whether ``fname`` exists on disk and hashes to ``md5``.

     Returns False without hashing anything when ``fname`` is not an
     existing regular file; otherwise returns whether
     ``util.md5_file(fname)`` equals the expected ``md5``.
     """
     if not os.path.isfile(fname):
         return False
     return util.md5_file(fname) == md5
Exemple #2
0
    def _sync_etc(self, headless=False):
        """Wait for the child process to exit, then finalize and sync the run.

        Blocks until the child reports an exit code over ``self._socket`` (a
        2-byte message whose first byte is ``2`` and whose second byte is the
        code), until ``self.proc.poll()`` returns one, or until the user
        presses ctrl-c.  The exit code and a matching state ("finished",
        "killed" or "failed") are then stored in ``self._meta.data``, the
        metadata and system-stats helpers are shut down and the output
        streams are closed.

        When ``self._cloud`` is set, the method additionally prints the run
        summary and history, waits for the file pusher to finish, and compares
        the md5 of every uploaded file with the local copy, retrying a few
        times before reporting mismatches.

        Args:
            headless: When true, a ctrl-c neither waits for nor kills the
                child process here, and ``self._socket.done()`` is called
                once syncing has finished.

        Returns:
            None.
        """
        # Ignore SIGQUIT (ctrl-\). The child process will handle it, and we'll
        # exit when the child process does.
        #
        # We disable these signals after running the process so the child doesn't
        # inherit this behaviour.
        try:
            signal.signal(signal.SIGQUIT, signal.SIG_IGN)
        except AttributeError:  # SIGQUIT doesn't exist on windows
            pass

        exitcode = None
        try:
            while True:
                res = bytearray()
                try:
                    res = self._socket.recv(2)
                except socket.timeout:
                    # Nothing arrived in time; fall back to polling the child.
                    pass
                if len(res) == 2 and res[0] == 2:
                    # Well-formed exit message: second byte is the exit code.
                    exitcode = res[1]
                    break
                elif len(res) > 0:
                    # exitcode stays None, which is reported as 254 below.
                    wandb.termerror(
                        "Invalid message received from child process: %s" %
                        str(res))
                    break
                else:
                    exitcode = self.proc.poll()
                    if exitcode is not None:
                        break
                    time.sleep(1)
        except KeyboardInterrupt:
            exitcode = 255
            wandb.termlog('Ctrl-c pressed; waiting for program to end.')
            keyboard_interrupt_time = time.time()
            if not headless:
                # give the process a couple of seconds to die, then kill it
                while self.proc.poll() is None and (
                        time.time() - keyboard_interrupt_time) < 2:
                    time.sleep(0.1)
                if self.proc.poll() is None:
                    wandb.termlog('Program still alive. Killing it.')
                    try:
                        self.proc.kill()
                    except OSError:
                        # Best effort: the kill attempt is allowed to fail.
                        pass
        # TODO(adrian): garbage that appears in the logs sometimes
        #
        # Exception ignored in: <bound method Popen.__del__ of <subprocess.Popen object at 0x111adce48>>
        # Traceback (most recent call last):
        #   File "/Users/adrian/.pyenv/versions/3.6.0/Python.framework/Versions/3.6/lib/python3.6/subprocess.py", line 760, in __del__
        # AttributeError: 'NoneType' object has no attribute 'warn'
        wandb.termlog()

        if exitcode is None:
            exitcode = 254
            wandb.termlog(
                'Killing program failed; syncing files anyway. Press ctrl-c to abort syncing.'
            )
        elif exitcode == 0:
            wandb.termlog('Program ended.')
        else:
            wandb.termlog(
                'Program failed with code %d. Press ctrl-c to abort syncing.'
                % exitcode)

        self._meta.data["exitcode"] = exitcode
        if exitcode == 0:
            self._meta.data["state"] = "finished"
        elif exitcode == 255:
            self._meta.data["state"] = "killed"
        else:
            self._meta.data["state"] = "failed"
        self._meta.shutdown()
        self._system_stats.shutdown()
        # NOTE(review): exitcode is never None at this point, so `or 254` only
        # has an effect when exitcode is 0, in which case 254 is passed on.
        # Confirm that this is intended.
        self._close_stdout_stderr_streams(exitcode or 254)

        # If we're not syncing to the cloud, we're done
        if not self._cloud:
            self._socket.done()
            return None

        # Show run summary/history
        self._run.summary.load()
        summary = self._run.summary._summary
        if len(summary):
            wandb.termlog('Run summary:')
            max_len = max(len(k) for k in summary.keys())
            format_str = '  {:>%s} {}' % max_len
            for k, v in summary.items():
                wandb.termlog(format_str.format(k, v))
            # NOTE(review): the history is only loaded when the summary is
            # non-empty, yet its keys are read unconditionally below. Confirm
            # that this nesting is intended.
            self._run.history.load()

        history_keys = self._run.history.keys()
        if len(history_keys):
            wandb.termlog('Run history:')
            max_len = max(len(k) for k in history_keys)
            # Right-align every key to the width of the longest one.
            format_str = u'  {:>%s} {}' % max_len
            for key in history_keys:
                vals = util.downsample(self._run.history.column(key), 40)
                line = sparkline.sparkify(vals)
                wandb.termlog(format_str.format(key, line))

        if self._run.has_examples:
            wandb.termlog('Saved %s examples' % self._run.examples.count())

        wandb.termlog('Waiting for final file modifications.')
        # This is a heuristic delay to catch files that were written just before
        # the end of the script.
        # TODO: ensure we catch all saved files.
        # TODO(adrian): do we need this?
        time.sleep(2)
        try:
            # avoid hanging if we crashed before the observer was started
            if self._observer.is_alive():
                self._observer.stop()
                self._observer.join()
        # TODO: py2 TypeError: PyCObject_AsVoidPtr called with null pointer
        except TypeError:
            pass
        # TODO: py3 SystemError: <built-in function stop> returned a result with an error set
        except SystemError:
            pass

        for handler in self._event_handlers.values():
            handler.finish()
        self._file_pusher.finish()

        wandb.termlog('Syncing files in %s:' %
                      os.path.relpath(self._watch_dir))
        for file_path in self._stats.files():
            wandb.termlog('  %s' % os.path.relpath(file_path, self._watch_dir))
        step = 0
        spinner_states = ['-', '\\', '|', '/']
        self._stats.update_all_files()
        while True:
            # Sample liveness before rendering so that the progress line is
            # drawn one last time after the pusher has stopped.
            stop = not self._file_pusher.is_alive()
            summary = self._stats.summary()
            line = (
                ' %(completed_files)s of %(total_files)s files,'
                ' %(uploaded_bytes).03f of %(total_bytes).03f bytes uploaded\r'
                % summary)
            line = spinner_states[step % len(spinner_states)] + line
            step += 1
            wandb.termlog(line, newline=False)
            if stop:
                break
            time.sleep(0.25)
        # clear progress line.
        wandb.termlog(' ' * 79)

        # Check md5s of uploaded files against what's on the file system.
        # TODO: We're currently using the list of uploaded files as our source
        #     of truth, but really we should use the files on the filesystem
        #     (ie if we missed a file this wouldn't catch it).
        # This polls the server, because there is a delay between when the file
        # is done uploading, and when the datastore gets updated with new
        # metadata via pubsub.
        wandb.termlog('Verifying uploaded files... ', newline=False)
        # File names that are excluded from md5 verification. Each name is
        # compared against fname explicitly: a bare `or OUTPUT_FNAME` would be
        # truthy for every file and silently skip the whole check.
        skipped_fnames = ('wandb-history.h5', OUTPUT_FNAME)
        max_attempts = 4
        mismatched = []
        for attempt in range(max_attempts):
            mismatched = []
            download_urls = self._api.download_urls(self._project,
                                                    run=self._run.id)
            for fname, info in download_urls.items():
                if fname in skipped_fnames:
                    continue
                local_path = os.path.join(self._watch_dir, fname)
                # A file that is no longer present locally cannot be hashed;
                # record it as a mismatch instead of failing inside md5_file.
                if os.path.isfile(local_path):
                    local_md5 = util.md5_file(local_path)
                else:
                    local_md5 = None
                if local_md5 != info['md5']:
                    mismatched.append((local_path, local_md5, info['md5']))
            # Only wait when another attempt will actually follow.
            if not mismatched or attempt == max_attempts - 1:
                break
            retry_delay = attempt ** 2
            wandb.termlog('  Retrying after %ss' % retry_delay)
            time.sleep(retry_delay)

        error = bool(mismatched)
        if mismatched:
            # Terminate the "Verifying uploaded files... " line first.
            print('')
            for local_path, local_md5, remote_md5 in mismatched:
                wandb.termerror(
                    '%s (%s) did not match uploaded file (%s) md5' %
                    (local_path, local_md5, remote_md5))
        else:
            print('verified!')

        if error:
            wandb.termerror('Sync failed %s' % self.url)
        else:
            wandb.termlog('Synced %s' % self.url)

        if headless:
            self._socket.done()