import os
import random
import signal
import sys
import time
from string import ascii_letters
from threading import Thread, Lock as TLock

from compat import unicode
from extras import json_encode, json_decode, DotDict
from dispatch import JobError
from status import statmsg_sink, children, print_status_stacks, status_stacks_export

# JLock and BaseWebHandler are defined elsewhere in the project (not shown in this excerpt).

DEBUG_WRITE_JSON = False

def gen_cookie(size=16):
    return ''.join(random.choice(ascii_letters) for _ in range(size))

# This contains cookie: {lock, last_error, last_time} for all jobs, main jobs have cookie None.
job_tracking = {None: DotDict(lock=JLock(), last_error=None, last_time=0)}

# This needs .ctrl to work. It is set from main()
class XtdHandler(BaseWebHandler):
    server_version = "scx/0.1"
    DEBUG = not True

    def log_message(self, format, *args):
        return

    def encode_body(self, body):
        if isinstance(body, bytes):
            return body
        if isinstance(body, unicode):
            return body.encode('utf-8')
    def _handle_req(self, path, args):
        if path[0] == 'status':
            data = job_tracking.get(args.get('subjob_cookie') or None)
            if not data:
                self.do_response(500, 'text/plain', 'bad subjob_cookie!\n')
                return
            timeout = min(float(args.get('timeout', 0)), 128)
            status = DotDict(idle=data.lock.acquire(False))
            deadline = time.time() + timeout
            while not status.idle and time.time() < deadline:
                time.sleep(0.1)
                status.idle = data.lock.acquire(False)
            if status.idle:
                if data.last_error:
                    status.last_error = data.last_error
                    data.last_error = None
                else:
                    status.last_time = data.last_time
                data.lock.release()
            elif path == ['status', 'full']:
                status.status_stacks, status.current = status_stacks_export()
            self.do_response(200, "text/json", status)
            return
        elif path == ['list_workspaces']:
            ws = {k: v.path for k, v in self.ctrl.list_workspaces().items()}
            self.do_response(200, "text/json", ws)
        elif path == ['config']:
            self.do_response(200, "text/json", self.ctrl.config)
        elif path == ['update_methods']:
            self.do_response(200, "text/json", self.ctrl.update_methods())
        elif path == ['methods']:
            # return a json with everything the Method object knows about the methods
            self.do_response(200, "text/json", self.ctrl.get_methods())
        elif path[0] == 'method_info':
            method = path[1]
            self.do_response(200, "text/json", self.ctrl.method_info(method))
        elif path[0] == 'set_workspace':
            _ws = path[1]
            if _ws not in self.ctrl.list_workspaces():
                self.do_response(500, 'text/plain', 'Undefined workspace "%s"\n' % _ws)
            else:
                self.ctrl.set_workspace(_ws)
                self.do_response(200, 'text/plain', 'Workspace set to "%s"\n' % _ws)
        elif path[0] == 'workspace_info':
            self.do_response(200, 'text/json', self.ctrl.get_workspace_details())
        elif path[0] == 'abort':
            tokill = list(children)
            print('Force abort', tokill)
            for child in tokill:
                os.killpg(child, signal.SIGKILL)
            self.do_response(200, 'text/json', {'killed': len(tokill)})
        elif path == ['submit']:
            if self.ctrl.broken:
                self.do_response(500, "text/json", {
                    'broken': self.ctrl.broken,
                    'error': 'Broken methods: ' + ', '.join(sorted(m.split('.')[-1][2:] for m in self.ctrl.broken)),
                })
            elif 'xml' in args:
                self.do_response(500, 'text/plain', 'JSON > XML!\n')
            elif 'json' in args:
                if DEBUG_WRITE_JSON:
                    with open('DEBUG_WRITE.json', 'wb') as fh:
                        fh.write(args['json'])
                setup = json_decode(args['json'])
                data = job_tracking.get(setup.get('subjob_cookie') or None)
                if not data:
                    self.do_response(500, 'text/plain', 'bad subjob_cookie!\n')
                    return
                if len(job_tracking) - 1 > 5:  # max five levels
                    print('Too deep subjob nesting!')
                    self.do_response(500, 'text/plain', 'Too deep subjob nesting')
                    return
                if data.lock.acquire(False):
                    respond_after = True
                    try:
                        if self.DEBUG: print('@daemon.py: Got the lock!', file=sys.stderr)
                        jobidv, job_res = self.ctrl.initialise_jobs(setup)
                        job_res['done'] = False
                        if jobidv:
                            error = []
                            tlock = TLock()
                            link2job = {j['link']: j for j in job_res['jobs'].values()}
                            def run(jobidv, tlock):
                                for jobid in jobidv:
                                    passed_cookie = None
                                    # This is not a race - all higher locks are locked too.
                                    while passed_cookie in job_tracking:
                                        passed_cookie = gen_cookie()
                                    job_tracking[passed_cookie] = DotDict(lock=JLock(), last_error=None, last_time=0)
                                    try:
                                        self.ctrl.run_job(jobid, subjob_cookie=passed_cookie, parent_pid=setup.get('parent_pid', 0))
                                        # update database since a new jobid was just created
                                        job = self.ctrl.add_single_jobid(jobid)
                                        with tlock:
                                            link2job[jobid]['make'] = 'DONE'
                                            link2job[jobid]['total_time'] = job.total
                                    except JobError as e:
                                        error.append([e.jobid, e.method, e.status])
                                        with tlock:
                                            link2job[jobid]['make'] = 'FAIL'
                                        return
                                    finally:
                                        del job_tracking[passed_cookie]
                                # everything was built ok, update symlink
                                try:
                                    wn = self.ctrl.current_workspace
                                    dn = self.ctrl.workspaces[wn].path
                                    ln = os.path.join(dn, wn + "-LATEST_")
                                    try:
                                        os.unlink(ln)
                                    except OSError:
                                        pass
                                    os.symlink(jobid, ln)
                                    os.rename(ln, os.path.join(dn, wn + "-LATEST"))
                                except Exception:
                                    pass  # meh
                            t = Thread(target=run, name="job runner", args=(jobidv, tlock,))
                            t.daemon = True
                            t.start()
                            t.join(2)  # give job two seconds to complete
                            with tlock:
                                for j in link2job.values():
                                    if j['make'] in (True, 'FAIL',):
                                        respond_after = False
                                        job_res_json = json_encode(job_res)
                                        break
                            if not respond_after:
                                # not all jobs are done yet, give partial response
                                self.do_response(200, "text/json", job_res_json)
                            t.join()  # wait until actually complete
                            del tlock
                            del t
                            # verify that all jobs got built.
                            total_time = 0
                            for j in link2job.values():
                                jobid = j['link']
                                if j['make'] == True:
                                    # Well, crap.
                                    error.append([jobid, "unknown", {"INTERNAL": "Not built"}])
                                    print("INTERNAL ERROR IN JOB BUILDING!", file=sys.stderr)
                                total_time += j.get('total_time', 0)
                            data.last_error = error
                            data.last_time = total_time
                    except Exception as e:
                        if respond_after:
                            self.do_response(500, "text/json", {'error': str(e)})
                        raise
                    finally:
                        data.lock.release()
                    if respond_after:
                        job_res['done'] = True
                        self.do_response(200, "text/json", job_res)
                    if self.DEBUG: print("@daemon.py: Process releases lock!", file=sys.stderr)
                    # note: has already done http response
                else:
                    self.do_response(200, 'text/plain', 'Busy doing work for you...\n')
            else:
                self.do_response(500, 'text/plain', 'Missing json input!\n')
        else:
            self.do_response(500, 'text/plain', 'Unknown path\n')
        return
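# Illustrative sketch (not part of daemon.py): a minimal client for the status
# endpoint handled above. It assumes the daemon is reachable over plain HTTP on
# localhost:8123 and that `args` is populated from the URL query string -- both
# assumptions, not confirmed by this excerpt.
import json
from urllib.parse import urlencode
from urllib.request import urlopen

def poll_status(base_url='http://localhost:8123', subjob_cookie='', timeout=10):
    # Mirrors the 'status' branch: the reply is JSON with `idle`, plus either
    # last_error or last_time when the job lock could be acquired.
    qs = urlencode({'subjob_cookie': subjob_cookie, 'timeout': timeout})
    with urlopen('%s/status?%s' % (base_url, qs)) as resp:
        return json.loads(resp.read().decode('utf-8'))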
    def as_dep(self):
        return DotDict(
            timestamp=self.timestamp,
            joblist=self.joblist,
            caption=self.caption,
            _default=lambda: None,
        )
def synthesis(params, analysis_res, prepare_res):
    r = report()
    res = DotDict()
    d = datasets.source
    analysis_res = list(analysis_res)
    if options.filter_bad:
        num_lines_per_split = [num - data[1] for num, data in zip(d.lines, analysis_res)]
        res.bad_line_count_per_slice = [data[1] for data in analysis_res]
        res.bad_line_count_total = sum(res.bad_line_count_per_slice)
        r.println('Slice Bad line count')
        for sliceno, cnt in enumerate(res.bad_line_count_per_slice):
            r.println('%5d %d' % (sliceno, cnt,))
        r.println('total %d' % (res.bad_line_count_total,))
        r.line()
        r.println('Slice Bad line number')
        reported_count = 0
        for sliceno, data in enumerate(analysis_res):
            fn = 'badmap%d' % (sliceno,)
            if data[1] and reported_count < 32:
                with open(fn, 'rb') as fh:
                    badmap = mmap(fh.fileno(), 0, prot=PROT_READ)
                    for ix, v in enumerate(imap(ord, badmap)):
                        if v:
                            for jx in range(8):
                                if v & (1 << jx):
                                    r.println('%5d %d' % (sliceno, ix * 8 + jx,))
                                    reported_count += 1
                                    if reported_count >= 32:
                                        break
                            if reported_count >= 32:
                                break
                    badmap.close()
            unlink(fn)
        if reported_count >= 32:
            r.println('...')
        r.line()
        res.bad_line_count_per_column = {}
        r.println('Bad line count Column')
        for colname in sorted(analysis_res[0][0]):
            cnt = sum(data[0][colname] for data in analysis_res)
            r.println('%14d %s' % (cnt, colname,))
            res.bad_line_count_per_column[colname] = cnt
        r.line()
    else:
        num_lines_per_split = d.lines
    dw = prepare_res
    for sliceno, count in enumerate(num_lines_per_split):
        dw.set_lines(sliceno, count)
    if options.defaults:
        r.println('Defaulted values')
        res.defaulted_per_slice = {}
        res.defaulted_total = {}
        for colname in sorted(options.defaults):
            r.println('  %s:' % (colname,))
            r.println('    Slice Defaulted line count')
            res.defaulted_per_slice[colname] = [data[2][colname] for data in analysis_res]
            res.defaulted_total[colname] = sum(res.defaulted_per_slice[colname])
            for sliceno, cnt in enumerate(res.defaulted_per_slice[colname]):
                r.println('    %5d %d' % (sliceno, cnt,))
            r.println('    total %d' % (res.defaulted_total[colname],))
        r.line()
    for sliceno, data in enumerate(analysis_res):
        dw.set_minmax(sliceno, data[3])
    d = dw.finish()
    res.good_line_count_per_slice = num_lines_per_split
    res.good_line_count_total = sum(num_lines_per_split)
    r.line()
    r.println('Total of %d lines converted' % (res.good_line_count_total,))
    r.close()
    json_save(res)
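# Illustrative sketch (not part of the method above): the badmap files read in
# synthesis() are plain bitmaps with one bit per input line (bit set = bad
# line). A standalone decoder for one such file could look like this;
# 'badmap0' is a hypothetical filename following the 'badmap%d' pattern.
def bad_line_numbers(fn='badmap0'):
    with open(fn, 'rb') as fh:
        for ix, byte in enumerate(bytearray(fh.read())):
            for jx in range(8):
                if byte & (1 << jx):
                    yield ix * 8 + jx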
def real_synthesis(params, options, datasets, minmax_index, prepare_res, we_have_spill, save_discard=False):
    stats = DotDict(
        included_lines=[0] * params.slices,
        discarded_lines=[0] * params.slices,
        spilled_lines=[0] * params.slices,
        virtually_spilled_lines=[0] * params.slices,
        split_date=str(options.split_date) if options.split_date else None,
        discard_before_date=str(options.discard_before_date) if options.discard_before_date else None,
    )
    minmax_per_slice = [{} for _ in range(params.slices)]
    def update_stats(data):
        for item in data.itervalues():
            stats.included_lines[sliceno] += item.counters[2]
            stats.discarded_lines[sliceno] += item.counters[1]
            if item.virtual_spill:
                stats.virtually_spilled_lines[sliceno] += item.counters[3]
            else:
                stats.spilled_lines[sliceno] += item.counters[3]
            update_minmax(minmax_per_slice[sliceno], item.minmax)
    def update_minmax(dest, src):
        for name, lst0 in src.iteritems():
            lst1 = dest.get(name, lst0)
            mins = map(min, zip(lst0[:3], lst1[:3]))
            maxs = map(max, zip(lst0[3:], lst1[3:]))
            dest[name] = mins + maxs
    for sliceno in range(params.slices):
        update_stats(blob.load('stats', sliceno=sliceno))
    minmax = {}
    for item in minmax_per_slice:
        update_minmax(minmax, item)
    def minmax_select(offset, stringify=False):
        d = {}
        for k, v in minmax.iteritems():
            mn = v[offset]
            mx = v[3 + offset]
            if mn <= mx:
                if stringify and isinstance(mn, (date, time,)):
                    d[k] = [str(mn), str(mx)]
                else:
                    d[k] = [mn, mx]
        return d
    dw, dw_spill = prepare_res[:2]
    dw.set_minmax(None, minmax_select(minmax_index))
    dw_spill.set_minmax(None, minmax_select(2))
    if save_discard:
        included_lines = stats.discarded_lines
    else:
        included_lines = stats.included_lines
    for sliceno in range(params.slices):
        dw.set_lines(sliceno, included_lines[sliceno])
        dw_spill.set_lines(sliceno, stats.spilled_lines[sliceno])
    if not we_have_spill:
        dw_spill.discard()
    stats.minmax_discarded = minmax_select(0, True)
    stats.minmax = minmax_select(1, True)
    stats.minmax_spilled = minmax_select(2, True)
    json_save(stats)
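# Illustrative sketch: the per-column minmax layout assumed by update_minmax()
# and minmax_select() above. Each value is a six-element list, minima in
# positions 0-2 and maxima in positions 3-5, indexed by class offset
# (0 = discarded, 1 = included, 2 = spilled), so minmax_select(offset) pairs
# v[offset] with v[3 + offset]. The column name and numbers here are made up.
example_minmax = {'ts': [1, 5, 9, 4, 8, 12]}
for offset, label in enumerate(('discarded', 'included', 'spilled')):
    v = example_minmax['ts']
    print(label, [v[offset], v[3 + offset]])  # e.g. included -> [5, 8]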
def process_one(sliceno, options, source, prepare_res, data=None, save_discard=False):
    # Future improvement: Look at the old minmax to determine if we will get anything from reading this data
    dw, dw_spill, column_names, column_sizes, column_types, minmax_typeidx = prepare_res
    if data:
        assert data.version == 1
        data.seen_before = True
    else:
        data = empty_spilldata()
    d = Dataset(source, data.spill_ds)
    in_files = []
    out_files = []
    offsets = []
    if not save_discard:
        out_files += [ffi.NULL] * len(column_names)  # don't save "too old" lines
    minmax_files = []
    minmax_d = {}
    for colname in column_names:
        out_fn = dw.column_filename(colname, sliceno).encode('ascii')
        in_fn = d.column_filename(colname, sliceno).encode('ascii')
        offset = d.columns[colname].offsets[sliceno] if d.columns[colname].offsets else 0
        in_files.append(ffi.new('char []', in_fn))
        out_files.append(ffi.new('char []', out_fn))
        offsets.append(offset)
        minmax_fn = out_fn + '_minmax'
        minmax_files.append(ffi.new('char []', minmax_fn))
        minmax_d[colname] = minmax_fn
    if save_discard:
        out_files += [ffi.NULL] * len(column_names)  # don't save "good" lines (save discard instead)
    date_coltype = column_types[options.date_column]
    def date2cfmt(dt):
        if date_coltype == 'datetime':
            date0 = (dt.year << 14) | (dt.month << 10) | (dt.day << 5) | dt.hour
            date1 = (dt.minute << 26) | (dt.second << 20) | dt.microsecond
        elif date_coltype == 'date':
            date0 = (dt.year << 9) | (dt.month << 5) | dt.day
            date1 = 0
        elif date_coltype == 'time':
            date0 = 32277536 | dt.hour
            date1 = (dt.minute << 26) | (dt.second << 20) | dt.microsecond
        else:
            raise Exception('Bad date_coltype type: ' + date_coltype)
        return date0, date1
    dates = [0, 0, 0, 0, 0xffffffff, 0xffffffff]
    stats = DotDict()
    if data.seen_before:
        dates[0:2] = date2cfmt(data.get('process_date', datetime.min))
    if (data.last_time or options.hard_spill) and not save_discard:
        for colname in column_names:
            out_fn = dw_spill.column_filename(colname, sliceno).encode('ascii')
            out_files.append(ffi.new('char []', out_fn))
        stats.virtual_spill = False
    else:
        # We still have to make sure the files exist, or we end up
        # with a broken dataset if only some slices wanted to spill.
        for colname in column_names:
            open(dw_spill.column_filename(colname, sliceno), 'ab').close()
        out_files += [ffi.NULL] * len(column_names)
        stats.virtual_spill = True
    # We are done reading `data` - update it for next iteration
    del data.seen_before
    data.process_date = datetime.min
    if options.discard_before_date:
        if options.split_date:
            assert options.discard_before_date < options.split_date
        dates[2:4] = date2cfmt(options.discard_before_date)
        data.process_date = options.discard_before_date
    if options.split_date:
        dates[4:6] = date2cfmt(options.split_date)
        data.process_date = max(data.process_date, options.split_date)
    counters = ffi.new('uint64_t [4]')  # one for each class-enum
    res = backend.filter(len(in_files), in_files, offsets, out_files, minmax_files, column_sizes, counters, dates, minmax_typeidx, d.lines[sliceno])
    assert not res, "cffi converter returned error on data from " + source
    stats.version = 0
    stats.counters = list(counters)
    stats.minmax = {}
    for colname, fn in minmax_d.iteritems():
        if exists(fn):
            with type2iter[column_types[colname]](fn) as it:
                stats.minmax[colname] = list(it)
            unlink(fn)
    # If there is at most 2% left, spill it next time.
    # Or if there is at most 10% left and we have read it at least 8 times.
    # Or if there is at most 20% left and we have read it at least 16 times.
    # A reasonable balance between re-reading and re-writing, one hopes.
    data.counter += 1
    total_lines = sum(counters)
    data.last_time = (
        counters[3] <= total_lines / 50 or
        (data.counter >= 8 and counters[3] <= total_lines / 10) or
        (data.counter >= 16 and counters[3] <= total_lines / 5)
    )
    # If no lines were spilled we will not need this dataset again,
    # nor if we wrote the spill in this dataset.
    if not counters[3] or not stats.virtual_spill:
        data = None
    return data, stats
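# Illustrative sketch: a worked example of the date2cfmt() packing above for
# the 'datetime' column type (the example date is arbitrary).
from datetime import datetime

dt = datetime(2016, 3, 7, 13, 45, 30, 123456)
date0 = (dt.year << 14) | (dt.month << 10) | (dt.day << 5) | dt.hour
date1 = (dt.minute << 26) | (dt.second << 20) | dt.microsecond
print(date0, date1)  # 33033453 3051479616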
def statmsg_sink(logfilename, sock):
    from extras import DotDict
    print('Logging to "%s".' % (logfilename,))
    with open(logfilename, 'w', encoding='utf-8') as fh:
        ix = 0
        while True:
            data = None
            try:
                data = sock.recv(1500)
                typ, pid, msg = data.decode('utf-8').split('\0', 2)
                pid = int(pid)
                with status_stacks_lock:
                    if typ == 'push':
                        msg, t, cookie = msg.split('\0', 3)
                        t = float(t)
                        status_all[pid].stack.append((msg, t, cookie))
                    elif typ == 'pop':
                        stack, ix = _find(pid, msg)
                        if ix == len(stack) - 1:
                            stack.pop()
                        else:
                            print('POP OF WRONG STATUS: %d:%s (index %s of %d)' % (pid, msg, ix, len(stack)))
                    elif typ == 'update':
                        msg, _, cookie = msg.split('\0', 3)
                        stack, ix = _find(pid, cookie)
                        if ix is None:
                            print('UPDATE TO UNKNOWN STATUS %d:%s: %s' % (pid, cookie, msg))
                        else:
                            stack[ix] = (msg, stack[ix][1], cookie)
                    elif typ == 'start':
                        parent_pid, is_analysis, msg, t = msg.split('\0', 3)
                        parent_pid = int(parent_pid)
                        t = float(t)
                        d = DotDict(_default=None)
                        d.parent_pid = parent_pid
                        d.children = {}
                        d.stack = [(msg, t, None)]
                        d.summary = (t, msg, t,)
                        if parent_pid in status_all:
                            if is_analysis:
                                msg, parent_t, _ = status_all[parent_pid].stack[0]
                                d.summary = (parent_t, msg + ' analysis', t,)
                            status_all[parent_pid].children[pid] = d
                        else:
                            status_tree[pid] = d
                        status_all[pid] = d
                        del d
                    elif typ == 'end':
                        d = status_all.get(pid)
                        if d:
                            if d.parent_pid in status_all:
                                p = status_all[d.parent_pid]
                                if pid in p.children:
                                    del p.children[pid]
                                del p
                            del d
                        if pid in status_tree:
                            del status_tree[pid]
                    elif typ == 'statmsg':
                        fh.write('%s %5d: %s\n' % (strftime("%Y-%m-%d %H:%M:%S"), ix, msg,))
                        fh.flush()
                        ix += 1
                    else:
                        print('UNKNOWN MESSAGE: %r' % (data,))
            except Exception:
                print('Failed to process %r:' % (data,))
                print_exc()
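# Illustrative sketch (not part of status.py): how a sender could format the
# '\0'-separated datagrams parsed by statmsg_sink() above. The socket and
# address are assumptions; only the field layout is taken from the parser.
import os
import time

def send_statmsg(sock, addr, msg):
    # typ \0 pid \0 msg -- the 'statmsg' branch writes msg to the log file.
    sock.sendto('\0'.join(('statmsg', str(os.getpid()), msg)).encode('utf-8'), addr)

def send_push(sock, addr, msg, cookie=''):
    # The 'push' branch splits its payload further into msg \0 t \0 cookie.
    data = '\0'.join(('push', str(os.getpid()), msg, str(time.time()), cookie))
    sock.sendto(data.encode('utf-8'), addr)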
def load_methods(data):
    res_warnings = []
    res_failed = []
    res_hashes = {}
    res_params = {}
    for package, key in data:
        filename = '%s/a_%s.py' % (package, key,)
        modname = '%s.a_%s' % (package, key)
        try:
            with open(filename, 'rb') as fh:
                src = fh.read()
            tar_fh = io.BytesIO()
            tar_o = tarfile.open(mode='w:gz', fileobj=tar_fh, compresslevel=1)
            tar_o.add(filename, arcname='a_%s.py' % (key,))
            h = hashlib.sha1(src)
            hash = int(h.hexdigest(), 16)
            mod = import_module(modname)
            prefix = os.path.dirname(mod.__file__) + '/'
            likely_deps = set()
            for k in dir(mod):
                v = getattr(mod, k)
                if isinstance(v, ModuleType):
                    dep = getattr(v, '__file__', '')
                    if dep.startswith(prefix):
                        dep = os.path.basename(dep)
                        if dep[-4:] in ('.pyc', '.pyo',):
                            dep = dep[:-1]
                        likely_deps.add(dep)
            hash_extra = 0
            for dep in getattr(mod, 'depend_extra', ()):
                if isinstance(dep, ModuleType):
                    dep = dep.__file__
                    if dep[-4:] in ('.pyc', '.pyo',):
                        dep = dep[:-1]
                if isinstance(dep, str):
                    if not dep.startswith('/'):
                        dep = prefix + dep
                    with open(dep, 'rb') as fh:
                        hash_extra ^= int(hashlib.sha1(fh.read()).hexdigest(), 16)
                    bn = os.path.basename(dep)
                    likely_deps.discard(bn)
                    tar_o.add(dep, arcname=bn)
                else:
                    raise Exception('Bad depend_extra in %s.a_%s: %r' % (package, key, dep,))
            for dep in likely_deps:
                res_warnings.append('%s.a_%s should probably depend_extra on %s' % (package, key, dep[:-3],))
            res_hashes[key] = ("%040x" % (hash ^ hash_extra,),)
            res_params[key] = params = DotDict()
            for name, default in (('options', {},), ('datasets', (),), ('jobids', (),),):
                params[name] = getattr(mod, name, default)
            equivalent_hashes = getattr(mod, 'equivalent_hashes', ())
            if equivalent_hashes:
                assert isinstance(equivalent_hashes, dict), 'Read the docs about equivalent_hashes'
                assert len(equivalent_hashes) == 1, 'Read the docs about equivalent_hashes'
                k, v = next(iteritems(equivalent_hashes))
                assert isinstance(k, str), 'Read the docs about equivalent_hashes'
                assert isinstance(v, tuple), 'Read the docs about equivalent_hashes'
                for v in v:
                    assert isinstance(v, str), 'Read the docs about equivalent_hashes'
                start = src.index('equivalent_hashes')
                end = src.index('}', start)
                h = hashlib.sha1(src[:start])
                h.update(src[end:])
                verifier = "%040x" % (int(h.hexdigest(), 16) ^ hash_extra,)
                if verifier in equivalent_hashes:
                    res_hashes[key] += equivalent_hashes[verifier]
                else:
                    res_warnings.append('%s.a_%s has equivalent_hashes, but missing verifier %s' % (package, key, verifier,))
            tar_o.close()
            tar_fh.seek(0)
            archives[key] = tar_fh.read()
        except Exception:
            print_exc()
            res_failed.append(modname)
            continue
    return res_warnings, res_failed, res_hashes, res_params
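# Illustrative sketch: the module-level names load_methods() above looks for in
# a method module ("dev/a_example.py" would be a hypothetical filename).
# depend_extra must contain modules or path strings, and equivalent_hashes must
# be a one-entry dict mapping a verifier string to a tuple of previously valid
# hashes; the values below are placeholders, not real hashes.
options = {'length': 10}
datasets = ('source',)
jobids = ()
depend_extra = ('extra_rules.txt',)
equivalent_hashes = {'0' * 40: ('f' * 40,)}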
def execute_process(workdir, jobid, slices, result_directory, common_directory, source_directory, index=None, workspaces=None, daemon_url=None, subjob_cookie=None, parent_pid=0):
    g.JOBID = jobid
    setproctitle('launch')
    path = os.path.join(workdir, jobid)
    try:
        os.chdir(path)
    except Exception:
        print("Cannot cd to workdir", path)
        exit(1)
    g.params = params = job_params()
    method_ref = import_module(params.package + '.a_' + params.method)
    g.sliceno = -1
    if workspaces:
        jobid_module.put_workspaces(workspaces)
    def maybe_dataset(v):
        if isinstance(v, list):
            return [maybe_dataset(e) for e in v]
        if not v:
            return ''
        try:
            return dataset.Dataset(v)
        except IOError:
            return v
    datasets = DotDict({k: maybe_dataset(v) for k, v in params.datasets.items()})
    g.options = params.options
    g.datasets = datasets
    g.jobids = params.jobids
    method_ref.options = params.options
    method_ref.datasets = datasets
    method_ref.jobids = params.jobids
    # compatibility names
    g.SLICES = slices
    g.JOBID = jobid
    g.jobid = jobid
    g.METHOD = params.method
    g.WORKSPACEPATH = workdir
    g.CAPTION = params.caption
    g.PACKAGE = params.package
    g.RESULT_DIRECTORY = result_directory
    g.COMMON_DIRECTORY = common_directory
    g.SOURCE_DIRECTORY = source_directory
    g.index = -1
    g.daemon_url = daemon_url
    g.running = 'launch'
    status._start('%s %s' % (jobid, params.method,), parent_pid)
    def dummy():
        pass
    prepare_func = getattr(method_ref, 'prepare', dummy)
    analysis_func = getattr(method_ref, 'analysis', dummy)
    synthesis_func = getattr(method_ref, 'synthesis', dummy)
    synthesis_needs_analysis = 'analysis_res' in getargspec(synthesis_func).args
    # A chain must be finished from the back, so sort on that.
    sortnum_cache = {}
    def dw_sortnum(name):
        if name not in sortnum_cache:
            dw = dataset._datasetwriters[name]
            if dw.previous and dw.previous.startswith(jobid + '/'):
                pname = dw.previous.split('/')[1]
                num = dw_sortnum(pname) + 1
            else:
                num = 0
            sortnum_cache[name] = num
        return sortnum_cache[name]
    prof = {}
    if prepare_func is dummy:
        prof['prepare'] = 0  # truthish!
    else:
        t = time()
        g.running = 'prepare'
        g.subjob_cookie = subjob_cookie
        setproctitle(g.running)
        with status.status(g.running):
            g.prepare_res = method_ref.prepare(**args_for(method_ref.prepare))
            to_finish = [dw.name for dw in dataset._datasetwriters.values() if dw._started]
            if to_finish:
                with status.status("Finishing datasets"):
                    for name in sorted(to_finish, key=dw_sortnum):
                        dataset._datasetwriters[name].finish()
        prof['prepare'] = time() - t
        setproctitle('launch')
    from extras import saved_files
    if analysis_func is dummy:
        prof['per_slice'] = []
        prof['analysis'] = 0
    else:
        t = time()
        g.running = 'analysis'
        g.subjob_cookie = None  # subjobs are not allowed from analysis
        with status.status('Waiting for all slices to finish analysis'):
            prof['per_slice'], files, g.analysis_res = fork_analysis(slices, analysis_func, args_for(analysis_func), synthesis_needs_analysis)
        prof['analysis'] = time() - t
        saved_files.update(files)
    t = time()
    g.running = 'synthesis'
    g.subjob_cookie = subjob_cookie
    setproctitle(g.running)
    with status.status(g.running):
        synthesis_res = synthesis_func(**args_for(synthesis_func))
        if synthesis_res is not None:
            blob.save(synthesis_res, temp=False)
        if dataset._datasetwriters:
            with status.status("Finishing datasets"):
                for name in sorted(dataset._datasetwriters, key=dw_sortnum):
                    dataset._datasetwriters[name].finish()
    t = time() - t
    prof['synthesis'] = t
    from subjobs import _record
    status._end()
    return None, (prof, saved_files, _record)
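# Illustrative sketch: a minimal method module showing how execute_process()
# above resolves prepare/analysis/synthesis by name and threads prepare_res and
# analysis_res between them. The argument injection is done by args_for(),
# which is outside this excerpt, so the exact signatures are assumptions.
options = {}

def prepare():
    return {'greeting': 'hello'}  # becomes g.prepare_res / prepare_res

def analysis(sliceno, prepare_res):
    return '%s from slice %d' % (prepare_res['greeting'], sliceno)

def synthesis(analysis_res, prepare_res):
    # analysis_res iterates over the per-slice analysis() return values;
    # a non-None return value is blob.save()d as the job result.
    return list(analysis_res)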