def _save(self):
    if not os.path.exists(self.name):
        os.mkdir(self.name)
    blob.save(self._data, self._name('pickle'), temp=False)
    with open(self._name('txt'), 'w', encoding='utf-8') as fh:
        nl = False
        if self.hashlabel:
            fh.write('hashlabel %s\n' % (self.hashlabel,))
            nl = True
        if self.previous:
            fh.write('previous %s\n' % (self.previous,))
            nl = True
        if nl:
            fh.write('\n')
        col_list = sorted((k, c.type, c.location,) for k, c in self.columns.items())
        lens = tuple(
            max(minlen, max(len(t[i]) for t in col_list))
            for i, minlen in ((0, 4), (1, 4), (2, 8))
        )
        template = '%%%ds %%%ds %%-%ds\n' % lens
        fh.write(template % ('name', 'type', 'location'))
        fh.write(template % tuple('=' * l for l in lens))
        for t in col_list:
            fh.write(template % t)
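# A standalone sketch of the format-template trick used in _save() above: the
# '%%%ds' pieces are first filled with the computed column widths, which yields a
# plain '%<w>s %<w>s %-<w>s' row template that is then reused for the header, the
# separator line and every row. The rows and names below are illustrative only.
def _column_table_demo():
    rows = [('ts', 'datetime', 'default/ts'), ('user', 'ascii', 'default/user')]
    widths = tuple(
        max(minlen, max(len(t[i]) for t in rows))
        for i, minlen in ((0, 4), (1, 4), (2, 8))
    )
    template = '%%%ds %%%ds %%-%ds\n' % widths  # e.g. '%4s %8s %-12s\n'
    out = template % ('name', 'type', 'location')
    out += template % tuple('=' * w for w in widths)
    for t in rows:
        out += template % t
    return out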
def analysis(sliceno, prepare_res):
    stats = {}
    prev_spilldata = blob.load('spilldata', jobid=datasets.source, sliceno=sliceno)
    source_params = job_params(datasets.source)
    for source, data in prev_spilldata:
        _, stats[source] = a_dataset_datesplit.process_one(
            sliceno, source_params.options, source, prepare_res, data,
            save_discard=True)
    prev_params = job_params(source_params.datasets.previous, default_empty=True)
    for source in Dataset(source_params.datasets.source).chain(stop_ds=prev_params.datasets.source):
        _, stats[source] = a_dataset_datesplit.process_one(
            sliceno, source_params.options, source, prepare_res,
            save_discard=True)
    blob.save(stats, 'stats', sliceno=sliceno, temp=False)
def analysis(sliceno, params, prepare_res):
    spilldata = {}
    stats = {}
    we_have_spill = False
    if datasets.previous:
        prev_spilldata = blob.load('spilldata', jobid=datasets.previous, sliceno=sliceno)
        for source, data in prev_spilldata:
            spilldata[source], stats[source] = process_one(sliceno, options, source, prepare_res, data)
            we_have_spill |= not stats[source].virtual_spill
    if datasets.source:
        prev_params = job_params(datasets.previous, default_empty=True)
        for source in datasets.source.chain(stop_ds=prev_params.datasets.source):
            spilldata[source], stats[source] = process_one(sliceno, options, source, prepare_res)
            we_have_spill |= not stats[source].virtual_spill
    spilldata = [(k, v) for k, v in spilldata.iteritems() if v]
    if we_have_spill:
        spilldata.append((params.jobid, empty_spilldata('SPILL')))
    blob.save(spilldata, 'spilldata', sliceno=sliceno, temp=False)
    blob.save(stats, 'stats', sliceno=sliceno, temp=False)
    return we_have_spill
def call_analysis(analysis_func, sliceno_, q, preserve_result, parent_pid, **kw):
    try:
        status._start('analysis(%d)' % (sliceno_,), parent_pid, 't')
        os.close(_prof_fd)
        for stupid_inconsistent_name in ('sliceno', 'index'):
            if stupid_inconsistent_name in kw:
                kw[stupid_inconsistent_name] = sliceno_
            setattr(g, stupid_inconsistent_name, sliceno_)
        for dw in dataset._datasetwriters.values():
            if dw._for_single_slice is None:
                dw._set_slice(sliceno_)
        res = analysis_func(**kw)
        if preserve_result:
            # Remove defaultdicts until we find one with a picklable default_factory.
            # (This is what you end up doing manually anyway.)
            def picklable(v):
                try:
                    pickle.dumps(v, pickle.HIGHEST_PROTOCOL)
                    return True
                except Exception:
                    return False
            def fixup(d):
                if isinstance(d, defaultdict) and not picklable(d.default_factory):
                    if not d:
                        return {}
                    # look at one value to decide whether the nesting continues
                    v = next(iteritems(d))[1]
                    if isinstance(v, defaultdict) and not picklable(v.default_factory):
                        return {k: fixup(v) for k, v in iteritems(d)}
                    else:
                        return dict(d)
                else:
                    return d
            def save(item, name):
                blob.save(fixup(item), name, sliceno=sliceno_, temp=True)
            if isinstance(res, tuple):
                if sliceno_ == 0:
                    blob.save(len(res), "Analysis.tuple", temp=True)
                for ix, item in enumerate(res):
                    save(item, "Analysis.%d." % (ix,))
            else:
                if sliceno_ == 0:
                    blob.save(False, "Analysis.tuple", temp=True)
                save(res, "Analysis.")
        from extras import saved_files
        dw_lens = {}
        dw_minmax = {}
        for name, dw in dataset._datasetwriters.items():
            if dw._for_single_slice in (None, sliceno_,):
                dw.close()
                dw_lens[name] = dw._lens
                dw_minmax[name] = dw._minmax
        status._end()
        q.put((sliceno_, time(), saved_files, dw_lens, dw_minmax, None,))
    except:
        status._end()
        q.put((sliceno_, time(), {}, {}, {}, fmt_tb(1),))
        print_exc()
        sleep(5)  # give launcher time to report error (and kill us)
        exitfunction()
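# A minimal, self-contained sketch of the fixup() idea above: defaultdicts whose
# default_factory cannot be pickled (e.g. a lambda) are converted to plain dicts,
# one nesting level at a time, before being saved. The names below are illustrative
# and use only the standard library, not the framework's blob/iteritems helpers.
import pickle
from collections import defaultdict

def _picklable(v):
    try:
        pickle.dumps(v, pickle.HIGHEST_PROTOCOL)
        return True
    except Exception:
        return False

def _fixup(d):
    if isinstance(d, defaultdict) and not _picklable(d.default_factory):
        if not d:
            return {}
        v = next(iter(d.values()))
        if isinstance(v, defaultdict) and not _picklable(v.default_factory):
            return {k: _fixup(v) for k, v in d.items()}
        return dict(d)
    return d

nested = defaultdict(lambda: defaultdict(lambda: 0))  # lambda factories do not pickle
nested['a']['x'] += 1
assert pickle.dumps(_fixup(nested))  # the flattened {'a': {'x': 1}} pickles fine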
def one_slice(sliceno):
    first = True
    updater = globals()['upd_' + options.flavour]
    for pickle in options.pickles:
        tmp = load(pickle, sliceno=sliceno)
        if first:
            res = tmp
            first = False
        else:
            updater(res, tmp)
    save(res, options.resultname, sliceno=sliceno)
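# one_slice() above dispatches on name: 'upd_' + options.flavour picks a merge
# function from the module globals. The actual flavours are not shown here; the
# two updaters below are hypothetical examples of the expected in-place signature
# updater(accumulated, new), not the project's real ones.
def upd_dict(res, tmp):
    # later pickles overwrite earlier keys
    res.update(tmp)

def upd_counter(res, tmp):
    # per-key counts are summed (res and tmp are collections.Counter objects)
    res.update(tmp)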
def synthesis(prepare_res, analysis_res, params):
    from math import sqrt
    separator, filename, orig_filename, labels, dw = prepare_res
    labels = [n for n in labels if n not in options.discard]
    if filename != orig_filename:
        os.unlink(filename)
    # aggregate typing and statistics
    res = {}
    res['num_broken_lines'] = 0
    res['num_lines'] = 0
    res['lines_per_slice'] = []
    for sliceno, tmp in enumerate(analysis_res):
        res['num_broken_lines'] += tmp['num_broken_lines']
        res['num_lines'] += tmp['num_lines']
        res['lines_per_slice'].append(tmp['num_lines'])
        dw.set_lines(sliceno, tmp['num_lines'])
    blob.save(res, 'import')
    # write report
    r = report.report()
    if not res['num_lines']:
        r.println('No lines read - empty file!')
        r.close()
        return
    r.println('Number of rows read\n')
    r.println('  slice     lines')
    for sliceno, nlines in enumerate(res['lines_per_slice']):
        if res['num_lines']:
            r.println('  %2d %9d (%6.2f%%)' % (sliceno, nlines, 100 * nlines / res['num_lines']))
        else:
            r.println('  %2d %9d' % (sliceno, nlines))
    r.println('  total %9d' % (res['num_lines'],))
    stdev = sqrt(sum((x - res['num_lines'] / params.slices) ** 2 for x in res['lines_per_slice']) / params.slices)
    r.println('\n  hash stdev %9d (%6.2f%%)' % (stdev, round(100 * stdev / res['num_lines'])))
    r.line()
    r.println('Number of columns %9d' % len(labels))
    r.close()
    if res['num_broken_lines'] and not options.allow_bad:
        raise Exception('%d bad lines without options.allow_bad' % (res['num_broken_lines'],))
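# Standalone check of the spread number reported above: it is the population
# standard deviation of lines_per_slice around num_lines / slices, which matches
# statistics.pstdev because the slice counts sum to num_lines by construction.
# Purely illustrative values below, not real import statistics.
from math import sqrt
from statistics import pstdev

lines_per_slice = [1000, 980, 1020, 1000]
slices = len(lines_per_slice)
num_lines = sum(lines_per_slice)
stdev = sqrt(sum((x - num_lines / slices) ** 2 for x in lines_per_slice) / slices)
assert abs(stdev - pstdev(lines_per_slice)) < 1e-9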
def save_datastore(self):
    if self._key:
        return
    value = self._f.read()
    value = base64.b64encode(value)
    value = value + " " * (16 - len(value) % 16)
    value = self.aes.encrypt(value)
    self._key = blob.save(value)
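# Note on the padding line in save_datastore() above: the base64 text is padded
# with spaces up to the next multiple of the 16-byte AES block size, and when the
# length is already a multiple of 16 a full extra block of 16 spaces is appended.
# A quick standalone check of that arithmetic (no AES involved):
for n in range(1, 49):
    padded = n + (16 - n % 16)
    assert padded % 16 == 0 and 0 < padded - n <= 16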
def analysis(sliceno, prepare_res):
    key_filter, value_filter = prepare_res
    d = blob.load(jobid=jobids.previous, sliceno=sliceno, default=defaultdict(set))
    if options.key_filter:
        # keep defaultdict semantics so new keys from the iterator below can still be added
        d = defaultdict(set, {k: v for k, v in d.iteritems() if k in key_filter})
    iterator = datasets.source.iterate_chain(
        sliceno,
        (options.key_column, options.value_column,),
        stop_jobid={jobids.previous: 'source'},
    )
    # These break out into four versions for shorter runtime
    if options.value_filter:
        # Remove anything that's not in the filter
        for k, v in d.items():
            v = v & value_filter
            if v:
                d[k] = v
            else:
                del d[k]
        # This lets us reuse the same str object for the same value (smaller pickles)
        value_filter = {v: v for v in value_filter}
        if options.key_filter:
            for k, v in iterator:
                if k in key_filter and v in value_filter:
                    d[k].add(value_filter[v])
        else:
            for k, v in iterator:
                if v in value_filter:
                    d[k].add(value_filter[v])
    else:
        reuse = {}
        if options.key_filter:
            for k, v in iterator:
                if k in key_filter:
                    d[k].add(reuse.setdefault(v, v))
        else:
            for k, v in iterator:
                d[k].add(reuse.setdefault(v, v))
    blob.save(d, sliceno=sliceno, temp=False)
    blob.save(set(d), 'keyset', sliceno=sliceno, temp=False)
    blob.save(Counter(len(v) for v in d.itervalues()), 'setsizehist', sliceno=sliceno, temp=False)
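# The value_filter = {v: v for v in value_filter} and reuse.setdefault(v, v) lines
# above make every occurrence of an equal value share one str object, so pickle
# stores each distinct string once and then just references it, giving smaller
# per-slice pickles. A self-contained illustration with made-up data:
import pickle

separate = {i: 'payload-%d' % (i % 3,) for i in range(100)}  # 100 equal-but-separate strs
reuse = {}
shared = {i: reuse.setdefault(v, v) for i, v in separate.items()}  # 3 shared strs
assert len(pickle.dumps(shared, 2)) < len(pickle.dumps(separate, 2))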
def save(item, name):
    blob.save(fixup(item), name, sliceno=sliceno_, temp=True)
def execute_process(workdir, jobid, slices, result_directory, common_directory, source_directory, index=None, workspaces=None, daemon_url=None, subjob_cookie=None, parent_pid=0):
    g.JOBID = jobid
    setproctitle('launch')
    path = os.path.join(workdir, jobid)
    try:
        os.chdir(path)
    except Exception:
        print("Cannot cd to workdir", path)
        exit(1)
    g.params = params = job_params()
    method_ref = import_module(params.package + '.a_' + params.method)
    g.sliceno = -1
    if workspaces:
        jobid_module.put_workspaces(workspaces)
    def maybe_dataset(v):
        if isinstance(v, list):
            return [maybe_dataset(e) for e in v]
        if not v:
            return ''
        try:
            return dataset.Dataset(v)
        except IOError:
            return v
    datasets = DotDict({k: maybe_dataset(v) for k, v in params.datasets.items()})
    g.options = params.options
    g.datasets = datasets
    g.jobids = params.jobids
    method_ref.options = params.options
    method_ref.datasets = datasets
    method_ref.jobids = params.jobids
    # compatibility names
    g.SLICES = slices
    g.JOBID = jobid
    g.jobid = jobid
    g.METHOD = params.method
    g.WORKSPACEPATH = workdir
    g.CAPTION = params.caption
    g.PACKAGE = params.package
    g.RESULT_DIRECTORY = result_directory
    g.COMMON_DIRECTORY = common_directory
    g.SOURCE_DIRECTORY = source_directory
    g.index = -1
    g.daemon_url = daemon_url
    g.running = 'launch'
    status._start('%s %s' % (jobid, params.method,), parent_pid)
    def dummy():
        pass
    prepare_func = getattr(method_ref, 'prepare', dummy)
    analysis_func = getattr(method_ref, 'analysis', dummy)
    synthesis_func = getattr(method_ref, 'synthesis', dummy)
    synthesis_needs_analysis = 'analysis_res' in getarglist(synthesis_func)
    # A chain must be finished from the back, so sort on that.
    sortnum_cache = {}
    def dw_sortnum(name):
        if name not in sortnum_cache:
            dw = dataset._datasetwriters[name]
            if dw.previous and dw.previous.startswith(jobid + '/'):
                pname = dw.previous.split('/')[1]
                num = dw_sortnum(pname) + 1
            else:
                num = 0
            sortnum_cache[name] = num
        return sortnum_cache[name]
    prof = {}
    if prepare_func is dummy:
        prof['prepare'] = 0  # truthish!
    else:
        t = time()
        g.running = 'prepare'
        g.subjob_cookie = subjob_cookie
        setproctitle(g.running)
        with status.status(g.running):
            g.prepare_res = method_ref.prepare(**args_for(method_ref.prepare))
            to_finish = [dw.name for dw in dataset._datasetwriters.values() if dw._started]
            if to_finish:
                with status.status("Finishing datasets"):
                    for name in sorted(to_finish, key=dw_sortnum):
                        dataset._datasetwriters[name].finish()
        prof['prepare'] = time() - t
        setproctitle('launch')
    from extras import saved_files
    if analysis_func is dummy:
        prof['per_slice'] = []
        prof['analysis'] = 0
    else:
        t = time()
        g.running = 'analysis'
        g.subjob_cookie = None  # subjobs are not allowed from analysis
        with status.status('Waiting for all slices to finish analysis') as update:
            g.update_top_status = update
            prof['per_slice'], files, g.analysis_res = fork_analysis(slices, analysis_func, args_for(analysis_func), synthesis_needs_analysis)
            del g.update_top_status
        prof['analysis'] = time() - t
        saved_files.update(files)
    t = time()
    g.running = 'synthesis'
    g.subjob_cookie = subjob_cookie
    setproctitle(g.running)
    with status.status(g.running):
        synthesis_res = synthesis_func(**args_for(synthesis_func))
        if synthesis_res is not None:
            blob.save(synthesis_res, temp=False)
        if dataset._datasetwriters:
            with status.status("Finishing datasets"):
                for name in sorted(dataset._datasetwriters, key=dw_sortnum):
                    dataset._datasetwriters[name].finish()
    t = time() - t
    prof['synthesis'] = t
    from subjobs import _record
    status._end()
    return None, (prof, saved_files, _record)
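# A self-contained toy of the dw_sortnum() ordering above: a writer whose previous
# dataset was written by this same jobid sorts after the writer it chains from, so
# each writer's previous is always finished before the writer itself. The jobid and
# dataset names below are made up for illustration.
def chain_order(previous, jobid='JOB-0'):
    cache = {}
    def depth(name):
        if name not in cache:
            prev = previous.get(name)
            if prev and prev.startswith(jobid + '/'):
                cache[name] = depth(prev.split('/')[1]) + 1
            else:
                cache[name] = 0
        return cache[name]
    return sorted(previous, key=depth)

previous = {'default': None, 'part2': 'JOB-0/default', 'part3': 'JOB-0/part2'}
assert chain_order(previous) == ['default', 'part2', 'part3']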
def synthesis(params):
    setsizehist = Counter()
    for sliceno in range(params.slices):
        setsizehist.update(blob.load('setsizehist', sliceno=sliceno))
    blob.save(setsizehist, 'setsizehist')
def save_datastore(self):
    if self._key:
        return
    self.file.seek(0)
    self._key = blob.save(self.file.read())