def analysis(sliceno, prepare_res):
    stats = {}
    prev_spilldata = blob.load('spilldata', jobid=datasets.source, sliceno=sliceno)
    source_params = job_params(datasets.source)
    for source, data in prev_spilldata:
        _, stats[source] = a_dataset_datesplit.process_one(
            sliceno, source_params.options, source, prepare_res,
            data, save_discard=True)
    source_params = job_params(datasets.source)
    prev_params = job_params(source_params.datasets.previous, default_empty=True)
    for source in Dataset(source_params.datasets.source).chain(stop_ds=prev_params.datasets.source):
        _, stats[source] = a_dataset_datesplit.process_one(
            sliceno, source_params.options, source, prepare_res,
            save_discard=True)
    blob.save(stats, 'stats', sliceno=sliceno, temp=False)
def prepare(params):
    d = datasets.source
    caption = options.caption % dict(caption=d.caption, hashlabel=options.hashlabel)
    prev_p = job_params(datasets.previous, default_empty=True)
    prev_source = prev_p.datasets.source
    if len(d.chain(stop_jobid=prev_source, length=options.length)) == 1:
        filename = d.filename
    else:
        filename = None
    dws = []
    previous = datasets.previous
    for sliceno in range(params.slices):
        if options.as_chain and sliceno == params.slices - 1:
            name = "default"
        else:
            name = str(sliceno)
        dw = DatasetWriter(
            caption="%s (slice %d)" % (caption, sliceno),
            hashlabel=options.hashlabel,
            filename=filename,
            previous=previous,
            name=name,
            for_single_slice=sliceno,
        )
        previous = (params.jobid, name)
        dws.append(dw)
    names = []
    for n, c in d.columns.items():
        # names has to be in the same order as the add calls
        # so the iterator returns the same order the writer expects.
        names.append(n)
        for dw in dws:
            dw.add(n, c.type)
    return dws, names, prev_source, caption, filename
def analysis(sliceno, params, prepare_res):
    spilldata = {}
    stats = {}
    we_have_spill = False
    if datasets.previous:
        prev_spilldata = blob.load('spilldata', jobid=datasets.previous, sliceno=sliceno)
        for source, data in prev_spilldata:
            spilldata[source], stats[source] = process_one(
                sliceno, options, source, prepare_res, data)
            we_have_spill |= not stats[source].virtual_spill
    if datasets.source:
        prev_params = job_params(datasets.previous, default_empty=True)
        for source in datasets.source.chain(stop_ds=prev_params.datasets.source):
            spilldata[source], stats[source] = process_one(
                sliceno, options, source, prepare_res)
            we_have_spill |= not stats[source].virtual_spill
    spilldata = [(k, v) for k, v in iteritems(spilldata) if v]
    if we_have_spill:
        spilldata.append((params.jobid, empty_spilldata('SPILL')))
    blob.save(spilldata, 'spilldata', sliceno=sliceno, temp=False)
    blob.save(stats, 'stats', sliceno=sliceno, temp=False)
    return we_have_spill
def _get_params(jobid):
    try:
        return jobid, job_params(jobid)
    except:
        from traceback import print_exc
        print_exc()
        raise
def _job_candidates_options(candidates):
    for jobid, remset in iteritems(candidates):
        setup = job_params(jobid)
        optdiff = defaultdict(dict)
        for thing in remset:
            section, name = thing.split('-', 1)
            optdiff[section][name] = setup[section][name]
        yield jobid, optdiff
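# Hedged sketch (not part of the original listing): _job_candidates_options above
# regroups flat "section-name" difference keys back into per-section dicts by
# reading each candidate job's stored setup. The load_setup callable and the
# _stub data below are hypothetical stand-ins for the real job_params loader.
from collections import defaultdict

def _sketch_candidates_options(candidates, load_setup):
    # candidates: {jobid: set of "section-name" strings that differ}
    for jobid, remset in candidates.items():
        setup = load_setup(jobid)
        optdiff = defaultdict(dict)
        for thing in remset:
            section, name = thing.split('-', 1)
            optdiff[section][name] = setup[section][name]
        yield jobid, optdiff

# Example: one candidate whose 'options-length' value differs.
_stub = {'TEST-42': {'options': {'length': 3}}}
for jobid, diff in _sketch_candidates_options({'TEST-42': {'options-length'}}, _stub.get):
    print(jobid, dict(diff))   # TEST-42 {'options': {'length': 3}}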
def __new__(cls, jobid, name=None):
    if isinstance(jobid, (tuple, list)):
        jobid = _dsid(jobid)
    elif isinstance(jobid, dict):
        assert not name, "Don't pass both a separate name and jobid as {job: dataset}"
        assert len(jobid) == 1, "Only pass a single {job: dataset}"
        jobid, dsname = next(iteritems(jobid))
        if not jobid:
            return None
        jobid = job_params(jobid, default_empty=True).datasets.get(dsname)
        if not jobid:
            return None
    if '/' in jobid:
        assert not name, "Don't pass both a separate name and jobid as jid/name"
        jobid, name = jobid.split('/', 1)
    assert jobid, "If you really meant to use yourself as a dataset, pass params.jobid explicitly."
    name = uni(name or 'default')
    assert '/' not in name
    if name == 'default':
        suffix = ''
    else:
        suffix = '/' + name
    if jobid is _new_dataset_marker:
        from g import JOBID
        fullname = JOBID + suffix
    else:
        fullname = jobid + suffix
    obj = unicode.__new__(cls, fullname)
    obj.name = uni(name or 'default')
    if jobid is _new_dataset_marker:
        obj._data = DotDict({
            'version': (2, 2,),
            'filename': None,
            'hashlabel': None,
            'caption': '',
            'columns': {},
            'parent': None,
            'previous': None,
            'lines': [],
        })
        obj.jobid = None
    else:
        obj.jobid = jobid
        obj._data = DotDict(_ds_load(obj))
        assert obj._data.version[0] == 2 and obj._data.version[1] >= 2, \
            "%s/%s: Unsupported dataset pickle version %r" % (jobid, name, obj._data.version,)
        obj._data.columns = dict(obj._data.columns)
    return obj
def synthesis(params, prepare_res):
    source_params = job_params(datasets.source)
    source_params.options.caption = options.caption
    a_dataset_datesplit.real_synthesis(params, source_params.options,
        source_params.datasets, 0, prepare_res, False, save_discard=True)
    stats = json_load()
    json_save(dict(
        minmax = stats.minmax_discarded,
        included_lines = stats.discarded_lines,
        split_date = stats.split_date,
        discard_before_date = stats.discard_before_date,
    ))
def csvexport(sliceno, filename, labelsonfirstline):
    assert len(options.separator) == 1
    assert options.quote_fields in ('', "'", '"',)
    d = datasets.source[0]
    if not options.labels:
        options.labels = sorted(d.columns)
    if options.chain_source:
        if jobids.previous:
            prev_source = job_params(jobids.previous).datasets.source
            assert len(datasets.source) == len(prev_source)
        else:
            prev_source = [None] * len(datasets.source)
        lst = []
        for src, stop in zip(datasets.source, prev_source):
            lst.extend(src.chain(stop_jobid=stop))
        datasets.source = lst
    if filename.lower().endswith('.gz'):
        mkwrite = GzWrite
    elif filename.lower().endswith('.csv'):
        def mkwrite(filename):
            return open(filename, "wb")
    else:
        raise Exception("Filename should end with .gz for compressed or .csv for uncompressed")
    iters = []
    first = True
    for label in options.labels:
        it = d.iterate_list(sliceno, label, datasets.source, status_reporting=first)
        first = False
        t = d.columns[label].type
        if t == 'unicode':
            it = imap(lambda s: s.encode('utf-8'), it)
        elif t in ('float32', 'float64', 'number'):
            it = imap(repr, it)
        elif t == 'json':
            it = imap(dumps, it)
        elif t not in ('ascii', 'bytes'):
            it = imap(str, it)
        iters.append(it)
    it = izip(*iters)
    with mkwrite(filename) as fh:
        q = options.quote_fields
        sep = options.separator
        if q:
            qq = q + q
            if labelsonfirstline:
                fh.write((sep.join(q + n.replace(q, qq) + q for n in options.labels) + '\n').encode('utf-8'))
            for data in it:
                fh.write(sep.join(q + n.replace(q, qq) + q for n in data) + '\n')
        else:
            if labelsonfirstline:
                fh.write((sep.join(options.labels) + '\n').encode('utf-8'))
            for data in it:
                fh.write(sep.join(data) + '\n')
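# Hedged sketch (not part of the original method): the quoting rule used by
# csvexport above wraps each field in the configured quote character and doubles
# any occurrence of that character inside the field. Standalone illustration only;
# _quote_field is a hypothetical helper, not part of the framework.
def _quote_field(value, q='"'):
    return q + value.replace(q, q + q) + q

row = ['plain', 'has "quotes" inside']
print(','.join(_quote_field(v) for v in row))   # "plain","has ""quotes"" inside"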
def execute_process(workdir, jobid, slices, result_directory, common_directory, source_directory,
                    index=None, workspaces=None, daemon_url=None, subjob_cookie=None, parent_pid=0):
    g.JOBID = jobid
    setproctitle('launch')
    path = os.path.join(workdir, jobid)
    try:
        os.chdir(path)
    except Exception:
        print("Cannot cd to workdir", path)
        exit(1)
    g.params = params = job_params()
    method_ref = import_module(params.package + '.a_' + params.method)
    g.sliceno = -1
    if workspaces:
        jobid_module.put_workspaces(workspaces)
    # Turn dataset references in params into Dataset objects where possible.
    def maybe_dataset(v):
        if isinstance(v, list):
            return [maybe_dataset(e) for e in v]
        if not v:
            return ''
        try:
            return dataset.Dataset(v)
        except IOError:
            return v
    datasets = DotDict({k: maybe_dataset(v) for k, v in params.datasets.items()})
    g.options = params.options
    g.datasets = datasets
    g.jobids = params.jobids
    method_ref.options = params.options
    method_ref.datasets = datasets
    method_ref.jobids = params.jobids
    # compatibility names
    g.SLICES = slices
    g.JOBID = jobid
    g.jobid = jobid
    g.METHOD = params.method
    g.WORKSPACEPATH = workdir
    g.CAPTION = params.caption
    g.PACKAGE = params.package
    g.RESULT_DIRECTORY = result_directory
    g.COMMON_DIRECTORY = common_directory
    g.SOURCE_DIRECTORY = source_directory
    g.index = -1
    g.daemon_url = daemon_url
    g.running = 'launch'
    status._start('%s %s' % (jobid, params.method,), parent_pid)
    def dummy():
        pass
    prepare_func = getattr(method_ref, 'prepare', dummy)
    analysis_func = getattr(method_ref, 'analysis', dummy)
    synthesis_func = getattr(method_ref, 'synthesis', dummy)
    synthesis_needs_analysis = 'analysis_res' in getarglist(synthesis_func)
    # A chain must be finished from the back, so sort on that.
    sortnum_cache = {}
    def dw_sortnum(name):
        if name not in sortnum_cache:
            dw = dataset._datasetwriters[name]
            if dw.previous and dw.previous.startswith(jobid + '/'):
                pname = dw.previous.split('/')[1]
                num = dw_sortnum(pname) + 1
            else:
                num = 0
            sortnum_cache[name] = num
        return sortnum_cache[name]
    prof = {}
    if prepare_func is dummy:
        prof['prepare'] = 0 # truthish!
    else:
        t = time()
        g.running = 'prepare'
        g.subjob_cookie = subjob_cookie
        setproctitle(g.running)
        with status.status(g.running):
            g.prepare_res = method_ref.prepare(**args_for(method_ref.prepare))
            to_finish = [dw.name for dw in dataset._datasetwriters.values() if dw._started]
            if to_finish:
                with status.status("Finishing datasets"):
                    for name in sorted(to_finish, key=dw_sortnum):
                        dataset._datasetwriters[name].finish()
        prof['prepare'] = time() - t
    setproctitle('launch')
    from extras import saved_files
    if analysis_func is dummy:
        prof['per_slice'] = []
        prof['analysis'] = 0
    else:
        t = time()
        g.running = 'analysis'
        g.subjob_cookie = None # subjobs are not allowed from analysis
        with status.status('Waiting for all slices to finish analysis') as update:
            g.update_top_status = update
            prof['per_slice'], files, g.analysis_res = fork_analysis(
                slices, analysis_func, args_for(analysis_func), synthesis_needs_analysis)
            del g.update_top_status
        prof['analysis'] = time() - t
        saved_files.update(files)
    t = time()
    g.running = 'synthesis'
    g.subjob_cookie = subjob_cookie
    setproctitle(g.running)
    with status.status(g.running):
        synthesis_res = synthesis_func(**args_for(synthesis_func))
        if synthesis_res is not None:
            blob.save(synthesis_res, temp=False)
        if dataset._datasetwriters:
            with status.status("Finishing datasets"):
                for name in sorted(dataset._datasetwriters, key=dw_sortnum):
                    dataset._datasetwriters[name].finish()
    t = time() - t
    prof['synthesis'] = t
    from subjobs import _record
    status._end()
    return None, (prof, saved_files, _record)
def prepare():
    source_params = job_params(datasets.source)
    return a_dataset_datesplit.real_prepare(datasets.source, datasets.previous, source_params.options)
def main(urd):
    resetlocale()

    if False:
        # One BILLION rows
        # This takes about half an hour on a fast machine
        num_rows = int(1e7)
        num_datasets = 100
    else:
        # One MILLION rows
        num_rows = int(1e6)
        num_datasets = 10

    # Create datasets
    print("\x1b[1m(1) Create chain of datasets.\x1b[m")
    jid = None
    for _ in range(num_datasets):
        jid = urd.build('example_perf_gendata', options=dict(num_rows=num_rows), datasets=dict(previous=jid))

    # Export chain of datasets to CSV-file.
    print("\x1b[1m(2) Export dataset chain to CSV file.\x1b[m")
    jid = urd.build('csvexport', datasets=dict(source=jid), options=dict(filename='out.csv.gz', chain_source=True))
    filename = resolve_jobid_filename(jid, 'out.csv.gz')
    print('Exported file stored in "%s"' % (filename,))

    # Import and type previously exported CSV-file.
    print("\x1b[1m(3) Import dataset from CSV file.\x1b[m")
    jid = urd.build('csvimport', options=dict(filename=filename))
    opts = dict(
        column2type={
            'a string': 'ascii',
            'large number': 'number',
            'small number': 'number',
            'small integer': 'int32_10',  # you must specify base for integers
            'gauss number': 'number',
            'gauss float': 'float64',
        },
    )
    print("\x1b[1m(4) Type imported dataset.\x1b[m")
    jid = urd.build('dataset_type', datasets=dict(source=jid), options=opts)

    # Sum all values in a column.  Repeat for a set of columns with different types.
    print("\x1b[1m(5) Run some methods on the typed dataset.\x1b[m")
    jid_single = jid
    source = jid_single
    for colname in ('small number', 'small integer', 'large number', 'gauss number', 'gauss float'):
        print(colname)
        jid = urd.build('example_perf_sum', datasets=dict(source=source), options=dict(colname=colname), name='sum ' + colname)
        jid = urd.build('example_perf_sum_positive', datasets=dict(source=source), options=dict(colname=colname), name='sum positive ' + colname)

    # Compute histograms of a column
    print('histogram')
    jid = urd.build('example_perf_histogram', datasets=dict(source=source), options=dict(colname='gauss number'), name='histogram_number')
    jid = urd.build('example_perf_histogram', datasets=dict(source=source), options=dict(colname='gauss float'), name='histogram_float')

    # Find string
    print('find string')
    jid = urd.build('example_perf_find_string', datasets=dict(source=source), options=dict(colname='a string', text='ExAx'), name='find_string')
    print("Number of lines containing string \"%s\" is %d." % (job_params(jid).options['text'], blob.load(jobid=jid)),)

    # Print resulting profiling information
    from automata_common import profile_jobs
    print()
    def pl(text, time):
        print("%-30s %10.3f %14s" % (text, time, '{0:n}'.format(round(num_rows * num_datasets / time)),))
    print()
    print('-' * 56)
    print("operation                       exec time         rows/s")
    print()
    pl('csvexport', profile_jobs(urd.joblist.find('csvexport')))
    print()
    pl('reimport total', profile_jobs(urd.joblist.find('csvimport') + urd.joblist.find('dataset_type')))
    pl("  csvimport", profile_jobs(urd.joblist.find('csvimport')))
    pl("  type", profile_jobs(urd.joblist.find('dataset_type')))
    print()
    print("sum")
    pl("  small number", profile_jobs(urd.joblist.find('sum small number')))
    pl("  small integer", profile_jobs(urd.joblist.find('sum small integer')))
    pl("  large number", profile_jobs(urd.joblist.find('sum large number')))
    pl("  gauss number", profile_jobs(urd.joblist.find('sum gauss number')))
    pl("  gauss float", profile_jobs(urd.joblist.find('sum gauss float')))
    print()
    print("sum positive")
    pl("  small number", profile_jobs(urd.joblist.find('sum positive small number')))
    pl("  small integer", profile_jobs(urd.joblist.find('sum positive small integer')))
    pl("  large number", profile_jobs(urd.joblist.find('sum positive large number')))
    pl("  gauss number", profile_jobs(urd.joblist.find('sum positive gauss number')))
    pl("  gauss float", profile_jobs(urd.joblist.find('sum positive gauss float')))
    print()
    print("histogram")
    pl("  number", profile_jobs(urd.joblist.find('histogram_number')))
    pl("  float", profile_jobs(urd.joblist.find('histogram_float')))
    print()
    pl("find string", profile_jobs(urd.joblist.find('find_string')))
    print()
    print("Total test time                %10.3f" % (profile_jobs(urd.joblist),))
    print()
    print('Example size is %s lines.' % ('{0:n}'.format(num_datasets * num_rows),))
    print('Number of slices is %d.' % (urd.info.slices,))
    print('-' * 56)
def x2opt(jobid, optname="previous"):
    params = job_params(jobid)
    return params.jobids.get(optname) or params.datasets.get(optname)
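# Hedged sketch (stubbed, not the real API): x2opt above resolves a named
# reference from either the jobids or the datasets of a job's stored parameters.
# Repeatedly resolving the "previous" reference walks the whole chain, newest
# first. The lookup callable and the _stub data below are hypothetical stand-ins
# for job_params and real job setups.
def _sketch_chain(jobid, lookup):
    chain = []
    while jobid:
        chain.append(jobid)
        params = lookup(jobid)
        jobid = params['jobids'].get('previous') or params['datasets'].get('previous')
    return chain

_stub = {
    'TEST-2': {'jobids': {}, 'datasets': {'previous': 'TEST-1'}},
    'TEST-1': {'jobids': {'previous': 'TEST-0'}, 'datasets': {}},
    'TEST-0': {'jobids': {}, 'datasets': {}},
}
print(_sketch_chain('TEST-2', _stub.get))   # ['TEST-2', 'TEST-1', 'TEST-0']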