def analysis(sliceno, params, prepare_res):
    spilldata = {}
    stats = {}
    we_have_spill = False
    if datasets.previous:
        # Re-process data that the previous job in the chain spilled.
        prev_spilldata = blob.load('spilldata', jobid=datasets.previous, sliceno=sliceno)
        for source, data in prev_spilldata:
            spilldata[source], stats[source] = process_one(
                sliceno, options, source, prepare_res, data)
            we_have_spill |= not stats[source].virtual_spill
    if datasets.source:
        # Process the new source datasets, i.e. everything after the point
        # where the previous job stopped.
        prev_params = job_params(datasets.previous, default_empty=True)
        for source in datasets.source.chain(stop_ds=prev_params.datasets.source):
            spilldata[source], stats[source] = process_one(
                sliceno, options, source, prepare_res)
            we_have_spill |= not stats[source].virtual_spill
    spilldata = [(k, v) for k, v in spilldata.iteritems() if v]
    if we_have_spill:
        spilldata.append((params.jobid, empty_spilldata('SPILL')))
    blob.save(spilldata, 'spilldata', sliceno=sliceno, temp=False)
    blob.save(stats, 'stats', sliceno=sliceno, temp=False)
    return we_have_spill
def analysis(sliceno, prepare_res):
    stats = {}
    # Re-process what the source job spilled, this time saving the
    # discarded lines instead (save_discard=True).
    prev_spilldata = blob.load('spilldata', jobid=datasets.source, sliceno=sliceno)
    source_params = job_params(datasets.source)
    for source, data in prev_spilldata:
        _, stats[source] = a_dataset_datesplit.process_one(
            sliceno, source_params.options, source, prepare_res, data,
            save_discard=True)
    prev_params = job_params(source_params.datasets.previous, default_empty=True)
    for source in Dataset(source_params.datasets.source).chain(stop_ds=prev_params.datasets.source):
        _, stats[source] = a_dataset_datesplit.process_one(
            sliceno, source_params.options, source, prepare_res,
            save_discard=True)
    blob.save(stats, 'stats', sliceno=sliceno, temp=False)
def _ds_load(obj):
    n = unicode(obj)
    if n not in _ds_cache:
        _ds_cache[n] = _v2_columntypefix(blob.load(obj._name('pickle'), obj.jobid))
        _ds_cache.update(_ds_cache[n].get('cache', ()))
    return _ds_cache[n]
def real_synthesis(params, options, datasets, minmax_index, prepare_res, we_have_spill, save_discard=False):
    stats = DotDict(
        included_lines=[0] * params.slices,
        discarded_lines=[0] * params.slices,
        spilled_lines=[0] * params.slices,
        virtually_spilled_lines=[0] * params.slices,
        split_date=str(options.split_date) if options.split_date else None,
        discard_before_date=str(options.discard_before_date) if options.discard_before_date else None,
    )
    minmax_per_slice = [{} for _ in range(params.slices)]

    def update_stats(data):
        for item in data.itervalues():
            stats.included_lines[sliceno] += item.counters[2]
            stats.discarded_lines[sliceno] += item.counters[1]
            if item.virtual_spill:
                stats.virtually_spilled_lines[sliceno] += item.counters[3]
            else:
                stats.spilled_lines[sliceno] += item.counters[3]
            update_minmax(minmax_per_slice[sliceno], item.minmax)

    def update_minmax(dest, src):
        # Each minmax value is a six-element list: three mins followed by
        # three maxs (discarded, included, spilled).
        for name, lst0 in src.iteritems():
            lst1 = dest.get(name, lst0)
            mins = map(min, zip(lst0[:3], lst1[:3]))
            maxs = map(max, zip(lst0[3:], lst1[3:]))
            dest[name] = mins + maxs

    for sliceno in range(params.slices):
        update_stats(blob.load('stats', sliceno=sliceno))
    minmax = {}
    for item in minmax_per_slice:
        update_minmax(minmax, item)

    def minmax_select(offset, stringify=False):
        # offset 0 = discarded, 1 = included, 2 = spilled.
        d = {}
        for k, v in minmax.iteritems():
            mn = v[offset]
            mx = v[3 + offset]
            if mn <= mx:
                if stringify and isinstance(mn, (date, time,)):
                    d[k] = [str(mn), str(mx)]
                else:
                    d[k] = [mn, mx]
        return d

    dw, dw_spill = prepare_res[:2]
    dw.set_minmax(None, minmax_select(minmax_index))
    dw_spill.set_minmax(None, minmax_select(2))
    if save_discard:
        included_lines = stats.discarded_lines
    else:
        included_lines = stats.included_lines
    for sliceno in range(params.slices):
        dw.set_lines(sliceno, included_lines[sliceno])
        dw_spill.set_lines(sliceno, stats.spilled_lines[sliceno])
    if not we_have_spill:
        dw_spill.discard()
    stats.minmax_discarded = minmax_select(0, True)
    stats.minmax = minmax_select(1, True)
    stats.minmax_spilled = minmax_select(2, True)
    json_save(stats)
def one_slice(sliceno):
    first = True
    updater = globals()['upd_' + options.flavour]
    for pickle in options.pickles:
        tmp = load(pickle, sliceno=sliceno)
        if first:
            res = tmp
            first = False
        else:
            updater(res, tmp)
    save(res, options.resultname, sliceno=sliceno)
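# A minimal sketch of a flavour updater for one_slice() above. The name
# upd_dict_of_sets and its merge semantics are assumptions (the original
# flavours are not shown here): one_slice() resolves 'upd_' + options.flavour
# via globals(), so a flavour is just a module-level function taking
# (accumulated result, newly loaded pickle) and merging in place.
def upd_dict_of_sets(res, tmp):
    # Union each key's set from tmp into res.
    for k, v in tmp.iteritems():
        if k in res:
            res[k] |= v
        else:
            res[k] = v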
def analysis(sliceno, prepare_res):
    key_filter, value_filter = prepare_res
    d = blob.load(jobid=jobids.previous, sliceno=sliceno, default=defaultdict(set))
    if options.key_filter:
        # Keep d a defaultdict: keys passing the filter may not exist yet
        # when new values are added below.
        d = defaultdict(set, ((k, v) for k, v in d.iteritems() if k in key_filter))
    iterator = datasets.source.iterate_chain(
        sliceno,
        (options.key_column, options.value_column,),
        stop_jobid={jobids.previous: 'source'},
    )
    # These break out into four versions for shorter runtime
    if options.value_filter:
        # Remove anything that's not in the filter
        for k, v in d.items():
            v = v & value_filter
            if v:
                d[k] = v
            else:
                del d[k]
        # This lets us reuse the same str object for the same value (smaller pickles)
        value_filter = {v: v for v in value_filter}
        if options.key_filter:
            for k, v in iterator:
                if k in key_filter and v in value_filter:
                    d[k].add(value_filter[v])
        else:
            for k, v in iterator:
                if v in value_filter:
                    d[k].add(value_filter[v])
    else:
        reuse = {}
        if options.key_filter:
            for k, v in iterator:
                if k in key_filter:
                    d[k].add(reuse.setdefault(v, v))
        else:
            for k, v in iterator:
                d[k].add(reuse.setdefault(v, v))
    blob.save(d, sliceno=sliceno, temp=False)
    blob.save(set(d), 'keyset', sliceno=sliceno, temp=False)
    blob.save(Counter(len(v) for v in d.itervalues()), 'setsizehist', sliceno=sliceno, temp=False)
def synthesis():
    sum = 0
    jobs = datasets.source.chain(length=options.chain_length, stop_jobid=datasets.stop)
    for src in jobs:
        jid = build('dataset_checksum', options=dict(columns=options.columns, sort=options.sort), datasets=dict(source=src))
        data = blob.load(jobid=jid)
        sum ^= data.sum
    print("Total: %016x" % (sum,))
    return DotDict(sum=sum, columns=data.columns, sort=options.sort, sources=jobs)
def synthesis(params):
    setsizehist = Counter()
    for sliceno in range(params.slices):
        setsizehist.update(blob.load('setsizehist', sliceno=sliceno))
    blob.save(setsizehist, 'setsizehist')
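# Why Counter.update() is the right merge above (standard library behaviour;
# the numbers are made up): unlike dict.update(), Counter.update() adds
# counts rather than replacing them, so the per-slice set-size histograms
# sum into one global histogram.
#
#     from collections import Counter
#     a = Counter({1: 2, 2: 1})
#     a.update(Counter({1: 3}))
#     assert a == Counter({1: 5, 2: 1})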
def prepare(jobids):
    key_filter = blob.load(options.key_filter, default=set())
    value_filter = blob.load(options.value_filter, default=set())
    return key_filter, value_filter
def main(urd):
    resetlocale()

    if False:
        # One BILLION rows in total.
        # This takes about half an hour on a fast machine.
        num_rows = int(1e7)
        num_datasets = 100
    else:
        # Ten MILLION rows in total.
        num_rows = int(1e6)
        num_datasets = 10

    # Create datasets
    print("\x1b[1m(1) Create chain of datasets.\x1b[m")
    jid = None
    for _ in range(num_datasets):
        jid = urd.build('example_perf_gendata', options=dict(num_rows=num_rows), datasets=dict(previous=jid))

    # Export chain of datasets to CSV-file.
    print("\x1b[1m(2) Export dataset chain to CSV file.\x1b[m")
    jid = urd.build('csvexport', datasets=dict(source=jid), options=dict(filename='out.csv.gz', chain_source=True))
    filename = resolve_jobid_filename(jid, 'out.csv.gz')
    print('Exported file stored in "%s"' % (filename,))

    # Import and type previously exported CSV-file.
    print("\x1b[1m(3) Import dataset from CSV file.\x1b[m")
    jid = urd.build('csvimport', options=dict(filename=filename))
    opts = dict(
        column2type={
            'a string': 'ascii',
            'large number': 'number',
            'small number': 'number',
            'small integer': 'int32_10',  # you must specify base for integers
            'gauss number': 'number',
            'gauss float': 'float64',
        },
    )
    print("\x1b[1m(4) Type imported dataset.\x1b[m")
    jid = urd.build('dataset_type', datasets=dict(source=jid), options=opts)

    # Sum all values in a column. Repeat for a set of columns with different types.
    print("\x1b[1m(5) Run some methods on the typed dataset.\x1b[m")
    jid_single = jid
    source = jid_single
    for colname in ('small number', 'small integer', 'large number', 'gauss number', 'gauss float'):
        print(colname)
        jid = urd.build('example_perf_sum', datasets=dict(source=source), options=dict(colname=colname), name='sum ' + colname)
        jid = urd.build('example_perf_sum_positive', datasets=dict(source=source), options=dict(colname=colname), name='sum positive ' + colname)

    # Compute histograms of a column
    print('histogram')
    jid = urd.build('example_perf_histogram', datasets=dict(source=source), options=dict(colname='gauss number'), name='histogram_number')
    jid = urd.build('example_perf_histogram', datasets=dict(source=source), options=dict(colname='gauss float'), name='histogram_float')

    # Find string
    print('find string')
    jid = urd.build('example_perf_find_string', datasets=dict(source=source), options=dict(colname='a string', text='ExAx'), name='find_string')
    print("Number of lines containing string \"%s\" is %d." % (job_params(jid).options['text'], blob.load(jobid=jid)))

    # Print resulting profiling information
    from automata_common import profile_jobs
    print()

    def pl(text, time):
        print("%-30s %10.3f %14s" % (
            text, time, '{0:n}'.format(round(num_rows * num_datasets / time)),
        ))

    print()
    print('-' * 56)
    print("operation                       exec time         rows/s")
    print()
    pl('csvexport', profile_jobs(urd.joblist.find('csvexport')))
    print()
    pl('reimport total', profile_jobs(urd.joblist.find('csvimport') + urd.joblist.find('dataset_type')))
    pl("  csvimport", profile_jobs(urd.joblist.find('csvimport')))
    pl("  type", profile_jobs(urd.joblist.find('dataset_type')))
    print()
    print("sum")
    pl("  small number", profile_jobs(urd.joblist.find('sum small number')))
    pl("  small integer", profile_jobs(urd.joblist.find('sum small integer')))
    pl("  large number", profile_jobs(urd.joblist.find('sum large number')))
    pl("  gauss number", profile_jobs(urd.joblist.find('sum gauss number')))
    pl("  gauss float", profile_jobs(urd.joblist.find('sum gauss float')))
    print()
    print("sum positive")
    pl("  small number", profile_jobs(urd.joblist.find('sum positive small number')))
    pl("  small integer", profile_jobs(urd.joblist.find('sum positive small integer')))
    pl("  large number", profile_jobs(urd.joblist.find('sum positive large number')))
    pl("  gauss number", profile_jobs(urd.joblist.find('sum positive gauss number')))
    pl("  gauss float", profile_jobs(urd.joblist.find('sum positive gauss float')))
    print()
    print("histogram")
    pl("  number", profile_jobs(urd.joblist.find('histogram_number')))
    pl("  float", profile_jobs(urd.joblist.find('histogram_float')))
    print()
    pl("find string", profile_jobs(urd.joblist.find('find_string')))
    print()
    print("Total test time                %10.3f" % (profile_jobs(urd.joblist),))
    print()
    print('Example size is %s lines.' % ('{0:n}'.format(num_datasets * num_rows),))
    print('Number of slices is %d.' % (urd.info.slices,))
    print('-' * 56)
def ck(jid, method="dataset_checksum", **kw):
    jid = subjobs.build(method, datasets=dict(source=jid), options=kw)
    return blob.load(jobid=jid).sum
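# Hypothetical usage sketch of the ck() helper above. The function name and
# jobid arguments are made up, but the pattern (build a checksum subjob per
# source and compare the sums) is what the helper is for; any options such
# as columns or sort are passed straight through to the checksum method.
def assert_same_data(jid_a, jid_b, **kw):
    # Two jobs that should contain the same data must checksum equal.
    if ck(jid_a, **kw) != ck(jid_b, **kw):
        raise Exception('checksum mismatch between %s and %s' % (jid_a, jid_b))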
def main(urd):
    # Example 1. Create a chain of datasets containing random data.
    jid_prev = None
    for n in range(5):
        jid_ds = urd.build(
            'example1_create_dataset',
            datasets=dict(previous=jid_prev),
            options=dict(approx_rows=100000, seed=n),
            name='Created_number_%s' % (n,),
        )
        jid_prev = jid_ds

    # Example 2. Export the last dataset in the chain to a tab
    # separated textfile.
    jid_exp = urd.build(
        'csvexport',
        datasets=dict(source=jid_ds),
        options=dict(filename='random.tsv', separator='\t'),
    )
    filename = resolve_jobid_filename(jid_exp, 'random.tsv')
    print('Exported file stored in "%s"' % (filename,))

    # Example 3. Import the tab separated textfile and type it.
    jid_imp = urd.build(
        'csvimport',
        options=dict(filename=filename, separator='\t', labelsonfirstline=True),
    )
    jid_typ = urd.build(
        'dataset_type',
        datasets=dict(source=jid_imp),
        options=dict(column2type=dict(rflt='number', rint='number')),
    )

    # Example 4. Run a method computing the average of a column, in a
    # loop, one column at a time. The column name is an input parameter.
    for column in Dataset(jid_typ).columns:
        jid_avg = urd.build(
            'example1_calc_average',
            datasets=dict(source=jid_typ),
            options=dict(column=column),
        )
        (s, n) = blob.load(jobid=jid_avg)
        print("Column %s: sum=%f, length=%d, average=%f" % (column, s, n, s / n))

    # Example 5. Create a new column that is the product of two
    # existing columns.
    jid_add = urd.build(
        'example1_add_column',
        datasets=dict(source=jid_typ),
    )

    # Example 6. Export a dataset with named columns in specified
    # order.
    jid_add_exp = urd.build(
        'csvexport',
        datasets=dict(source=jid_add),
        options=dict(filename='prod.csv', labels=('prod', 'rflt', 'rint',)),
    )

    print(urd.joblist.pretty)