# Stdlib imports used below. DotDict, blob, json_save, Dataset, ffi, backend,
# empty_spilldata and type2iter are assumed to be provided by the surrounding
# module/job, as in the rest of this file.
from datetime import date, time, datetime
from os import unlink
from os.path import exists

def real_synthesis(params, options, datasets, minmax_index, prepare_res, we_have_spill, save_discard=False):
    stats = DotDict(
        included_lines          = [0] * params.slices,
        discarded_lines         = [0] * params.slices,
        spilled_lines           = [0] * params.slices,
        virtually_spilled_lines = [0] * params.slices,
        split_date              = str(options.split_date) if options.split_date else None,
        discard_before_date     = str(options.discard_before_date) if options.discard_before_date else None,
    )
    minmax_per_slice = [{} for _ in range(params.slices)]
    def update_stats(data):
        # sliceno is picked up from the enclosing loop below.
        for item in data.itervalues():
            stats.included_lines[sliceno] += item.counters[2]
            stats.discarded_lines[sliceno] += item.counters[1]
            if item.virtual_spill:
                stats.virtually_spilled_lines[sliceno] += item.counters[3]
            else:
                stats.spilled_lines[sliceno] += item.counters[3]
            update_minmax(minmax_per_slice[sliceno], item.minmax)
    def update_minmax(dest, src):
        for name, lst0 in src.iteritems():
            lst1 = dest.get(name, lst0)
            mins = map(min, zip(lst0[:3], lst1[:3]))
            maxs = map(max, zip(lst0[3:], lst1[3:]))
            dest[name] = mins + maxs
    for sliceno in range(params.slices):
        update_stats(blob.load('stats', sliceno=sliceno))
    minmax = {}
    for item in minmax_per_slice:
        update_minmax(minmax, item)
    def minmax_select(offset, stringify=False):
        d = {}
        for k, v in minmax.iteritems():
            mn = v[offset]
            mx = v[3 + offset]
            if mn <= mx:
                if stringify and isinstance(mn, (date, time)):
                    d[k] = [str(mn), str(mx)]
                else:
                    d[k] = [mn, mx]
        return d
    dw, dw_spill = prepare_res[:2]
    dw.set_minmax(None, minmax_select(minmax_index))
    dw_spill.set_minmax(None, minmax_select(2))
    if save_discard:
        included_lines = stats.discarded_lines
    else:
        included_lines = stats.included_lines
    for sliceno in range(params.slices):
        dw.set_lines(sliceno, included_lines[sliceno])
        dw_spill.set_lines(sliceno, stats.spilled_lines[sliceno])
    if not we_have_spill:
        dw_spill.discard()
    stats.minmax_discarded = minmax_select(0, True)
    stats.minmax           = minmax_select(1, True)
    stats.minmax_spilled   = minmax_select(2, True)
    json_save(stats)
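# A minimal sketch of how update_minmax above combines two slices' per-column
# minmax lists. The 6-element layout [min0, min1, min2, max0, max1, max2] is
# an assumption inferred from minmax_select(), whose offsets 0/1/2 line up
# with the discarded/kept/spilled selections made at the end of
# real_synthesis:
#
#     a = [1, 5, 3, 9, 8, 7]               # mins for classes 0..2, then maxes
#     b = [2, 4, 4, 6, 9, 9]
#     mins = map(min, zip(a[:3], b[:3]))   # [1, 4, 3]
#     maxs = map(max, zip(a[3:], b[3:]))   # [9, 9, 9]
#     merged = mins + maxs                 # [1, 4, 3, 9, 9, 9]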
def process_one(sliceno, options, source, prepare_res, data=None, save_discard=False):
    # Future improvement: look at the old minmax to determine if we will get
    # anything from reading this data.
    dw, dw_spill, column_names, column_sizes, column_types, minmax_typeidx = prepare_res
    if data:
        assert data.version == 1
        data.seen_before = True
    else:
        data = empty_spilldata()
    d = Dataset(source, data.spill_ds)
    in_files = []
    out_files = []
    offsets = []
    if not save_discard:
        # don't save "too old" lines
        out_files += [ffi.NULL] * len(column_names)
    minmax_files = []
    minmax_d = {}
    for colname in column_names:
        out_fn = dw.column_filename(colname, sliceno).encode('ascii')
        in_fn = d.column_filename(colname, sliceno).encode('ascii')
        offset = d.columns[colname].offsets[sliceno] if d.columns[colname].offsets else 0
        in_files.append(ffi.new('char []', in_fn))
        out_files.append(ffi.new('char []', out_fn))
        offsets.append(offset)
        minmax_fn = out_fn + '_minmax'
        minmax_files.append(ffi.new('char []', minmax_fn))
        minmax_d[colname] = minmax_fn
    if save_discard:
        # don't save "good" lines (save discard instead)
        out_files += [ffi.NULL] * len(column_names)
    date_coltype = column_types[options.date_column]
    def date2cfmt(dt):
        if date_coltype == 'datetime':
            date0 = (dt.year << 14) | (dt.month << 10) | (dt.day << 5) | dt.hour
            date1 = (dt.minute << 26) | (dt.second << 20) | dt.microsecond
        elif date_coltype == 'date':
            date0 = (dt.year << 9) | (dt.month << 5) | dt.day
            date1 = 0
        elif date_coltype == 'time':
            # 32277536 == (1970 << 14) | (1 << 10) | (1 << 5), i.e. times are
            # packed as if they fell on 1970-01-01.
            date0 = 32277536 | dt.hour
            date1 = (dt.minute << 26) | (dt.second << 20) | dt.microsecond
        else:
            raise Exception('Bad date_coltype type: ' + date_coltype)
        return date0, date1
    dates = [0, 0, 0, 0, 0xffffffff, 0xffffffff]
    stats = DotDict()
    if data.seen_before:
        dates[0:2] = date2cfmt(data.get('process_date', datetime.min))
    if (data.last_time or options.hard_spill) and not save_discard:
        for colname in column_names:
            out_fn = dw_spill.column_filename(colname, sliceno).encode('ascii')
            out_files.append(ffi.new('char []', out_fn))
        stats.virtual_spill = False
    else:
        # We still have to make sure the files exist, or we end up with a
        # broken dataset if only some slices wanted to spill.
        for colname in column_names:
            open(dw_spill.column_filename(colname, sliceno), 'ab').close()
        out_files += [ffi.NULL] * len(column_names)
        stats.virtual_spill = True
    # We are done reading `data` - update it for the next iteration.
    del data.seen_before
    data.process_date = datetime.min
    if options.discard_before_date:
        if options.split_date:
            assert options.discard_before_date < options.split_date
        dates[2:4] = date2cfmt(options.discard_before_date)
        data.process_date = options.discard_before_date
    if options.split_date:
        dates[4:6] = date2cfmt(options.split_date)
        data.process_date = max(data.process_date, options.split_date)
    counters = ffi.new('uint64_t [4]')  # one for each class-enum
    res = backend.filter(
        len(in_files), in_files, offsets, out_files, minmax_files,
        column_sizes, counters, dates, minmax_typeidx, d.lines[sliceno],
    )
    assert not res, "cffi converter returned error on data from " + source
    stats.version = 0
    stats.counters = list(counters)
    stats.minmax = {}
    for colname, fn in minmax_d.iteritems():
        if exists(fn):
            with type2iter[column_types[colname]](fn) as it:
                stats.minmax[colname] = list(it)
            unlink(fn)
    # If there is at most 2% left, spill it next time.
    # Or if there is at most 10% left and we have read it at least 8 times.
    # Or if there is at most 20% left and we have read it at least 16 times.
    # A reasonable balance between re-reading and re-writing, one hopes.
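    # Worked example of those thresholds (plain arithmetic, nothing assumed):
    # with total_lines == 1000 and counters[3] == 90 spilled lines, the 2%
    # rule does not fire (90 > 1000/50 == 20), but once data.counter reaches
    # 8, 90 <= 1000/10 == 100 and last_time becomes True.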
    data.counter += 1
    total_lines = sum(counters)
    data.last_time = (
        counters[3] <= total_lines / 50
        or (data.counter >= 8 and counters[3] <= total_lines / 10)
        or (data.counter >= 16 and counters[3] <= total_lines / 5)
    )
    # If no lines were spilled we will not need this dataset again,
    # nor if we wrote the spill in this dataset.
    if not counters[3] or not stats.virtual_spill:
        data = None
    return data, stats
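# A worked example of the date2cfmt packing in process_one, for the
# 'datetime' case (plain bit arithmetic on the fields, nothing assumed
# beyond the code above):
#
#     dt = datetime(2024, 3, 15, 12, 30, 45, 123456)
#     date0 = (2024 << 14) | (3 << 10) | (15 << 5) | 12  # == 33164780
#     date1 = (30 << 26) | (45 << 20) | 123456           # == 2060575296
#
# The two words sort the same way the datetimes do, which is what lets the
# backend compare lines against the packed discard/split boundaries in
# `dates` with plain integer comparisons.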