# Stdlib imports used below. DotDict, blob, json_save, Dataset, ffi, backend,
# empty_spilldata and type2iter are assumed to be provided by the surrounding
# module/job, as in the rest of this file.
from datetime import date, time, datetime
from os import unlink
from os.path import exists

def real_synthesis(params, options, datasets, minmax_index, prepare_res, we_have_spill, save_discard=False):
    stats = DotDict(
        included_lines          = [0] * params.slices,
        discarded_lines         = [0] * params.slices,
        spilled_lines           = [0] * params.slices,
        virtually_spilled_lines = [0] * params.slices,
        split_date              = str(options.split_date) if options.split_date else None,
        discard_before_date     = str(options.discard_before_date) if options.discard_before_date else None,
    )
    minmax_per_slice = [{} for _ in range(params.slices)]
    def update_stats(data):
        # sliceno is picked up from the enclosing loop below.
        for item in data.itervalues():
            stats.included_lines[sliceno] += item.counters[2]
            stats.discarded_lines[sliceno] += item.counters[1]
            if item.virtual_spill:
                stats.virtually_spilled_lines[sliceno] += item.counters[3]
            else:
                stats.spilled_lines[sliceno] += item.counters[3]
            update_minmax(minmax_per_slice[sliceno], item.minmax)
    def update_minmax(dest, src):
        for name, lst0 in src.iteritems():
            lst1 = dest.get(name, lst0)
            mins = map(min, zip(lst0[:3], lst1[:3]))
            maxs = map(max, zip(lst0[3:], lst1[3:]))
            dest[name] = mins + maxs
    for sliceno in range(params.slices):
        update_stats(blob.load('stats', sliceno=sliceno))
    minmax = {}
    for item in minmax_per_slice:
        update_minmax(minmax, item)
    def minmax_select(offset, stringify=False):
        d = {}
        for k, v in minmax.iteritems():
            mn = v[offset]
            mx = v[3 + offset]
            if mn <= mx:
                if stringify and isinstance(mn, (date, time)):
                    d[k] = [str(mn), str(mx)]
                else:
                    d[k] = [mn, mx]
        return d
    dw, dw_spill = prepare_res[:2]
    dw.set_minmax(None, minmax_select(minmax_index))
    dw_spill.set_minmax(None, minmax_select(2))
    if save_discard:
        included_lines = stats.discarded_lines
    else:
        included_lines = stats.included_lines
    for sliceno in range(params.slices):
        dw.set_lines(sliceno, included_lines[sliceno])
        dw_spill.set_lines(sliceno, stats.spilled_lines[sliceno])
    if not we_have_spill:
        dw_spill.discard()
    stats.minmax_discarded = minmax_select(0, True)
    stats.minmax           = minmax_select(1, True)
    stats.minmax_spilled   = minmax_select(2, True)
    json_save(stats)
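# A minimal sketch of how update_minmax above combines two slices' per-column
# minmax lists. The 6-element layout [min0, min1, min2, max0, max1, max2] is
# an assumption inferred from minmax_select(), whose offsets 0/1/2 line up
# with the discarded/kept/spilled selections made at the end of
# real_synthesis:
#
#     a = [1, 5, 3, 9, 8, 7]               # mins for classes 0..2, then maxes
#     b = [2, 4, 4, 6, 9, 9]
#     mins = map(min, zip(a[:3], b[:3]))   # [1, 4, 3]
#     maxs = map(max, zip(a[3:], b[3:]))   # [9, 9, 9]
#     merged = mins + maxs                 # [1, 4, 3, 9, 9, 9]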
def process_one(sliceno, options, source, prepare_res, data=None, save_discard=False):
    # Future improvement: look at the old minmax to determine if we will get
    # anything from reading this data.
    dw, dw_spill, column_names, column_sizes, column_types, minmax_typeidx = prepare_res
    if data:
        assert data.version == 1
        data.seen_before = True
    else:
        data = empty_spilldata()
    d = Dataset(source, data.spill_ds)
    in_files = []
    out_files = []
    offsets = []
    if not save_discard:
        # don't save "too old" lines
        out_files += [ffi.NULL] * len(column_names)
    minmax_files = []
    minmax_d = {}
    for colname in column_names:
        out_fn = dw.column_filename(colname, sliceno).encode('ascii')
        in_fn = d.column_filename(colname, sliceno).encode('ascii')
        offset = d.columns[colname].offsets[sliceno] if d.columns[colname].offsets else 0
        in_files.append(ffi.new('char []', in_fn))
        out_files.append(ffi.new('char []', out_fn))
        offsets.append(offset)
        minmax_fn = out_fn + '_minmax'
        minmax_files.append(ffi.new('char []', minmax_fn))
        minmax_d[colname] = minmax_fn
    if save_discard:
        # don't save "good" lines (save discard instead)
        out_files += [ffi.NULL] * len(column_names)
    date_coltype = column_types[options.date_column]
    def date2cfmt(dt):
        if date_coltype == 'datetime':
            date0 = (dt.year << 14) | (dt.month << 10) | (dt.day << 5) | dt.hour
            date1 = (dt.minute << 26) | (dt.second << 20) | dt.microsecond
        elif date_coltype == 'date':
            date0 = (dt.year << 9) | (dt.month << 5) | dt.day
            date1 = 0
        elif date_coltype == 'time':
            # 32277536 == (1970 << 14) | (1 << 10) | (1 << 5), i.e. times are
            # packed as if they fell on 1970-01-01.
            date0 = 32277536 | dt.hour
            date1 = (dt.minute << 26) | (dt.second << 20) | dt.microsecond
        else:
            raise Exception('Bad date_coltype type: ' + date_coltype)
        return date0, date1
    dates = [0, 0, 0, 0, 0xffffffff, 0xffffffff]
    stats = DotDict()
    if data.seen_before:
        dates[0:2] = date2cfmt(data.get('process_date', datetime.min))
    if (data.last_time or options.hard_spill) and not save_discard:
        for colname in column_names:
            out_fn = dw_spill.column_filename(colname, sliceno).encode('ascii')
            out_files.append(ffi.new('char []', out_fn))
        stats.virtual_spill = False
    else:
        # We still have to make sure the files exist, or we end up with a
        # broken dataset if only some slices wanted to spill.
        for colname in column_names:
            open(dw_spill.column_filename(colname, sliceno), 'ab').close()
        out_files += [ffi.NULL] * len(column_names)
        stats.virtual_spill = True
    # We are done reading `data` - update it for the next iteration.
    del data.seen_before
    data.process_date = datetime.min
    if options.discard_before_date:
        if options.split_date:
            assert options.discard_before_date < options.split_date
        dates[2:4] = date2cfmt(options.discard_before_date)
        data.process_date = options.discard_before_date
    if options.split_date:
        dates[4:6] = date2cfmt(options.split_date)
        data.process_date = max(data.process_date, options.split_date)
    counters = ffi.new('uint64_t [4]')  # one for each class-enum
    res = backend.filter(
        len(in_files), in_files, offsets, out_files, minmax_files,
        column_sizes, counters, dates, minmax_typeidx, d.lines[sliceno],
    )
    assert not res, "cffi converter returned error on data from " + source
    stats.version = 0
    stats.counters = list(counters)
    stats.minmax = {}
    for colname, fn in minmax_d.iteritems():
        if exists(fn):
            with type2iter[column_types[colname]](fn) as it:
                stats.minmax[colname] = list(it)
            unlink(fn)
    # If there is at most 2% left, spill it next time.
    # Or if there is at most 10% left and we have read it at least 8 times.
    # Or if there is at most 20% left and we have read it at least 16 times.
    # A reasonable balance between re-reading and re-writing, one hopes.
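    # Worked example of those thresholds (plain arithmetic, nothing assumed):
    # with total_lines == 1000 and counters[3] == 90 spilled lines, the 2%
    # rule does not fire (90 > 1000/50 == 20), but once data.counter reaches
    # 8, 90 <= 1000/10 == 100 and last_time becomes True.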
    data.counter += 1
    total_lines = sum(counters)
    data.last_time = (
        counters[3] <= total_lines / 50
        or (data.counter >= 8 and counters[3] <= total_lines / 10)
        or (data.counter >= 16 and counters[3] <= total_lines / 5)
    )
    # If no lines were spilled we will not need this dataset again,
    # nor if we wrote the spill in this dataset.
    if not counters[3] or not stats.virtual_spill:
        data = None
    return data, stats
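# A worked example of the date2cfmt packing in process_one, for the
# 'datetime' case (plain bit arithmetic on the fields, nothing assumed
# beyond the code above):
#
#     dt = datetime(2024, 3, 15, 12, 30, 45, 123456)
#     date0 = (2024 << 14) | (3 << 10) | (15 << 5) | 12  # == 33164780
#     date1 = (30 << 26) | (45 << 20) | 123456           # == 2060575296
#
# The two words sort the same way the datetimes do, which is what lets the
# backend compare lines against the packed discard/split boundaries in
# `dates` with plain integer comparisons.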