Example #1
def test(name, input, want_obj, want_bytes, **kw):
    # Save with json_save and verify the file ends with a newline.
    json_save(input, name, **kw)
    with open(name, "rb") as fh:
        got_bytes_raw = fh.read()
        assert got_bytes_raw[-1:] == b"\n", name + " didn't even end with a newline"
        got_bytes_raw = got_bytes_raw[:-1]
    # json_encode must return str for as_str=True and bytes for as_str=False,
    # and both must match what json_save wrote to disk.
    as_str = json_encode(input, as_str=True, **kw)
    as_bytes = json_encode(input, as_str=False, **kw)
    assert isinstance(as_str, str) and isinstance(as_bytes, bytes), \
        "json_encode returns the wrong types: %s %s" % (type(as_str), type(as_bytes))
    assert as_bytes == got_bytes_raw, \
        "json_save doesn't save the same thing json_encode returns for " + name
    if PY3:
        as_str = as_str.encode("utf-8")
    assert as_bytes == as_str, \
        "json_encode doesn't return the same data for as_str=True and False"
    # Loading the file back must give an equal object.
    got_obj = json_load(name)
    assert want_obj == got_obj, \
        "%s roundtrips wrong (wanted %r, got %r)" % (name, want_obj, got_obj)
    # Compare the on-disk bytes (ignoring per-line whitespace) with the expectation.
    with open(name, "rb") as fh:
        got_bytes_fuzzy = b"".join(line.strip() for line in fh)
    assert want_bytes == got_bytes_fuzzy, \
        "%s wrong on disk (but decoded right)" % (name, )
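
Taken together, the assertions above pin down the contract between json_encode, json_save and json_load: the file json_save writes is the json_encode output plus a trailing newline, as_str=True and as_str=False encode the same data, and json_load returns an equal object. A minimal round-trip sketch under those assumptions (the filename and sample object are made up for illustration):

# Hedged sketch of the round trip the test above verifies.
# "example.json" and the sample dict are illustrative, not from the test suite.
obj = {"name": "alpha", "values": [1, 2, 3]}
json_save(obj, "example.json")
assert json_load("example.json") == obj
with open("example.json", "rb") as fh:
    assert fh.read() == json_encode(obj, as_str=False) + b"\n"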
Example #2
	def run_job(self, jobid, subjob_cookie=None, parent_pid=0):
		W = self.workspaces[get_workspace_name(jobid)]
		# Map each source workdir name to its path so the job can find all of them.
		active_workspaces = {}
		for name in self.source_workdirs:
			active_workspaces[name] = self.workspaces[name].get_path()
		slices = self.workspaces[self.target_workdir].get_slices()

		t0 = time.time()
		setup = update_setup(jobid, starttime=t0)
		prof = setup.profile or DotDict()
		new_prof, files, subjobs = dispatch.launch(W.path, setup, self.config, self.Methods, active_workspaces, slices, self.debug, self.daemon_url, subjob_cookie, parent_pid)
		# Delete temporary output files at or above the chosen cutoff level.
		if self.debug:
			delete_from = Temp.TEMP
		else:
			delete_from = Temp.DEBUG
		for filename, temp in list(files.items()):
			if temp >= delete_from:
				unlink(join(W.path, jobid, filename))
				del files[filename]
		# Merge profiling data; total is zeroed first so an old value is not counted.
		prof.update(new_prof)
		prof.total = 0
		prof.total = sum(v for v in prof.values() if isinstance(v, (float, int)))
		data = dict(
			starttime=t0,
			endtime=time.time(),
			profile=prof,
		)
		# Times and profile go into the job setup; files and subjobs only into post.json.
		update_setup(jobid, **data)
		data['files'] = files
		data['subjobs'] = subjobs
		json_save(data, resolve_jobid_filename(jobid, 'post.json'))
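
The post.json written at the end of run_job therefore holds the timing and profile data that also went into the job setup via update_setup, plus the files and subjobs lists that are only stored in post.json. A rough sketch of reading it back (the jobid 'TEST-0' is made up for illustration):

# Hedged sketch: load the post.json saved above for some finished job.
post = json_load(resolve_jobid_filename('TEST-0', 'post.json'))
print(post['profile']['total'])  # sum of the numeric profile entries
print(post['files'])             # output files that survived the temp-file cleanup
print(post['subjobs'])           # subjobs launched by this job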
Example #3
def real_synthesis(params, options, datasets, minmax_index, prepare_res, we_have_spill, save_discard=False):
	# Per-slice line counters plus the dates used for splitting and discarding.
	stats = DotDict(
		included_lines          = [0] * params.slices,
		discarded_lines         = [0] * params.slices,
		spilled_lines           = [0] * params.slices,
		virtually_spilled_lines = [0] * params.slices,
		split_date              = str(options.split_date) if options.split_date else None,
		discard_before_date     = str(options.discard_before_date) if options.discard_before_date else None,
	)
	minmax_per_slice = [{} for _ in range(params.slices)]
	def update_stats(data):
		# Accumulate the per-dataset counters for the current slice.
		for item in data.itervalues():
			stats.included_lines[sliceno] += item.counters[2]
			stats.discarded_lines[sliceno] += item.counters[1]
			if item.virtual_spill:
				stats.virtually_spilled_lines[sliceno] += item.counters[3]
			else:
				stats.spilled_lines[sliceno] += item.counters[3]
			update_minmax(minmax_per_slice[sliceno], item.minmax)
	def update_minmax(dest, src):
		# Each value is a six-element list (three mins, then three maxes); merge element-wise.
		for name, lst0 in src.iteritems():
			lst1 = dest.get(name, lst0)
			mins = map(min, zip(lst0[:3], lst1[:3]))
			maxs = map(max, zip(lst0[3:], lst1[3:]))
			dest[name] = mins + maxs
	for sliceno in range(params.slices):
		update_stats(blob.load('stats', sliceno=sliceno))
	# Merge the per-slice minmax data into one dict for the whole dataset.
	minmax = {}
	for item in minmax_per_slice:
		update_minmax(minmax, item)
	def minmax_select(offset, stringify=False):
		# Pick min/max pair number `offset` for every column, optionally stringified.
		d = {}
		for k, v in minmax.iteritems():
			mn = v[offset]
			mx = v[3 + offset]
			if mn <= mx:
				if stringify and isinstance(mn, (date, time,)):
					d[k] = [str(mn), str(mx)]
				else:
					d[k] = [mn, mx]
		return d
	dw, dw_spill = prepare_res[:2]
	dw.set_minmax(None, minmax_select(minmax_index))
	dw_spill.set_minmax(None, minmax_select(2))
	# With save_discard the discarded lines are what this job actually keeps.
	if save_discard:
		included_lines = stats.discarded_lines
	else:
		included_lines = stats.included_lines
	for sliceno in range(params.slices):
		dw.set_lines(sliceno, included_lines[sliceno])
		dw_spill.set_lines(sliceno, stats.spilled_lines[sliceno])
	if not we_have_spill:
		dw_spill.discard()
	stats.minmax_discarded = minmax_select(0, True)
	stats.minmax           = minmax_select(1, True)
	stats.minmax_spilled   = minmax_select(2, True)
	json_save(stats)
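
update_minmax assumes every per-column value is a six-element list: three minimum candidates followed by three maximum candidates, matching the offset 0/1/2 picks in minmax_select (discarded, included and spilled respectively). A self-contained sketch of the same merge, with column name and values invented for illustration:

# Standalone version of the merge done by update_minmax above.
def merge_minmax(dest, src):
    for name, lst0 in src.items():
        lst1 = dest.get(name, lst0)
        mins = [min(a, b) for a, b in zip(lst0[:3], lst1[:3])]
        maxs = [max(a, b) for a, b in zip(lst0[3:], lst1[3:])]
        dest[name] = mins + maxs

merged = {}
merge_minmax(merged, {'ts': [1, 2, 3, 7, 8, 9]})
merge_minmax(merged, {'ts': [0, 5, 5, 6, 6, 6]})
print(merged)  # {'ts': [0, 2, 3, 7, 8, 9]}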
Example #4
def synthesis(params, prepare_res):
	source_params = job_params(datasets.source)
	source_params.options.caption = options.caption
	# Run the datesplit synthesis on the source job's options, but keep the
	# discarded side of the split (save_discard=True).
	a_dataset_datesplit.real_synthesis(params, source_params.options, source_params.datasets, 0, prepare_res, False, save_discard=True)
	# Republish the discard statistics as this job's own result.
	stats = json_load()
	json_save(dict(
		minmax              = stats.minmax_discarded,
		included_lines      = stats.discarded_lines,
		split_date          = stats.split_date,
		discard_before_date = stats.discard_before_date,
	))
Example #5
def synthesis(params, analysis_res, prepare_res):
    r = report()
    res = DotDict()
    d = datasets.source
    analysis_res = list(analysis_res)
    if options.filter_bad:
        # Subtract the bad lines found during analysis from each slice's line count.
        num_lines_per_split = [
            num - data[1] for num, data in zip(d.lines, analysis_res)
        ]
        res.bad_line_count_per_slice = [data[1] for data in analysis_res]
        res.bad_line_count_total = sum(res.bad_line_count_per_slice)
        r.println('Slice   Bad line count')
        for sliceno, cnt in enumerate(res.bad_line_count_per_slice):
            r.println('%5d   %d' % (
                sliceno,
                cnt,
            ))
        r.println('total   %d' % (res.bad_line_count_total, ))
        r.line()
        r.println('Slice   Bad line number')
        reported_count = 0
        for sliceno, data in enumerate(analysis_res):
            fn = 'badmap%d' % (sliceno, )
            if data[1] and reported_count < 32:
                # The badmap is a bitmap: bit jx of byte ix set means line
                # ix * 8 + jx in this slice was bad. At most 32 lines are reported.
                with open(fn, 'rb') as fh:
                    badmap = mmap(fh.fileno(), 0, prot=PROT_READ)
                    for ix, v in enumerate(imap(ord, badmap)):
                        if v:
                            for jx in range(8):
                                if v & (1 << jx):
                                    r.println('%5d   %d' % (
                                        sliceno,
                                        ix * 8 + jx,
                                    ))
                                    reported_count += 1
                                    if reported_count >= 32: break
                            if reported_count >= 32: break
                    badmap.close()
            unlink(fn)
        if reported_count >= 32:
            r.println('...')
        r.line()
        res.bad_line_count_per_column = {}
        r.println('Bad line count   Column')
        for colname in sorted(analysis_res[0][0]):
            cnt = sum(data[0][colname] for data in analysis_res)
            r.println('%14d   %s' % (
                cnt,
                colname,
            ))
            res.bad_line_count_per_column[colname] = cnt
        r.line()
    else:
        num_lines_per_split = d.lines
    # prepare_res is the dataset writer; record the surviving line count per slice.
    dw = prepare_res
    for sliceno, count in enumerate(num_lines_per_split):
        dw.set_lines(sliceno, count)
    if options.defaults:
        # Report how many lines fell back to a default value, per column and per slice.
        r.println('Defaulted values')
        res.defaulted_per_slice = {}
        res.defaulted_total = {}
        for colname in sorted(options.defaults):
            r.println('    %s:' % (colname, ))
            r.println('        Slice   Defaulted line count')
            res.defaulted_per_slice[colname] = [
                data[2][colname] for data in analysis_res
            ]
            res.defaulted_total[colname] = sum(
                res.defaulted_per_slice[colname])
            for sliceno, cnt in enumerate(res.defaulted_per_slice[colname]):
                r.println('        %5d   %d' % (
                    sliceno,
                    cnt,
                ))
            r.println('        total   %d' % (res.defaulted_total[colname], ))
        r.line()
    # Store the per-slice min/max values and finish writing the dataset.
    for sliceno, data in enumerate(analysis_res):
        dw.set_minmax(sliceno, data[3])
    d = dw.finish()
    res.good_line_count_per_slice = num_lines_per_split
    res.good_line_count_total = sum(num_lines_per_split)
    r.line()
    r.println('Total of %d lines converted' % (res.good_line_count_total, ))
    r.close()
    json_save(res)
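
The badmap loop above treats the mmapped file as a bitmap: bit jx of byte ix being set marks line ix * 8 + jx of that slice as bad. A small self-contained sketch of the same decoding, with an in-memory bytes value standing in for the mmap:

# Decode bad line numbers from a bitmap; the sample bytes are made up.
def bad_lines(bitmap):
    for ix, v in enumerate(bytearray(bitmap)):
        for jx in range(8):
            if v & (1 << jx):
                yield ix * 8 + jx

print(list(bad_lines(b'\x05\x00\x80')))  # [0, 2, 23]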