def mkds(name, columns, data, **kw):
    columns = dict.fromkeys(columns, 'int32')
    dw = DatasetWriter(name=name, columns=columns, **kw)
    write = dw.get_split_write()
    for v in data:
        write(*v)
    return dw.finish()
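A hypothetical call to this helper (the dataset name, column names and row values below are made up, and it assumes the usual accelerator method context) might look like:
ds = mkds('small_example', ['a', 'b'], [(1, 2), (3, 4)])
# iterate over all slices; both columns come back as int32
print(list(ds.iterate(None)))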
Example #2
def prepare(params):
    d = datasets.source
    caption = options.caption % dict(caption=d.caption,
                                     hashlabel=options.hashlabel)
    if len(d.chain(stop_ds={datasets.previous: 'source'}, length=options.length)) == 1:
        filename = d.filename
    else:
        filename = None
    dws = []
    previous = datasets.previous
    for sliceno in range(params.slices):
        if options.as_chain and sliceno == params.slices - 1:
            name = "default"
        else:
            name = str(sliceno)
        dw = DatasetWriter(
            caption="%s (slice %d)" % (caption, sliceno),
            hashlabel=options.hashlabel,
            filename=filename,
            previous=previous,
            name=name,
            for_single_slice=sliceno,
        )
        previous = (params.jobid, name)
        dws.append(dw)
    names = []
    for n, c in d.columns.items():
        # names has to be in the same order as the add calls
        # so the iterator returns the same order the writer expects.
        names.append(n)
        for dw in dws:
            dw.add(n, c.type)
    return dws, names, caption, filename
Example #3
def prepare():
    columns = dict(
        bytes="bytes",
        float="float64",
        int="int64",
        json="json",
        unicode="unicode",
    )
    a = DatasetWriter(name="a", columns=columns)
    b = DatasetWriter(name="b", columns=columns, previous=a)
    c = DatasetWriter(name="c", columns=columns)
    return a, b, c
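A hedged sketch of an analysis step that could fill these three writers, one row per slice per dataset; the values are purely illustrative and not the original test data:
def analysis(sliceno, prepare_res):
    for dw in prepare_res:
        dw.write_dict({
            'bytes': b'blob %d' % (sliceno,),
            'float': sliceno * 1.5,
            'int': sliceno,
            'json': {'slice': sliceno},
            'unicode': 'text %d' % (sliceno,),
        })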
def test_filter_bad_across_types():
	columns={
		'bytes': 'bytes',
		'float64': 'bytes',
		'int32_10': 'ascii',
		'json': 'unicode',
		'number:int': 'unicode',
		'unicode:utf-8': 'bytes',
	}
	# all_good, *values
	# Make sure all those types (except bytes) can filter other lines,
	# and be filtered by other lines. And that several filtering values
	# are not a problem (line 11).
	data = [
		(True,  b'first',    b'1.1', '1',  '"a"',   '001', b'ett',),
		(True,  b'second',   b'2.2', '2',  '"b"',   '02',  b'tv\xc3\xa5',),
		(True,  b'third',    b'3.3', '3',  '["c"]', '3.0', b'tre',),
		(False, b'fourth',   b'4.4', '4',  '"d"',   '4.4', b'fyra',),       # number:int bad
		(False, b'fifth',    b'5.5', '-',  '"e"',   '5',   b'fem',),        # int32_10 bad
		(False, b'sixth',    b'6.b', '6',  '"f"',   '6',   b'sex',),        # float64 bad
		[False, b'seventh',  b'7.7', '7',  '{"g"}', '7',   b'sju',],        # json bad
		(False, b'eigth',    b'8.8', '8',  '"h"',   '8',   b'\xa5\xc3tta',),# unicode:utf-8 bad
		(True,  b'ninth',    b'9.9', '9',  '"i"',   '9',   b'nio',),
		(True,  b'tenth',    b'10',  '10', '"j"',   '10',  b'tio',),
		(False, b'eleventh', b'11a', '1-', '"k",',  '1,',  b'elva',),       # float64, int32_10 and number:int bad
		(True,  b'twelfth',  b'12',  '12', '"l"',   '12',  b'tolv',),
	]
	dw = DatasetWriter(name="filter bad across types", columns=columns)
	dw.set_slice(0)
	want = []
	def add_want(v):
		want.append((int(v[3]), v[1], json.loads(v[4]), v[6].decode('utf-8'),))
	for v in data:
		if v[0]:
			add_want(v)
		dw.write(*v[1:])
	for sliceno in range(1, g.slices):
		dw.set_slice(sliceno)
	source_ds = dw.finish()
	# Once with just filter_bad, once with some defaults too.
	defaults = {}
	for _ in range(2):
		jid = subjobs.build(
			'dataset_type',
			datasets=dict(source=source_ds),
			options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
		)
		typed_ds = Dataset(jid)
		got = list(typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
		assert got == want, "Expected %r, got %r from %s (from %r%s)" % (want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
		# make more lines "ok" for the second lap
		defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
		add_want(data[3])
		add_want(data[5])
		data[6][4] = '"replacement"'
		add_want(data[6])
		want.sort() # adding them out of order, int32_10 sorts correctly.
Example #5
def make_source(names):
    names = sorted(names)
    dsname = '+'.join(names)
    if dsname not in sources:
        dw = DatasetWriter(name=dsname, columns={'v': 'ascii'})
        write = dw.get_split_write()
        for name in names:
            for value in data[name][0]:
                write(value)
        sources[dsname] = (
            dw.finish(),
            min(unnan(data[name][1] for name in names)),
            max(unnan(data[name][2] for name in names)),
        )
    return sources[dsname]
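This helper relies on module-level data, sources and unnan (not shown here). A hypothetical call, assuming 'a' and 'b' are keys in data:
ds, low, high = make_source(['a', 'b'])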
Example #6
 def datasetwriter(self,
                   columns={},
                   filename=None,
                   hashlabel=None,
                   hashlabel_override=False,
                   caption=None,
                   previous=None,
                   name='default',
                   parent=None,
                   meta_only=False,
                   for_single_slice=None,
                   copy_mode=False,
                   allow_missing_slices=False):
     from accelerator.dataset import DatasetWriter
     return DatasetWriter(columns=columns,
                          filename=filename,
                          hashlabel=hashlabel,
                          hashlabel_override=hashlabel_override,
                          caption=caption,
                          previous=previous,
                          name=name,
                          parent=parent,
                          meta_only=meta_only,
                          for_single_slice=for_single_slice,
                          copy_mode=copy_mode,
                          allow_missing_slices=allow_missing_slices)
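A minimal sketch of using this wrapper from a build method instead of importing DatasetWriter directly; the dataset name and column are made up:
def synthesis(job):
    dw = job.datasetwriter(name='wrapper_demo', columns={'x': 'int64'})
    write = dw.get_split_write()
    for x in range(10):
        write(x)
    return dw.finish()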
def analysis(sliceno, prepare_res, job):
    dw_default = DatasetWriter()
    dw_named = job.datasetwriter(name='named')
    dw_passed, _ = prepare_res
    for name, dw in [('default', dw_default), ('named', dw_named),
                     ('passed', dw_passed)]:
        for data in jobs.source.dataset(name).iterate(sliceno, copy_mode=True):
            dw.write(*data)
Example #8
def synthesis(prepare_res, params):
    if not options.as_chain:
        # If we don't want a chain we abuse our knowledge of dataset internals
        # to avoid recompressing. Don't do this stuff yourself.
        dws, names, caption, filename = prepare_res
        merged_dw = DatasetWriter(
            caption=caption,
            hashlabel=options.hashlabel,
            filename=filename,
            previous=datasets.previous,
            meta_only=True,
            columns=datasets.source.columns,
        )
        for sliceno in range(params.slices):
            merged_dw.set_lines(sliceno, sum(dw._lens[sliceno] for dw in dws))
            for dwno, dw in enumerate(dws):
                merged_dw.set_minmax((sliceno, dwno), dw._minmax[sliceno])
            for n in names:
                fn = merged_dw.column_filename(n, sliceno=sliceno)
                with open(fn, "wb") as out_fh:
                    for dw in dws:
                        fn = dw.column_filename(n, sliceno=sliceno)
                        with open(fn, "rb") as in_fh:
                            copyfileobj(in_fh, out_fh)
        for dw in dws:
            dw.discard()
def prepare(job, slices):
    assert slices >= test_data.value_cnt
    dw_default = DatasetWriter()
    dw_default.add("a", "number")
    dw_default.add("b", "ascii")
    DatasetWriter(name="named", columns={"c": "bool", "d": "date"})
    dw_passed = job.datasetwriter(name="passed", columns=test_data.columns)
    return dw_passed, 42
Example #10
def prepare():
    columns = dict(
        ascii="ascii",
        bytes="bytes",
        bytes_none=("bytes", True),
        float="float64",
        int="int64",
        json="json",
        unicode="unicode",
        unicode_none=("unicode", True),
    )
    if PY3:
        # z so it sorts last
        columns['zpickle'] = 'pickle'
        for ix, v in enumerate(test_data):
            test_data[ix] = v + ([ix, 'line %d' % (ix, ), {'line': ix}, 42], )
        test_data[-1][-1][-1] = float('-inf')
    a = DatasetWriter(name="a", columns=columns)
    b = DatasetWriter(name="b", columns=columns, previous=a)
    c = DatasetWriter(name="c", columns=columns)
    return a, b, c, test_data
Example #11
def prepare(params):
    assert params.slices >= 2, "Hashing won't do anything with just one slice"
    dws = DotDict()
    # all the numeric types should hash the same (for values they have in common)
    for name, hashlabel, typ in (
        ("unhashed_manual", None, "int32"),           # manually interleaved
        ("unhashed_split", None, "int64"),            # split_write interleaved
        ("up_checked", "up", "float32"),              # hashed on up using dw.hashcheck
        ("up_split", "up", "float64"),                # hashed on up using split_write
        ("down_checked", "down", "bits32"),           # hashed on down using dw.hashcheck
        ("down_discarded", "down", "bits64"),         # hashed on down using discarding writes
        ("down_discarded_list", "down", "number"),    # hashed on down using discarding list writes
        ("down_discarded_dict", "down", "complex32"), # hashed on down using discarding dict writes
        # we have too many types, so we need more datasets
        ("unhashed_complex64", None, "complex64"),
        ("unhashed_bytes", None, "bytes"),
        ("up_ascii", "up", "ascii"),
        ("down_unicode", "down", "unicode"),
        # datetime on 1970-01-01 hashes like time
        ("up_datetime", "up", "datetime"),
        ("down_time", "down", "time"),
        # date doesn't hash the same as anything else, so compare it to itself
        ("up_date", "up", "date"),
        ("down_date", "down", "date"),
    ):
        dw = DatasetWriter(name=name, hashlabel=hashlabel)
        dw.add("up", typ)
        dw.add("down", typ)
        dws[name] = dw
    return dws
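A hedged sketch of how the split-write variants returned above might be filled in synthesis; only two of the writers are shown and the value range is made up:
def synthesis(prepare_res):
    dws = prepare_res
    w_up = dws.up_split.get_split_write()
    w_un = dws.unhashed_split.get_split_write()
    for v in range(100):
        w_up(float(v), float(-v))  # routed by hashing the 'up' value
        w_un(v, -v)                # no hashlabel, so interleaved across slices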
def test_column_discarding():
	dw = DatasetWriter(name='column discarding')
	dw.add('a', 'bytes')
	dw.add('b', 'bytes')
	dw.add('c', 'bytes')
	w = dw.get_split_write()
	w(b'a', b'b', b'c')
	source = dw.finish()

	# Discard b because it's not typed
	ac_implicit = subjobs.build(
		'dataset_type',
		source=source,
		column2type=dict(a='ascii', c='ascii'),
		discard_untyped=True,
	).dataset()
	assert sorted(ac_implicit.columns) == ['a', 'c'], '%s: %r' % (ac_implicit, sorted(ac_implicit.columns),)
	assert list(ac_implicit.iterate(None)) == [('a', 'c',)], ac_implicit

	# Discard b explicitly
	ac_explicit = subjobs.build(
		'dataset_type',
		source=source,
		column2type=dict(a='ascii', c='ascii'),
		rename=dict(b=None),
	).dataset()
	assert sorted(ac_explicit.columns) == ['a', 'c'], '%s: %r' % (ac_explicit, sorted(ac_explicit.columns),)
	assert list(ac_explicit.iterate(None)) == [('a', 'c',)], ac_explicit

	# Discard c by overwriting it with b. Keep untyped b.
	ac_bASc = subjobs.build(
		'dataset_type',
		source=source,
		column2type=dict(a='ascii', c='ascii'),
		rename=dict(b='c'),
	).dataset()
	assert sorted(ac_bASc.columns) == ['a', 'b', 'c'], '%s: %r' % (ac_bASc, sorted(ac_bASc.columns),)
	assert list(ac_bASc.iterate(None)) == [('a', b'b', 'b',)], ac_bASc

	# Discard c by overwriting it with b. Also type b as a different type.
	abc_bASc = subjobs.build(
		'dataset_type',
		source=source,
		column2type=dict(a='ascii', b='strbool', c='ascii'),
		rename=dict(b='c'),
	).dataset()
	assert sorted(abc_bASc.columns) == ['a', 'b', 'c'], '%s: %r' % (abc_bASc, sorted(abc_bASc.columns),)
	assert list(abc_bASc.iterate(None)) == [('a', True, 'b',)], abc_bASc
def prepare(job):
    job.datasetwriter(columns=jobs.source.dataset().columns, copy_mode=True)
    DatasetWriter(name='named',
                  columns=jobs.source.dataset('named').columns,
                  copy_mode=True)
    dw_passed = job.datasetwriter(name='passed', copy_mode=True)
    # DatasetColumn in .add
    for n, c in sorted(jobs.source.dataset('passed').columns.items()):
        dw_passed.add(n, c)
    # verify that .add(none_support=) takes precedence over coltype
    dw_nonetest_removed = job.datasetwriter(name='nonetest_removed')
    for n, c in sorted(jobs.source.dataset('nonetest').columns.items()):
        dw_nonetest_removed.add(n, c, none_support=(n == 'unicode'))
    return dw_passed, dw_nonetest_removed
Example #14
def prepare(params):
    dws = {}
    prev = None
    for name in "abcdefgh":
        dw = DatasetWriter(name=name, previous=prev)
        dw.add("ds", "ascii")
        dw.add("num", "number")
        dws[name] = dw
        prev = dw
    return dws
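A hedged sketch of an analysis step giving each chained dataset one row per slice; the values are illustrative only:
def analysis(sliceno, prepare_res):
    # prepare_res is the dict of writers built above
    for name, dw in prepare_res.items():
        dw.write(ds=name, num=sliceno)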
Example #15
def prepare(params):
    d = datasets.source
    ds_list = d.chain(stop_ds={datasets.previous: 'source'})
    if options.sort_across_slices:
        columniter = partial(Dataset.iterate_list, None, datasets=ds_list)
        sort_idx = sort(columniter)
        total = len(sort_idx)
        per_slice = [total // params.slices] * params.slices
        extra = total % params.slices
        if extra:
            # spread the left over length over pseudo-randomly selected slices
            # (using the start of sort_idx to select slices).
            # this will always select the first slices if data is already sorted
            # but at least it's deterministic.
            selector = sorted(range(min(params.slices, total)),
                              key=sort_idx.__getitem__)
            for sliceno in selector[:extra]:
                per_slice[sliceno] += 1
        # change per_slice to be the actual sort indexes
        start = 0
        for ix, num in enumerate(per_slice):
            end = start + num
            per_slice[ix] = sort_idx[start:end]
            start = end
        assert sum(len(part) for part in per_slice) == total  # all rows used
        assert len(set(len(part) for part in per_slice)) < 3  # only 1 or 2 lengths possible
        sort_idx = per_slice
    else:
        sort_idx = None
    if options.sort_across_slices:
        hashlabel = None
    else:
        hashlabel = d.hashlabel
    if len(ds_list) == 1:
        filename = d.filename
    else:
        filename = None
    dw = DatasetWriter(
        columns=d.columns,
        caption=params.caption,
        hashlabel=hashlabel,
        filename=filename,
        previous=datasets.previous,
    )
    return dw, ds_list, sort_idx
def _verify(name, types, data, coltype, want, default, want_fail, kw):
	if callable(want):
		check = want
	else:
		def check(got, fromstr, filtered=False):
			want1 = want if isinstance(want, list) else want[typ]
			if filtered:
				want1 = want1[::2]
			assert got == want1, 'Expected %r, got %r from %s.' % (want1, got, fromstr,)
	dw = DatasetWriter(name=name, columns={'data': coltype, 'extra': 'bytes'})
	dw.set_slice(0)
	for ix, v in enumerate(data):
		dw.write(v, b'1' if ix % 2 == 0 else b'skip')
	for sliceno in range(1, g.slices):
		dw.set_slice(sliceno)
	bytes_ds = dw.finish()
	for typ in types:
		opts = dict(column2type=dict(data=typ))
		opts.update(kw)
		if default is not no_default:
			opts['defaults'] = {'data': default}
		try:
			jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
		except JobError:
			if want_fail:
				continue
			raise Exception('Typing %r as %s failed.' % (bytes_ds, typ,))
		assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (bytes_ds, typ, jid)
		typed_ds = Dataset(jid)
		got = list(typed_ds.iterate(0, 'data'))
		check(got, '%s (typed as %s from %r)' % (typed_ds, typ, bytes_ds,))
		if 'filter_bad' not in opts and not callable(want):
			opts['filter_bad'] = True
			opts['column2type']['extra'] = 'int32_10'
			jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
			typed_ds = Dataset(jid)
			got = list(typed_ds.iterate(0, 'data'))
			check(got, '%s (typed as %s from %r with every other line skipped from filter_bad)' % (typed_ds, typ, bytes_ds,), True)
		used_type(typ)
def analysis(sliceno, prepare_res, job):
    dw_default = job.datasetwriter()
    dw_named = DatasetWriter(name="named")
    dw_passed, num = prepare_res
    dw_default.write(a=sliceno, b="a")
    dw_default.write_list([num, str(sliceno)])
    dw_named.write(True, date(1536, 12, min(sliceno + 1, 31)))
    dw_named.write_dict({"c": False, "d": date(2236, 5, min(sliceno + 1, 31))})
    # slice 0 is written in synthesis
    if 0 < sliceno < test_data.value_cnt:
        dw_passed.write_dict(
            {k: v[sliceno]
             for k, v in test_data.data.items()})
def synthesis(job):
    manual_chain = [Dataset(jobids.selfchain, name) for name in "abcdefgh"]
    manual_abf = [manual_chain[0], manual_chain[1], manual_chain[5]]
    # build a local abf chain
    prev = None
    for ix, ds in enumerate(manual_abf):
        name = "abf%d" % (ix, )
        prev = ds.link_to_here(name, override_previous=prev)
    manual_abf_data = list(Dataset.iterate_list(None, None, manual_abf))
    local_abf_data = list(Dataset(job, "abf2").iterate_chain(None, None))
    assert manual_abf_data == local_abf_data
    # disconnect h, verify there is no chain
    manual_chain[-1].link_to_here("alone", override_previous=None)
    assert len(Dataset(job, "alone").chain()) == 1
    # check that the original chain is unhurt
    assert manual_chain == manual_chain[-1].chain()

    # So far so good, now make a chain long enough to have a cache.
    prev = None
    ix = 0
    going = True
    while going:
        if prev and "cache" in prev._data:
            going = False
        name = "longchain%d" % (ix, )
        dw = DatasetWriter(name=name, previous=prev)
        dw.add("ix", "number")
        dw.get_split_write()(ix)
        prev = dw.finish()
        ix += 1
    # we now have a chain that goes one past the first cache point
    full_chain = Dataset(prev).chain()
    assert "cache" in full_chain[
        -2]._data  # just to check the above logic is correct
    assert "cache" not in full_chain[-1]._data  # just to be sure..
    full_chain[-2].link_to_here("nocache", override_previous=None)
    full_chain[-1].link_to_here("withcache", override_previous=full_chain[-3])
    assert "cache" not in Dataset(job, "nocache")._data
    assert "cache" in Dataset(job, "withcache")._data
    # And make sure they both get the right data too.
    assert list(Dataset(prev).iterate_chain(None, "ix")) == list(range(ix))
    assert list(Dataset(job, "nocache").iterate_chain(None, "ix")) == [ix - 2]
    assert list(Dataset(job, "withcache").iterate_chain(
        None, "ix")) == list(range(ix - 2)) + [ix - 1]
Example #19
def test_rehash_with_empty_slices():
    dw = DatasetWriter(name='rehash with empty slices', hashlabel='a')
    dw.add('a', 'ascii')
    dw.add('b', 'ascii')
    w = dw.get_split_write()
    w('a', '42')
    w('42', 'b')
    source = dw.finish()
    hashfunc = typed_writer('int32').hash

    def verify_hashing(caption, want_values, **kw):
        ds = subjobs.build('dataset_type',
                           source=source,
                           column2type=dict(a='int32_10'),
                           caption=caption,
                           **kw).dataset()
        got_values = set()
        for sliceno in range(g.slices):
            for got in ds.iterate(sliceno):
                assert hashfunc(got[0]) % g.slices == sliceno
                assert got not in got_values
                got_values.add(got)
        assert want_values == got_values

    verify_hashing('with discard', {(42, 'b')}, filter_bad=True)
    # using defaults uses some different code paths
    verify_hashing('with default=0 (probably two slices)',
                   {(0, '42'), (42, 'b')}, defaults=dict(a='0'))
    verify_hashing('with default=42 (one slice)',
                   {(42, '42'), (42, 'b')}, defaults=dict(a='42'))
Example #20
def prepare(params):
    assert params.slices >= 2, "Hashing won't do anything with just one slice"
    dws = DotDict()
    for name, hashlabel in (
        ("unhashed_manual", None),        # manually interleaved
        ("unhashed_split", None),         # split_write interleaved
        ("up_checked", "up"),             # hashed on up using dw.hashcheck
        ("up_split", "up"),               # hashed on up using split_write
        ("down_checked", "down"),         # hashed on down using dw.hashcheck
        ("down_discarded", "down"),       # hashed on down using discarding writes
        ("down_discarded_list", "down"),  # hashed on down using discarding list writes
        ("down_discarded_dict", "down"),  # hashed on down using discarding dict writes
    ):
        dw = DatasetWriter(name=name, hashlabel=hashlabel)
        dw.add("up", "int32")
        dw.add("down", "int32")
        dws[name] = dw
    return dws
Example #21
def prepare():
    return DatasetWriter(columns={t: t for t in test_data.data})
def synthesis(prepare_res, slices, job):
    dw_passed, _ = prepare_res
    # Using set_slice on a dataset that was written in analysis is not
    # actually supported, but since it currently works (as long as that
    # particular slice wasn't written in analysis) let's test it.
    dw_passed.set_slice(0)
    dw_passed.write(**{k: v[0] for k, v in test_data.data.items()})
    dw_synthesis_split = DatasetWriter(name="synthesis_split", hashlabel="a")
    dw_synthesis_split.add("a", "int32")
    dw_synthesis_split.add("b", "unicode")
    dw_synthesis_split.get_split_write()(1, "a")
    dw_synthesis_split.get_split_write_list()([2, "b"])
    dw_synthesis_split.get_split_write_dict()({"a": 3, "b": "c"})
    dw_synthesis_manual = job.datasetwriter(name="synthesis_manual",
                                            columns={"sliceno": "int32"})
    dw_nonetest = job.datasetwriter(name="nonetest",
                                    columns={t: t
                                             for t in test_data.data})
    for sliceno in range(slices):
        dw_synthesis_manual.set_slice(sliceno)
        dw_synthesis_manual.write(sliceno)
        dw_nonetest.set_slice(sliceno)
        dw_nonetest.write(
            **{
                k: v[0] if k in test_data.not_none_capable else None
                for k, v in test_data.data.items()
            })
Example #23
def test_filter_bad_across_types():
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # all_good, *values
    # Make sure all those types (except bytes) can filter other lines,
    # and be filtered by other lines. And that several filtering values
    # are not a problem (line 11).
    data = [
        [True,  b'first',    b'1.1', '1',  '"a"',   '001', b'ett'],
        [True,  b'second',   b'2.2', '2',  '"b"',   '02',  b'tv\xc3\xa5'],
        [True,  b'third',    b'3.3', '3',  '["c"]', '3.0', b'tre'],
        [False, b'fourth',   b'4.4', '4',  '"d"',   '4.4', b'fyra'],        # number:int bad
        [False, b'fifth',    b'5.5', '-',  '"e"',   '5',   b'fem'],         # int32_10 bad
        [False, b'sixth',    b'6.b', '6',  '"f"',   '6',   b'sex'],         # float64 bad
        [False, b'seventh',  b'7.7', '7',  '{"g"}', '7',   b'sju'],         # json bad
        [False, b'eigth',    b'8.8', '8',  '"h"',   '8',   b'\xa5\xc3tta'], # unicode:utf-8 bad
        [True,  b'ninth',    b'9.9', '9',  '"i"',   '9',   b'nio'],
        [True,  b'tenth',    b'10',  '10', '"j"',   '10',  b'tio'],
        [False, b'eleventh', b'11a', '1-', '"k",',  '1,',  b'elva'],        # float64, int32_10 and number:int bad
        [True,  b'twelfth',  b'12',  '12', '"l"',   '12',  b'tolv'],
    ]
    want_bad = [tuple(l[1:]) for l in data if not l[0]]
    dw = DatasetWriter(name="filter bad across types",
                       columns=columns,
                       allow_missing_slices=True)
    cols_to_check = ['int32_10', 'bytes', 'json', 'unicode:utf-8']
    if PY3:
        # z so it sorts last.
        dw.add('zpickle', 'pickle')
        cols_to_check.append('zpickle')
        for ix in range(len(data)):
            data[ix].append({ix})
    dw.set_slice(0)
    want = []

    def add_want(ix):
        v = data[ix]
        want.append((
            int(v[3]),
            v[1],
            json.loads(v[4]),
            v[6].decode('utf-8'),
        ))
        if PY3:
            want[-1] = want[-1] + (v[7], )

    for ix, v in enumerate(data):
        if v[0]:
            add_want(ix)
        dw.write(*v[1:])
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(column2type={t: t for t in columns},
                         filter_bad=True,
                         defaults=defaults),
        )
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, cols_to_check))
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (
            want, got, typed_ds, source_ds,
            ' with defaults' if defaults else '')
        bad_ds = Dataset(jid, 'bad')
        got_bad = list(bad_ds.iterate(0, sorted(columns)))
        assert got_bad == want_bad, "Expected %r, got %r from %s (from %r%s)" % (
            want_bad, got_bad, bad_ds, source_ds,
            ' with defaults' if defaults else '')
        # make more lines "ok" for the second lap
        if not defaults:
            want_bad.pop(0)  # number:int
            want_bad.pop(1)  # float64
            want_bad.pop(1)  # json
        defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
        add_want(3)
        add_want(5)
        data[6][4] = '"replacement"'
        add_want(6)
        want.sort()  # adding them out of order, int32_10 sorts correctly.
Example #24
def synthesis(params):
    ds = write(data)
    for colname in data[0]:
        verify(params.slices, data, ds, hashlabel=colname)
    # ok, all the hashing stuff works out, let's test the chaining options.
    bonus_ds = write(bonus_data, name="bonus", previous=ds)
    # no chaining options - full chain
    verify(params.slices, data + bonus_data, bonus_ds, hashlabel="date")
    # just the bonus ds
    verify(params.slices, bonus_data, bonus_ds, hashlabel="date", length=1)
    # built as a chain
    verify(params.slices,
           data + bonus_data,
           bonus_ds,
           hashlabel="date",
           as_chain=True)
    # normal chaining
    a = verify(params.slices, data, ds, hashlabel="date")
    b = verify(params.slices,
               data + bonus_data,
               bonus_ds,
               hashlabel="date",
               previous=a)
    assert b.chain() == [a, b], "chain of %s is not [%s, %s] as expected" % (b, a, b)
    # as_chain sparseness
    dw = DatasetWriter(columns=columns, name="empty")
    dw.get_split_write()
    ds = verify(params.slices, [],
                dw.finish(),
                hashlabel="date",
                as_chain=True)
    assert len(ds.chain()) == 1, \
        ds + ": dataset_hashpart on empty dataset with as_chain=True did not produce a single dataset"
    # two populated slices with the same data, should end up in two datasets.
    dw = DatasetWriter(columns=columns, name="0 and 2")
    dw.set_slice(0)
    dw.write_dict(data[0])
    dw.set_slice(1)
    dw.set_slice(2)
    dw.write_dict(data[0])
    for s in range(3, params.slices):
        dw.set_slice(s)
    ds = verify(params.slices, [data[0]],
                dw.finish(),
                hashlabel="date",
                as_chain=True)
    got_slices = len(ds.chain())
    assert got_slices == 2, "%s (built with as_chain=True) has %d datasets in chain, expected 2." % (
        ds,
        got_slices,
    )
Example #25
def write(data, **kw):
    dw = DatasetWriter(columns=columns, **kw)
    w = dw.get_split_write_dict()
    for values in data:
        w(values)
    return dw.finish()
Example #26
def prepare(params):
    if options.trigger_column:
        assert options.sort_across_slices, 'trigger_column is meaningless without sort_across_slices'
        assert options.trigger_column in options.sort_columns, 'can only trigger on a column that is sorted on'
    d = datasets.source
    ds_list = d.chain(stop_ds={datasets.previous: 'source'})
    if options.sort_across_slices:
        columniter = partial(Dataset.iterate_list, None, datasets=ds_list)
        sort_idx, sort_extra = sort(columniter)
        total = len(sort_idx)
        per_slice = [total // params.slices] * params.slices
        extra = total % params.slices
        if extra:
            # spread the left over length over pseudo-randomly selected slices
            # (using the start of sort_idx to select slices).
            # this will always select the first slices if data is already sorted
            # but at least it's deterministic.
            selector = sorted(range(min(params.slices, total)),
                              key=sort_idx.__getitem__)
            for sliceno in selector[:extra]:
                per_slice[sliceno] += 1
        # Switch to tracking what line the slices end at
        slice_end = []
        end = 0
        for cnt in per_slice:
            end += cnt
            slice_end.append(end)
        if options.trigger_column:
            # append an extra, definitely-changed value last to simplify the loops below
            sort_extra.append(object())
            sort_idx.append(-1)

            # move slice_end counts around to only switch when trigger_column changes
            def fixup_fwd(cnt):
                trigger_v = sort_extra[sort_idx[cnt - 1]]
                while trigger_v == sort_extra[sort_idx[cnt]]:
                    cnt += 1
                return cnt

            def fixup_bck(cnt, min_cnt):
                trigger_v = sort_extra[sort_idx[cnt - 1]]
                while cnt > min_cnt and trigger_v == sort_extra[sort_idx[cnt]]:
                    cnt -= 1
                return cnt

            with status('Adjusting for trigger_column'):
                prev = 0
                for sliceno, cnt in enumerate(slice_end[:-1]):
                    if cnt:
                        cnt = max(cnt, prev)
                        choosen = fwd = fixup_fwd(cnt)
                        bck = fixup_bck(cnt, prev)
                        # This could be smarter
                        if (cnt - bck) <= (fwd - cnt):
                            choosen = bck
                        prev = slice_end[sliceno] = choosen
        # and now switch sort_idx to be per slice
        sort_idx = [
            sort_idx[start:end]
            for start, end in zip([0] + slice_end, slice_end)
        ]
        assert sum(len(part) for part in sort_idx) == total  # all rows used
        if not options.trigger_column:
            assert len(set(len(part) for part in sort_idx)) < 3  # only 1 or 2 lengths possible
    else:
        sort_idx = None
    if options.sort_across_slices:
        hashlabel = None
    else:
        hashlabel = d.hashlabel
    if len(ds_list) == 1:
        filename = d.filename
    else:
        filename = None
    dw = DatasetWriter(
        columns=d.columns,
        caption=params.caption,
        hashlabel=hashlabel,
        filename=filename,
        previous=datasets.previous,
    )
    return dw, ds_list, sort_idx
Example #27
def synthesis(job):
    dw = DatasetWriter(name='empty', columns={'v': 'ascii'})
    dw.get_split_write()
    empty_ds = dw.finish()
    assert empty_ds.min('non-existent column') is empty_ds.max('non-existent column') is None, \
        'Dataset.min/max() broken for non-existent columns'
    for typ, groups in tests.items():
        t_ds = subjobs.build('dataset_type',
                             column2type={'v': typ},
                             source=empty_ds).dataset()
        minmax = (t_ds.columns['v'].min, t_ds.columns['v'].max)

        if minmax != (None, None):
            raise Exception(
                'Typing empty dataset as %s did not give minmax == None, gave %r'
                % (
                    typ,
                    minmax,
                ))
        all_names = list(
            chain.from_iterable(groupdata[group].keys() for group in groups))
        # just 1 and 2, so we don't make way too many
        for num_groups in (1, 2):
            for names in combinations(all_names, num_groups):
                ds, mn, mx = make_source(names)
                t_ds = subjobs.build('dataset_type',
                                     column2type={'v': typ},
                                     source=ds).dataset()
                got_minmax = (t_ds.columns['v'].min, t_ds.columns['v'].max)
                want_minmax = (mn, mx)
                chk_minmax(
                    got_minmax, want_minmax,
                    'Typing %s as %s gave wrong minmax: expected %r, got %r (in %s)'
                    % (
                        ds,
                        typ,
                        want_minmax,
                        got_minmax,
                        t_ds,
                    ))
                chk_minmax(got_minmax, (t_ds.min('v'), t_ds.max('v')),
                           'Dataset.min/max() broken on ' + t_ds)
                # verify writing the same data normally also gives the correct result
                dw = DatasetWriter(name='rewrite ' + t_ds,
                                   columns=t_ds.columns)
                write = dw.get_split_write()
                for v in t_ds.iterate(None, 'v'):
                    write(v)
                re_ds = dw.finish()
                got_minmax = (re_ds.columns['v'].min, re_ds.columns['v'].max)
                want_minmax = (mn, mx)
                chk_minmax(
                    got_minmax, want_minmax,
                    'Rewriting %s gave the wrong minmax: expected %r, got %r (in %s)'
                    % (
                        t_ds,
                        want_minmax,
                        got_minmax,
                        re_ds,
                    ))
Example #28
def prepare(job, slices):
	# use 256 as a marker value, because that's not a possible char value (assuming 8 bit chars)
	lf_char = char2int("newline", 256)
	# separator uses lf_char or \n as the empty value, because memchr might mishandle 256.
	separator = char2int("separator", 10 if lf_char == 256 else lf_char)
	comment_char = char2int("comment", 256)
	if options.quotes == 'True':
		quote_char = 256
	elif options.quotes == 'False':
		quote_char = 257
	else:
		quote_char = char2int("quotes", 257, "True/False/empty")
	filename = os.path.join(job.source_directory, options.filename)
	orig_filename = filename
	assert 1 <= options.compression <= 9

	fds = [os.pipe() for _ in range(slices)]
	read_fds = [t[0] for t in fds]
	write_fds = [t[1] for t in fds]

	if options.labelsonfirstline:
		labels_rfd, labels_wfd = os.pipe()
	else:
		labels_wfd = -1
	success_rfd, success_wfd = os.pipe()
	status_rfd, status_wfd = os.pipe()

	p = Process(target=reader_process, name="reader", args=(slices, filename, write_fds, labels_wfd, success_wfd, status_wfd, comment_char, lf_char))
	p.start()
	for fd in write_fds:
		os.close(fd)
	os.close(success_wfd)
	os.close(status_wfd)

	if options.labelsonfirstline:
		os.close(labels_wfd)
		# re-use import logic
		out_fns = ["labels"]
		r_num = cstuff.mk_uint64(3)
		res = cstuff.backend.import_slice(*cstuff.bytesargs(labels_rfd, -1, -1, -1, out_fns, b"wb1", separator, r_num, quote_char, lf_char, 0))
		os.close(labels_rfd)
		assert res == 0, "c backend failed in label parsing"
		with typed_reader("bytes")("labels") as fh:
			labels_from_file = [lab.decode("utf-8", "backslashreplace") for lab in fh]
		os.unlink("labels")
	else:
		labels_from_file = None

	labels = options.labels or labels_from_file
	assert labels, "No labels"
	labels = [options.rename.get(x, x) for x in labels]
	assert '' not in labels, "Empty label for column %d" % (labels.index(''),)
	assert len(labels) == len(set(labels)), "Duplicate labels: %r" % (labels,)

	dw = DatasetWriter(
		columns={n: 'bytes' for n in labels if n not in options.discard},
		filename=orig_filename,
		caption='csvimport of ' + orig_filename,
		previous=datasets.previous,
		meta_only=True,
	)
	if options.lineno_label:
		dw.add(options.lineno_label, "int64")

	if options.allow_bad:
		bad_dw = DatasetWriter(
			name="bad",
			columns=dict(lineno="int64", data="bytes"),
			caption='bad lines from csvimport of ' + orig_filename,
			meta_only=True,
		)
	else:
		bad_dw = None

	if options.comment or options.skip_lines:
		skipped_dw = DatasetWriter(
			name="skipped",
			columns=dict(lineno="int64", data="bytes"),
			caption='skipped lines from csvimport of ' + orig_filename,
			meta_only=True,
		)
	else:
		skipped_dw = None

	return separator, quote_char, lf_char, filename, orig_filename, labels, dw, bad_dw, skipped_dw, read_fds, success_rfd, status_rfd,
def mk_dw(name, cols, **kw):
    dw = DatasetWriter(name=name, **kw)
    for colname in cols:
        dw.add(colname, "unicode")
    return dw
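A hypothetical use of this helper; the dataset name and column names are made up:
dw = mk_dw('people', ['first_name', 'last_name'])
write = dw.get_split_write()
write('Ada', 'Lovelace')
ds = dw.finish()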
def prepare(params):
    assert params.slices >= 3
    dw_3 = DatasetWriter(name="three", columns={"num": "int32"})
    dw_long = DatasetWriter(name="long", columns={"num": "int32"})
    dw_uneven = DatasetWriter(name="uneven", columns={"num": "int32"})
    return dw_3, dw_long, dw_uneven
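A hedged sketch of an analysis step that could populate these writers with deliberately different slice distributions; the row counts are made up, not taken from the original test:
def analysis(sliceno, prepare_res):
    dw_3, dw_long, dw_uneven = prepare_res
    if sliceno < 3:
        dw_3.write(sliceno)        # only the first three slices get a row
    dw_long.write(sliceno)         # every slice gets one row
    for _ in range(sliceno * 10):
        dw_uneven.write(sliceno)   # uneven line counts across slices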