Example #1
def prepare(params):
    d = datasets.source
    caption = options.caption % dict(caption=d.caption,
                                     hashlabel=options.hashlabel)
    prev_p = job_params(datasets.previous, default_empty=True)
    prev_source = prev_p.datasets.source
    if len(d.chain(stop_jobid=prev_source, length=options.length)) == 1:
        filename = d.filename
    else:
        filename = None
    dws = []
    previous = datasets.previous
    for sliceno in range(params.slices):
        if options.as_chain and sliceno == params.slices - 1:
            name = "default"
        else:
            name = str(sliceno)
        dw = DatasetWriter(
            caption="%s (slice %d)" % (caption, sliceno),
            hashlabel=options.hashlabel,
            filename=filename,
            previous=previous,
            name=name,
            for_single_slice=sliceno,
        )
        previous = (params.jobid, name)
        dws.append(dw)
    names = []
    for n, c in d.columns.items():
        # names has to be in the same order as the add calls
        # so the iterator returns the same order the writer expects.
        names.append(n)
        for dw in dws:
            dw.add(n, c.type)
    return dws, names, prev_source, caption, filename
Example #2
def prepare():
	from dataset import DatasetWriter
	# previous allows chaining this method, should you wish to do so
	dw = DatasetWriter(previous=datasets.previous)
	dw.add('a string', 'ascii')  # ascii is not "any string", use 'unicode' for that
	dw.add('large number', 'number') # number is any (real) number, a float or int of any size
	dw.add('small number', 'number')
	dw.add('small integer', 'int32') # int32 is a signed 32 bit number
	dw.add('gauss number', 'number')
	dw.add('gauss float', 'float64') # float64 is what many other languages call double
	return dw
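
The writer returned from a prepare() like this is typically filled in analysis, one process per slice. A minimal sketch of such a step, assuming the usual (sliceno, prepare_res) arguments and made-up values:

def analysis(sliceno, prepare_res):
	dw = prepare_res
	# write() takes one value per column, in the same order as the add() calls above
	dw.write('hello', 10 ** 100, 0.5, sliceno, 1.23, 4.56)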
Example #3
def prepare(params):
    columns = dict(
        bytes="bytes",
        float="float64",
        int="int64",
        json="json",
        unicode="unicode",
    )
    a = DatasetWriter(name="a", columns=columns)
    b = DatasetWriter(name="b", columns=columns, previous=(params.jobid, "a"))
    c = DatasetWriter(name="c", columns=columns)
    return a, b, c
Example #4
def prepare():
    d = datasets.source
    columns = {}
    for colname, coltype in options.column2type.iteritems():
        assert d.columns[colname].type in ('bytes', 'ascii'), colname
        coltype = coltype.split(':', 1)[0]
        renamed = options.rename.get(colname, colname)
        columns[renamed] = dataset_typing.typerename.get(coltype, coltype)
    if options.filter_bad or options.discard_untyped:
        assert options.discard_untyped is not False, "Can't keep untyped when filtering bad"
        parent = None
    else:
        parent = datasets.source
    return DatasetWriter(
        columns=columns,
        caption=options.caption,
        hashlabel=options.rename.get(d.hashlabel, d.hashlabel),
        hashlabel_override=True,
        parent=parent,
        previous=datasets.previous,
        meta_only=True,
    )
Example #5
def prepare(params):
    assert params.slices >= test_data.value_cnt
    dw_default = DatasetWriter()
    dw_default.add("a", "number")
    dw_default.add("b", "ascii")
    DatasetWriter(name="named", columns={"c": "bool", "d": "date"})
    dw_passed = DatasetWriter(name="passed", columns={t: t for t in test_data.data})
    return dw_passed, 42
Example #6
def synthesis(prepare_res, params):
    if not options.as_chain:
        # If we don't want a chain we abuse our knowledge of dataset internals
        # to avoid recompressing. Don't do this stuff yourself.
        dws, names, prev_source, caption, filename = prepare_res
        merged_dw = DatasetWriter(
            caption=caption,
            hashlabel=options.hashlabel,
            filename=filename,
            previous=datasets.previous,
            meta_only=True,
            columns=datasets.source.columns,
        )
        for sliceno in range(params.slices):
            merged_dw.set_lines(sliceno, sum(dw._lens[sliceno] for dw in dws))
            for dwno, dw in enumerate(dws):
                merged_dw.set_minmax((sliceno, dwno), dw._minmax[sliceno])
            for n in names:
                fn = merged_dw.column_filename(n, sliceno=sliceno)
                with open(fn, "wb") as out_fh:
                    for dw in dws:
                        fn = dw.column_filename(n, sliceno=sliceno)
                        with open(fn, "rb") as in_fh:
                            copyfileobj(in_fh, out_fh)
        for dw in dws:
            dw.discard()
Example #7
def synthesis(jobid):
    manual_chain = [Dataset(jobids.selfchain, name) for name in "abcdefgh"]
    manual_abf = [manual_chain[0], manual_chain[1], manual_chain[5]]
    # build a local abf chain
    prev = None
    for ix, ds in enumerate(manual_abf):
        name = "abf%d" % (ix, )
        ds.link_to_here(name, override_previous=prev)
        prev = (jobid, name)
    manual_abf_data = list(Dataset.iterate_list(None, None, manual_abf))
    local_abf_data = list(Dataset(jobid, "abf2").iterate_chain(None, None))
    assert manual_abf_data == local_abf_data
    # disconnect h, verify there is no chain
    manual_chain[-1].link_to_here("alone", override_previous=None)
    assert len(Dataset(jobid, "alone").chain()) == 1
    # check that the original chain is unhurt
    assert manual_chain == manual_chain[-1].chain()

    # So far so good, now make a chain long enough to have a cache.
    prev = None
    ix = 0
    going = True
    while going:
        if prev and "cache" in Dataset(prev)._data:
            going = False
        name = "longchain%d" % (ix, )
        dw = DatasetWriter(name=name, previous=prev)
        dw.add("ix", "number")
        dw.get_split_write()(ix)
        dw.finish()
        prev = (jobid, name)
        ix += 1
    # we now have a chain that goes one past the first cache point
    full_chain = Dataset(prev).chain()
    assert "cache" in full_chain[
        -2]._data  # just to check the above logic is correct
    assert "cache" not in full_chain[-1]._data  # just to be sure..
    full_chain[-2].link_to_here("nocache", override_previous=None)
    full_chain[-1].link_to_here("withcache", override_previous=full_chain[-3])
    assert "cache" not in Dataset(jobid, "nocache")._data
    assert "cache" in Dataset(jobid, "withcache")._data
    # And make sure they both get the right data too.
    assert list(Dataset(prev).iterate_chain(None, "ix")) == list(range(ix))
    assert list(Dataset(jobid, "nocache").iterate_chain(None, "ix")) == [ix - 2]
    assert list(Dataset(jobid, "withcache").iterate_chain(None, "ix")) == list(range(ix - 2)) + [ix - 1]
Example #8
def real_prepare(d, previous, options):
	column_types = {n: c.type for n, c in d.columns.items()}
	column_sizes = []
	column_names = list(column_types)
	column_names.remove(options.date_column)
	column_names.insert(0, options.date_column)
	minmax_typeidx = []
	for colname in column_names:
		typ = column_types[colname]
		column_sizes.append(dataset_typing.typesizes[typ])
		minmax_typeidx.append(minmax_type2idx.get(typ, -1))
	minmax_typeidx = ffi.new('int []', minmax_typeidx)
	kw = dict(
		columns=column_types,
		hashlabel=d.hashlabel,
		caption=options.caption,
		previous=previous,
		meta_only=True,
	)
	dw = DatasetWriter(**kw)
	dw_spill = DatasetWriter(name='SPILL', **kw)
	return dw, dw_spill, column_names, column_sizes, column_types, minmax_typeidx
Example #9
def prepare(params):
    d = datasets.source
    ds_list = d.chain(stop_ds={datasets.previous: 'source'})
    if options.sort_across_slices:
        columniter = partial(Dataset.iterate_list, None, datasets=ds_list)
        sort_idx = sort(columniter)
        total = len(sort_idx)
        per_slice = [total // params.slices] * params.slices
        extra = total % params.slices
        if extra:
            # spread the left over length over pseudo-randomly selected slices
            # (using the start of sort_idx to select slices).
            # this will always select the first slices if data is already sorted
            # but at least it's deterministic.
            selector = sorted(range(min(params.slices, total)),
                              key=sort_idx.__getitem__)
            for sliceno in selector[:extra]:
                per_slice[sliceno] += 1
        # change per_slice to be the actual sort indexes
        start = 0
        for ix, num in enumerate(per_slice):
            end = start + num
            per_slice[ix] = sort_idx[start:end]
            start = end
        assert sum(len(part) for part in per_slice) == total  # all rows used
        assert len(set(len(part) for part in per_slice)) < 3  # only 1 or 2 lengths possible
        sort_idx = per_slice
    else:
        sort_idx = None
    if options.sort_across_slices:
        hashlabel = None
    else:
        hashlabel = d.hashlabel
    if len(ds_list) == 1:
        filename = d.filename
    else:
        filename = None
    dw = DatasetWriter(
        columns=d.columns,
        caption=params.caption,
        hashlabel=hashlabel,
        filename=filename,
    )
    return dw, ds_list, sort_idx
Example #10
def prepare(params):
    dws = {}
    prev = None
    for name in "abcdefgh":
        dw = DatasetWriter(name=name, previous=prev)
        dw.add("ds", "ascii")
        dw.add("num", "number")
        dws[name] = dw
        prev = "%s/%s" % (
            params.jobid,
            name,
        )
    return dws
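
A minimal synthesis sketch for filling the chained writers built above, using get_split_write() (shown in later examples) to spread rows over the slices; the values are made up:

def synthesis(prepare_res):
    for name, dw in sorted(prepare_res.items()):
        write = dw.get_split_write()
        # one row per writer: "ds" is ascii, "num" is number
        write(name, ord(name))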
Example #11
def prepare(params):
	d = datasets.source
	jobs = d.chain(stop_jobid={datasets.previous: 'source'})
	if options.sort_across_slices:
		columniter = partial(Dataset.iterate_list, None, jobids=jobs)
		sort_idx = sort(columniter)
	else:
		sort_idx = None
	if options.sort_across_slices:
		hashlabel = None
	else:
		hashlabel = d.hashlabel
	if len(jobs) == 1:
		filename = d.filename
	else:
		filename = None
	dw = DatasetWriter(
		columns=d.columns,
		caption=params.caption,
		hashlabel=hashlabel,
		filename=filename,
	)
	return dw, jobs, sort_idx
Example #12
def prepare(params):
    assert params.slices >= 2, "Hashing won't do anything with just one slice"
    dws = DotDict()
    for name, hashlabel in (
        ("unhashed_manual", None),  # manually interlaved
        ("unhashed_split", None),  # split_write interlaved
        ("up_checked", "up"),  # hashed on up using dw.hashcheck
        ("up_split", "up"),  # hashed on up using split_write
        ("down_checked", "down"),  # hashed on down using dw.hashcheck
        ("down_discarded", "down"),  # hashed on down using discarding writes
        ("down_discarded_list",
         "down"),  # hashed on down using discarding list writes
        ("down_discarded_dict",
         "down"),  # hashed on down using discarding dict writes
    ):
        dw = DatasetWriter(name=name, hashlabel=hashlabel)
        dw.add("up", "int32")
        dw.add("down", "int32")
        dws[name] = dw
    return dws
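
A minimal sketch (not from the original test) of filling one of the hashed writers above from synthesis; with a hashlabel set, get_split_write() routes each row to the slice its "up" value hashes to:

def synthesis(prepare_res):
    write = prepare_res.up_split.get_split_write()
    for up in range(100):
        write(up, -up)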
Example #13
def synthesis(params):
	dw = DatasetWriter(name="parent")
	in_parent = [ # list because order matters
		"-",      # becomes _ because everything must be a valid python identifier.
		"a b",    # becomes a_b because everything must be a valid python identifier.
		"42",     # becomes _42 because everything must be a valid python identifier.
		"print",  # becomes print_ because print is a keyword (in py2).
		"print@", # becomes print__ because print_ is taken.
		"None",   # becomes None_ because None is a keyword (in py3).
	]
	for colname in in_parent:
		dw.add(colname, "unicode")
	w = dw.get_split_write()
	w(_="- 1", a_b="a b 1", _42="42 1", print_="print 1", None_="None 1", print__="Will be overwritten 1")
	w(_="- 2", a_b="a b 2", _42="42 2", print_="print 2", None_="None 2", print__="Will be overwritten 2")
	parent = dw.finish()
	dw = DatasetWriter(name="child", parent=parent)
	in_child = [ # order still matters
		"print_*", # becomes print___ because print__ is taken.
		"print_",  # becomes print____ because all shorter are taken.
		"normal",  # no collision.
		"Normal",  # no collision.
		"print@",  # re-uses print__ from the parent dataset.
	]
	for colname in in_child:
		dw.add(colname, "unicode")
	w = dw.get_split_write()
	w(print__="print@ 1", print___="print_* 1", print____="print_ 1", normal="normal 1", Normal="Normal 1")
	w(print__="print@ 2", print___="print_* 2", print____="print_ 2", normal="normal 2", Normal="Normal 2")
	child = dw.finish()
	for colname in in_parent + in_child:
		data = set(child.iterate(None, colname))
		assert data == {colname + " 1", colname + " 2"}, "Bad data for %s: %r" % (colname, data)
Example #14
def prepare():
    dw = DatasetWriter()
    dw.add("str", "ascii")
    dw.add("num", "number")
    return dw
Example #15
def prepare():
    dw = DatasetWriter(parent=datasets.source)
    dw.add('prod', 'number')  # works for float as well as int
    return dw
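
With parent set, the new dataset reuses the source's columns, so only the added 'prod' column has to be written, one value per source row in each slice. A sketch of a matching analysis step, where the source column names 'a' and 'b' are assumptions:

def analysis(sliceno, prepare_res):
    dw = prepare_res
    for a, b in datasets.source.iterate(sliceno, ('a', 'b')):
        dw.write(a * b)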
Example #16
def prepare(SOURCE_DIRECTORY):
    separator = options.separator
    assert len(separator) == 1
    filename = os.path.join(SOURCE_DIRECTORY, options.filename)
    orig_filename = filename

    if filename.lower().endswith('.zip'):
        from zipfile import ZipFile
        filename = 'extracted'
        with ZipFile(orig_filename, 'r') as z:
            infos = z.infolist()
            assert len(infos) == 1, 'Only ZIP files with exactly one member are supported.'
            # Wouldn't it be nice if ZipFile.extract let me choose the filename?
            with open(filename, 'wb') as ofh:
                zfh = z.open(infos[0])
                while True:
                    data = zfh.read(1024 * 1024)
                    if not data:
                        break
                    ofh.write(data)

    if options.labelsonfirstline:
        with gzutil.GzBytesLines(filename, strip_bom=True) as fh:
            labels_str = next(fh).decode('ascii', 'replace').encode(
                'ascii', 'replace')  # garbage -> '?'
        if options.quote_support:
            labels = []
            sep = options.separator
            while labels_str is not None:
                if labels_str.startswith(('"', "'")):
                    q = labels_str[0]
                    pos = 1
                    while pos + 1 < len(labels_str):
                        pos = labels_str.find(q, pos)
                        if pos == -1:  # all is lost
                            pos = len(labels_str) - 1
                        if pos + 1 == len(labels_str):  # eol
                            break
                        if labels_str[pos + 1] == sep:
                            break
                        # we'll just assume it was a quote, because what else to do?
                        labels_str = labels_str[:pos] + labels_str[pos + 1:]
                        pos += 1
                    labels.append(labels_str[1:pos])
                    if len(labels_str) > pos + 1:
                        labels_str = labels_str[pos + 2:]
                    else:
                        labels_str = None
                else:
                    if sep in labels_str:
                        field, labels_str = labels_str.split(sep, 1)
                    else:
                        field, labels_str = labels_str, None
                    labels.append(field)
        else:
            labels = labels_str.split(options.separator)
    labels = options.labels or labels  # only from file if not specified in options
    assert labels, "No labels"
    labels = [options.rename.get(x, x) for x in labels]
    assert '' not in labels, "Empty label for column %d" % (labels.index(''), )
    assert len(labels) == len(set(labels)), "Duplicate labels: %r" % (labels, )

    dw = DatasetWriter(
        columns={n: 'bytes' for n in labels},
        filename=orig_filename,
        hashlabel=options.hashlabel,
        caption='csvimport of ' + orig_filename,
        previous=datasets.previous,
        meta_only=True,
    )

    return separator, filename, orig_filename, labels, dw,
Example #17
def analysis_lap(sliceno, badmap_fh, first_lap):
    known_line_count = 0
    badmap_size = 0
    badmap_fd = -1
    res_bad_count = {}
    res_default_count = {}
    res_minmax = {}
    link_candidates = []
    if first_lap:
        record_bad = options.filter_bad
        skip_bad = 0
    else:
        record_bad = 0
        skip_bad = options.filter_bad
    minmax_fn = 'minmax%d' % (sliceno, )
    dw = DatasetWriter()
    for colname, coltype in iteritems(options.column2type):
        out_fn = dw.column_filename(options.rename.get(colname, colname))
        fmt = fmt_b = None
        if coltype in dataset_typing.convfuncs:
            shorttype = coltype
            _, cfunc, pyfunc = dataset_typing.convfuncs[coltype]
        else:
            shorttype, fmt = coltype.split(':', 1)
            _, cfunc, pyfunc = dataset_typing.convfuncs[shorttype + ':*']
        if cfunc:
            cfunc = shorttype.replace(':', '_')
        if pyfunc:
            tmp = pyfunc(coltype)
            if callable(tmp):
                pyfunc = tmp
                cfunc = None
            else:
                pyfunc = None
                cfunc, fmt, fmt_b = tmp
        if coltype == 'number':
            cfunc = 'number'
        elif coltype == 'number:int':
            coltype = 'number'
            cfunc = 'number'
            fmt = "int"
        assert cfunc or pyfunc, coltype + " didn't have cfunc or pyfunc"
        coltype = shorttype
        d = datasets.source
        assert d.columns[colname].type in byteslike_types, colname
        if options.filter_bad:
            line_count = d.lines[sliceno]
            if known_line_count:
                assert line_count == known_line_count, (colname, line_count,
                                                        known_line_count)
            else:
                known_line_count = line_count
                pagesize = getpagesize()
                badmap_size = (line_count // 8 // pagesize + 1) * pagesize
                badmap_fh.truncate(badmap_size)
                badmap_fd = badmap_fh.fileno()
        if d.columns[colname].backing_type.startswith('_v2_'):
            backing_format = 2
        else:
            backing_format = 3
        in_fn = d.column_filename(colname, sliceno)
        if d.columns[colname].offsets:
            offset = d.columns[colname].offsets[sliceno]
            max_count = d.lines[sliceno]
        else:
            offset = 0
            max_count = -1
        if cfunc:
            default_value = options.defaults.get(colname, ffi.NULL)
            default_len = 0
            if default_value is None:
                default_value = ffi.NULL
                default_value_is_None = True
            else:
                default_value_is_None = False
                if default_value != ffi.NULL:
                    if isinstance(default_value, unicode):
                        default_value = default_value.encode("utf-8")
                    default_len = len(default_value)
            bad_count = ffi.new('uint64_t [1]', [0])
            default_count = ffi.new('uint64_t [1]', [0])
            c = getattr(backend, 'convert_column_' + cfunc)
            res = c(*bytesargs(in_fn, out_fn, minmax_fn, default_value,
                               default_len, default_value_is_None, fmt, fmt_b,
                               record_bad, skip_bad, badmap_fd, badmap_size,
                               bad_count, default_count, offset, max_count,
                               backing_format))
            assert not res, 'Failed to convert ' + colname
            res_bad_count[colname] = bad_count[0]
            res_default_count[colname] = default_count[0]
            coltype = coltype.split(':', 1)[0]
            with type2iter[dataset_typing.typerename.get(
                    coltype, coltype)](minmax_fn) as it:
                res_minmax[colname] = list(it)
            unlink(minmax_fn)
        else:
            # python func
            nodefault = object()
            if colname in options.defaults:
                default_value = options.defaults[colname]
                if default_value is not None:
                    if isinstance(default_value, unicode):
                        default_value = default_value.encode('utf-8')
                    default_value = pyfunc(default_value)
            else:
                default_value = nodefault
            if options.filter_bad:
                badmap = mmap(badmap_fd, badmap_size)
                if PY2:
                    badmap = IntegerBytesWrapper(badmap)
            bad_count = 0
            default_count = 0
            dont_minmax_types = {'bytes', 'ascii', 'unicode', 'json'}
            real_coltype = dataset_typing.typerename.get(coltype, coltype)
            do_minmax = real_coltype not in dont_minmax_types
            with typed_writer(real_coltype)(out_fn) as fh:
                col_min = col_max = None
                for ix, v in enumerate(
                        d._column_iterator(sliceno,
                                           colname,
                                           _type='bytes' if backing_format == 3
                                           else '_v2_bytes')):
                    if skip_bad:
                        if badmap[ix // 8] & (1 << (ix % 8)):
                            bad_count += 1
                            continue
                    try:
                        v = pyfunc(v)
                    except ValueError:
                        if default_value is not nodefault:
                            v = default_value
                            default_count += 1
                        elif record_bad:
                            bad_count += 1
                            bv = badmap[ix // 8]
                            badmap[ix // 8] = bv | (1 << (ix % 8))
                            continue
                        else:
                            raise Exception("Invalid value %r with no default in %s" % (v, colname))
                    if do_minmax and not isinstance(v, NoneType):
                        if col_min is None:
                            col_min = col_max = v
                        if v < col_min: col_min = v
                        if v > col_max: col_max = v
                    fh.write(v)
            if options.filter_bad:
                badmap.close()
            res_bad_count[colname] = bad_count
            res_default_count[colname] = default_count
            res_minmax[colname] = [col_min, col_max]
    return res_bad_count, res_default_count, res_minmax, link_candidates
Example #18
def write(data, **kw):
    dw = DatasetWriter(columns=columns, **kw)
    w = dw.get_split_write_dict()
    for values in data:
        w(values)
    return dw.finish()
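
A hypothetical call site for the helper above, assuming 'columns' is a module-level dict of column names to types and that each element of data is a matching dict of values:

ds_a = write(rows_a, name="a")
ds_b = write(rows_b, name="b", previous=ds_a)  # chain b after a via previous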
Example #19
def analysis(sliceno, prepare_res):
    dw_default = DatasetWriter()
    dw_named = DatasetWriter(name="named")
    dw_passed, num = prepare_res
    dw_default.write(a=sliceno, b="a")
    dw_default.write_list([num, str(sliceno)])
    dw_named.write(True, date(1536, 12, min(sliceno + 1, 31)))
    dw_named.write_dict({"c": False, "d": date(2236, 5, min(sliceno + 1, 31))})
    # slice 0 is written in synthesis
    if 0 < sliceno < test_data.value_cnt:
        dw_passed.write_dict({k: v[sliceno] for k, v in test_data.data.items()})
Example #20
def synthesis(prepare_res, params):
    dw_passed, _ = prepare_res
    # Using set_slice on a dataset that was written in analysis is not
    # actually supported, but since it currently works (as long as that
    # particular slice wasn't written in analysis) let's test it.
    dw_passed.set_slice(0)
    dw_passed.write(**{k: v[0] for k, v in test_data.data.items()})
    dw_synthesis_split = DatasetWriter(name="synthesis_split", hashlabel="a")
    dw_synthesis_split.add("a", "int32")
    dw_synthesis_split.add("b", "unicode")
    dw_synthesis_split.get_split_write()(1, "a")
    dw_synthesis_split.get_split_write_list()([2, "b"])
    dw_synthesis_split.get_split_write_dict()({"a": 3, "b": "c"})
    dw_synthesis_manual = DatasetWriter(name="synthesis_manual", columns={"sliceno": "int32"})
    dw_nonetest = DatasetWriter(name="nonetest", columns={t: t for t in test_data.data})
    for sliceno in range(params.slices):
        dw_synthesis_manual.set_slice(sliceno)
        dw_synthesis_manual.write(sliceno)
        dw_nonetest.set_slice(sliceno)
        dw_nonetest.write(
            **{
                k: v[0] if k in test_data.not_none_capable else None
                for k, v in test_data.data.items()
            })
Example #21
def _verify(name, types, data, coltype, want, default, want_fail, kw):
    if callable(want):
        check = want
    else:

        def check(got, fromstr, filtered=False):
            want1 = want if isinstance(want, list) else want[typ]
            if filtered:
                want1 = want1[::2]
            assert got == want1, 'Expected %r, got %r from %s.' % (
                want1,
                got,
                fromstr,
            )

    dw = DatasetWriter(name=name, columns={'data': coltype, 'extra': 'bytes'})
    dw.set_slice(0)
    for ix, v in enumerate(data):
        dw.write(v, b'1' if ix % 2 == 0 else b'skip')
    for sliceno in range(1, g.SLICES):
        dw.set_slice(sliceno)
    bytes_ds = dw.finish()
    for typ in types:
        opts = dict(column2type=dict(data=typ))
        opts.update(kw)
        if default is not no_default:
            opts['defaults'] = {'data': default}
        try:
            jid = subjobs.build('dataset_type',
                                datasets=dict(source=bytes_ds),
                                options=opts)
        except JobError:
            if want_fail:
                continue
            raise Exception('Typing %r as %s failed.' % (bytes_ds, typ))
        assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (
            bytes_ds, typ, jid)
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, 'data'))
        check(got, '%s (typed as %s from %r)' % (typed_ds, typ, bytes_ds))
        if 'filter_bad' not in opts and not callable(want):
            opts['filter_bad'] = True
            opts['column2type']['extra'] = 'int32_10'
            jid = subjobs.build('dataset_type',
                                datasets=dict(source=bytes_ds),
                                options=opts)
            typed_ds = Dataset(jid)
            got = list(typed_ds.iterate(0, 'data'))
            check(
                got,
                '%s (typed as %s from %r with every other line skipped from filter_bad)' % (typed_ds, typ, bytes_ds),
                True)
        used_type(typ)
Example #22
def test_filter_bad_across_types():
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # all_good, *values
    # Make sure all those types (except bytes) can filter other lines,
    # and be filtered by other lines. And that several filtering values
    # is not a problem (line 11).
    data = [
        (True, b'first', b'1.1', '1', '"a"', '001', b'ett'),
        (True, b'second', b'2.2', '2', '"b"', '02', b'tv\xc3\xa5'),
        (True, b'third', b'3.3', '3', '["c"]', '3.0', b'tre'),
        (False, b'fourth', b'4.4', '4', '"d"', '4.4', b'fyra'),  # number:int bad
        (False, b'fifth', b'5.5', '-', '"e"', '5', b'fem'),  # int32_10 bad
        (False, b'sixth', b'6.b', '6', '"f"', '6', b'sex'),  # float64 bad
        [False, b'seventh', b'7.7', '7', '{"g"}', '7', b'sju'],  # json bad
        (False, b'eigth', b'8.8', '8', '"h"', '8', b'\xa5\xc3tta'),  # unicode:utf-8 bad
        (True, b'ninth', b'9.9', '9', '"i"', '9', b'nio'),
        (True, b'tenth', b'10', '10', '"j"', '10', b'tio'),
        (False, b'eleventh', b'11a', '1-', '"k",', '1,', b'elva'),  # float64, int32_10 and number:int bad
        (True, b'twelfth', b'12', '12', '"l"', '12', b'tolv'),
    ]
    dw = DatasetWriter(name="filter bad across types", columns=columns)
    dw.set_slice(0)
    want = []

    def add_want(v):
        want.append((
            int(v[3]),
            v[1],
            json.loads(v[4]),
            v[6].decode('utf-8'),
        ))

    for v in data:
        if v[0]:
            add_want(v)
        dw.write(*v[1:])
    for sliceno in range(1, g.SLICES):
        dw.set_slice(sliceno)
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(column2type={t: t
                                      for t in columns},
                         filter_bad=True,
                         defaults=defaults),
        )
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (
            want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
        # make more lines "ok" for the second lap
        defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
        add_want(data[3])
        add_want(data[5])
        data[6][4] = '"replacement"'
        add_want(data[6])
        want.sort()  # adding them out of order, int32_10 sorts correctly.
Example #23
def analysis_lap(sliceno, badmap_fh, first_lap):
    known_line_count = 0
    badmap_size = 0
    badmap_fd = -1
    res_bad_count = {}
    res_default_count = {}
    res_minmax = {}
    link_candidates = []
    if first_lap:
        record_bad = options.filter_bad
        skip_bad = 0
    else:
        record_bad = 0
        skip_bad = options.filter_bad
    minmax_fn = 'minmax%d' % (sliceno, )
    dw = DatasetWriter()
    for colname, coltype in options.column2type.iteritems():
        out_fn = dw.column_filename(options.rename.get(
            colname, colname)).encode('ascii')
        if ':' in coltype and not coltype.startswith('number:'):
            coltype, fmt = coltype.split(':', 1)
            _, cfunc, pyfunc = dataset_typing.convfuncs[coltype + ':*']
            if '%f' in fmt:
                # needs to fall back to python version
                cfunc = None
            if not cfunc:
                pyfunc = pyfunc(coltype, fmt)
        else:
            _, cfunc, pyfunc = dataset_typing.convfuncs[coltype]
            fmt = ffi.NULL
        d = datasets.source
        assert d.columns[colname].type in ('bytes', 'ascii'), colname
        if options.filter_bad:
            line_count = d.lines[sliceno]
            if known_line_count:
                assert line_count == known_line_count, (colname, line_count,
                                                        known_line_count)
            else:
                known_line_count = line_count
                pagesize = getpagesize()
                badmap_size = (line_count // 8 // pagesize + 1) * pagesize
                badmap_fh.truncate(badmap_size)
                badmap_fd = badmap_fh.fileno()
        if d.columns[colname].backing_type.startswith('_v2_'):
            backing_format = 2
        else:
            backing_format = 3
        in_fn = d.column_filename(colname, sliceno).encode('ascii')
        if d.columns[colname].offsets:
            offset = d.columns[colname].offsets[sliceno]
            max_count = d.lines[sliceno]
        else:
            offset = 0
            max_count = -1
        if coltype == 'number':
            cfunc = True
        if coltype == 'number:int':
            coltype = 'number'
            cfunc = True
            fmt = "int"
        if cfunc:
            default_value = options.defaults.get(colname, ffi.NULL)
            if default_value is None:
                default_value = ffi.NULL
                default_value_is_None = True
            else:
                default_value_is_None = False
            bad_count = ffi.new('uint64_t [1]', [0])
            default_count = ffi.new('uint64_t [1]', [0])
            c = getattr(backend, 'convert_column_' + coltype)
            res = c(in_fn, out_fn, minmax_fn, default_value,
                    default_value_is_None, fmt, record_bad, skip_bad,
                    badmap_fd, badmap_size, bad_count, default_count, offset,
                    max_count, backing_format)
            assert not res, 'Failed to convert ' + colname
            res_bad_count[colname] = bad_count[0]
            res_default_count[colname] = default_count[0]
            with type2iter[dataset_typing.typerename.get(
                    coltype, coltype)](minmax_fn) as it:
                res_minmax[colname] = list(it)
            unlink(minmax_fn)
        elif pyfunc is str:
            # We skip it the first time around, and link it from
            # the source dataset if there were no bad lines.
            # (That happens at the end of analysis.)
            # We can't do that if the file is not slice-specific though.
            # And we also can't do it if the column is in the wrong (old) format.
            if skip_bad or '%s' not in d.column_filename(
                    colname, '%s') or backing_format != 3:
                res = backend.filter_strings(in_fn, out_fn, badmap_fd,
                                             badmap_size, offset, max_count,
                                             backing_format)
                assert not res, 'Failed to convert ' + colname
            else:
                link_candidates.append((in_fn, out_fn))
            res_bad_count[colname] = 0
            res_default_count[colname] = 0
        elif pyfunc is str.strip:
            res = backend.filter_stringstrip(in_fn, out_fn, badmap_fd,
                                             badmap_size, offset, max_count,
                                             backing_format)
            assert not res, 'Failed to convert ' + colname
            res_bad_count[colname] = 0
            res_default_count[colname] = 0
        else:
            # python func
            nodefault = object()
            if colname in options.defaults:
                if options.defaults[colname] is None:
                    default_value = None
                else:
                    default_value = pyfunc(options.defaults[colname])
            else:
                default_value = nodefault
            if options.filter_bad:
                badmap = mmap(badmap_fd, badmap_size)
            bad_count = 0
            default_count = 0
            with typed_writer(dataset_typing.typerename.get(
                    coltype, coltype))(out_fn) as fh:
                col_min = col_max = None
                for ix, v in enumerate(d.iterate(sliceno, colname)):
                    if skip_bad:
                        if ord(badmap[ix // 8]) & (1 << (ix % 8)):
                            bad_count += 1
                            continue
                    try:
                        v = pyfunc(v)
                    except ValueError:
                        if default_value is not nodefault:
                            v = default_value
                            default_count += 1
                        elif record_bad:
                            bad_count += 1
                            bv = ord(badmap[ix // 8])
                            badmap[ix // 8] = chr(bv | (1 << (ix % 8)))
                            continue
                        else:
                            raise Exception("Invalid value %r with no default in %s" % (v, colname))
                    if not isinstance(v, (NoneType, str, unicode)):
                        if col_min is None:
                            col_min = col_max = v
                        if v < col_min: col_min = v
                        if v > col_max: col_max = v
                    fh.write(v)
            if options.filter_bad:
                badmap.close()
            res_bad_count[colname] = bad_count
            res_default_count[colname] = default_count
            res_minmax[colname] = [col_min, col_max]
    return res_bad_count, res_default_count, res_minmax, link_candidates
Example #24
def prepare():
    dw = DatasetWriter(previous=datasets.previous)
    dw.add('rflt', 'float64')
    dw.add('rint', 'int64')
    return dw
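
A possible analysis step for the writer above, filling both columns with random values; the row count is arbitrary:

from random import random, randint

def analysis(sliceno, prepare_res):
    dw = prepare_res
    for _ in range(1000):
        dw.write(random(), randint(0, 10 ** 9))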
Example #25
def prepare():
	return DatasetWriter(columns={t: t for t in test_data.data})
Example #26
def do_one(params, name, data):
	dw = DatasetWriter(name=name, columns=columns)
	dw.set_slice(0)
	for v in data:
		if v is None:
			d = dict(
				ascii_new=None,
				ascii_old=None,
				bytes_new=None,
				bytes_old=None,
				unicode_new=None,
				unicode_old=None,
			)
		else:
			d = dict(
				ascii_new=v,
				ascii_old=v,
				bytes_new=uni(v).encode("ascii"),
				bytes_old=uni(v).encode("ascii"),
				unicode_new=uni(v),
				unicode_old=uni(v),
			)
		dw.write_dict(d)
	# We don't really want the other slices, but write one thing to
	# each, to make sure it doesn't show up in slice 0.
	# (Small slice merging will put it in the same file, so this is
	# a real risk.)
	for sliceno in range(1, params.slices):
		dw.set_slice(sliceno)
		dw.write_dict(d)
	dw.finish()

	# verify we got what we asked for
	me_ds = Dataset(params.jobid, name)
	for colname, coltype in columns.items():
		col = me_ds.columns[colname]
		assert col.type == coltype.split("_")[-1], colname
		assert col.backing_type == coltype, colname
		for want, got in zip(data, me_ds.iterate(0, colname)):
			if want is not None:
				if PY2 and "unicode" in coltype:
					want = uni(want)
				if PY3 and "bytes" in coltype:
					want = want.encode("ascii")
			assert want == got, "%s in %s did not contain the expected value. Wanted %r but got %r." % (colname, me_ds, want, got)

	# check that both types of bytes filter correctly through typing
	jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
		column2type=dict(
			ascii_new="bytes",
			ascii_old="number", # fails on the string, so that gets filtered out everywhere
			bytes_new="bytes",
			bytes_old="bytes",
		),
		filter_bad=True,
	))
	ds = Dataset(jid)
	# verify the number first
	data_it = iter(raw_data)
	next(data_it) # skip the filtered out string
	for got in ds.iterate(0, "ascii_old"):
		want = next(data_it)
		if want is None:
			# Becomes 0 because the typer (unfortunately) sees it as an empty string
			want = 0
		assert want == got, "ascii_old in %s did not type correctly as number. Wanted %r but got %r." % (ds, want, got)
	# now verify all the bytes ones are ok, no longer containing the string.
	for colname in ("ascii_new", "bytes_new", "bytes_old",):
		data_it = iter(data)
		next(data_it) # skip the filtered out string
		for got in ds.iterate(0, colname):
			want = next(data_it)
			if want is not None:
				want = want.encode("ascii")
			assert want == got, "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)

	# and now check that the Nones are ok after making bytes from ascii and unicode from bytes.
	jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
		column2type=dict(
			ascii_new="bytes",
			ascii_old="bytes",
			bytes_new="unicode:ascii",
			bytes_old="unicode:ascii",
		),
	))
	ds = Dataset(jid)
	for colname in ("ascii_new", "ascii_old", "bytes_new", "bytes_old",):
		for want, got in ds.iterate(0, ["unicode_new", colname]):
			assert uni(want) == uni(got), "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)