Example #1
def synthesis(prepare_res):
	# Pass on only the options that csvimport itself understands.
	opts = DotDict((k, v) for k, v in options.items() if k in a_csvimport.options)
	lst = prepare_res
	previous = datasets.previous
	msg = ProgressMsg(lst)
	with status('importing') as update:
		for fn, info, dsn in lst:
			update(msg.step('importing'))
			opts.filename = fn
			show_fn = '%s:%s' % (options.filename, info.filename,)
			# One csvimport subjob per file extracted from the zip.
			ds = build('csvimport', options=opts, previous=previous, caption='Import of ' + show_fn).dataset()
			previous = ds.link_to_here(dsn, filename=show_fn)
			if options.chaining == 'off':
				previous = datasets.previous
	# Also expose the last imported dataset under the name "default".
	if (len(lst) == 1 or options.chaining != 'off') and dsn != 'default':
		ds.link_to_here('default', filename=show_fn)
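
This synthesis appears to belong to the zip-import method whose module header and options are shown in Example #4. As a rough sketch of how such a method might be driven from a build script; the method name 'csvimport_zip' and all option values here are assumptions, not taken from the example:

def main(urd):
	# Hypothetical invocation of the zip-import method above.
	imp = urd.build(
		'csvimport_zip',
		options=dict(
			filename='archive.zip',      # the zip file to import
			chaining='by_filename',      # chain the imported files in filename order
			include_re=r'\.csv$',        # only import members matching this regex
		),
	)
	# When chaining is on, the last import is also linked as "default".
	print(imp.dataset())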
Example #2
def synthesis():
    sum = 0
    # Collect the chain of datasets ending in datasets.source, bounded by
    # options.chain_length and/or the optional stop dataset.
    jobs = datasets.source.chain(length=options.chain_length,
                                 stop_ds=datasets.stop)
    for src in jobs:
        # Checksum each dataset in a subjob and XOR the results together.
        data = build('dataset_checksum',
                     columns=options.columns,
                     sort=options.sort,
                     source=src).load()
        sum ^= data.sum
    print("Total: %016x" % (sum, ))
    return DotDict(sum=sum,
                   columns=data.columns,
                   sort=options.sort,
                   sources=jobs)
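
A caller can pick up the DotDict returned above with Job.load(). A minimal sketch of a build-script invocation, assuming the method is registered as 'dataset_checksum_chain' (the name is an assumption) and using a small csvimport job only to give the sketch a source dataset:

def main(urd):
    imported = urd.build('csvimport', options=dict(filename='data.csv'))
    job = urd.build(
        'dataset_checksum_chain',        # assumed name for the method above
        datasets=dict(source=imported.dataset()),
        options=dict(chain_length=-1),   # -1: follow the whole chain
    )
    res = job.load()                     # the DotDict returned by synthesis()
    print('%016x' % (res.sum,), len(res.sources))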
Example #3
def synthesis(prepare_res, analysis_res):
    separator, _, _, filename, _, labels, dw, bad_dw, skipped_dw, fds, success_fh, _, = prepare_res
    # Analysis may have gotten a perfectly legitimate EOF if something
    # went wrong in the reader process, so we need to check that all
    # went well.
    reader_res = []
    try:
        success_fh.seek(0)
        reader_res = success_fh.read()
    except OSError:
        pass
    if reader_res != b"\0":
        reader_res = reader_res.decode("utf-8", "replace").strip("\r\n \t\0")
        raise Exception(reader_res or "Reader process failed")
    success_fh.close()
    os.unlink("reader.success")
    good_counts = []
    bad_counts = []
    skipped_counts = []
    # Record the per-slice line counts reported by analysis in each writer.
    for sliceno, (good_count, bad_count,
                  skipped_count) in enumerate(analysis_res):
        dw.set_lines(sliceno, good_count)
        if bad_dw:
            bad_dw.set_lines(sliceno, bad_count)
        if skipped_dw:
            skipped_dw.set_lines(sliceno, skipped_count)
        good_counts.append(good_count)
        bad_counts.append(bad_count)
        skipped_counts.append(skipped_count)
    # Enable gzip compression on every writer that is in use.
    for dw in (
            bad_dw,
            skipped_dw,
            dw,
    ):
        if dw:
            dw.set_compressions("gzip")
    return DotDict(
        num_lines=sum(good_counts),
        lines_per_slice=good_counts,
        num_broken_lines=sum(bad_counts),
        broken_lines_per_slice=bad_counts,
        num_skipped_lines=sum(skipped_counts),
        skipped_lines_per_slice=skipped_counts,
    )
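
The check above expects the reader process to leave a file named reader.success containing a single NUL byte when it finished cleanly, and an error message (or nothing) otherwise. A minimal sketch of that convention from the reader side; the function name and its use are assumptions for illustration, not part of the example:

def finish_reader(error=None):
    # Counterpart of the check in synthesis(): b"\0" means success,
    # anything else is treated as an error message.
    with open("reader.success", "wb") as fh:
        if error:
            fh.write(error.encode("utf-8"))
        else:
            fh.write(b"\0")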
Example #4
default is to include directories.
'''

from zipfile import ZipFile
from shutil import copyfileobj
from os.path import join
import re

from accelerator.compat import uni

from . import a_csvimport
from accelerator import DotDict, OptionEnum, build

depend_extra = (a_csvimport, )

options = DotDict(a_csvimport.options)
options.inside_filenames = {}  # {"filename in zip": "dataset name"} or empty to import all files
options.chaining = OptionEnum('off on by_filename by_dsname').on
options.include_re = ""  # Regex of files to include. (Matches anywhere, use ^$ as needed.)
options.exclude_re = ""  # Regex of files to exclude, takes priority over include.
options.strip_dirs = False  # Strip directories from filename (a/b/c -> c)

datasets = ('previous', )


def namefix(d, name):
    # Replace anything outside [A-Za-z0-9._-] with '_' so the name can be used as a dataset name.
    ok = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz._-'
    name = ''.join(c if c in ok else '_' for c in uni(name))
    # "default" is reserved for the last dataset when chaining (see Example #1), so avoid it.
    if name == 'default' and options.chaining != 'off':
        name = 'default_'