Example #1
 def __new__(cls, jobid, name=None):
     if isinstance(jobid, (tuple, list)):
         jobid = _dsid(jobid)
     elif isinstance(jobid, dict):
         assert not name, "Don't pass both a separate name and jobid as {job: dataset}"
         assert len(jobid) == 1, "Only pass a single {job: dataset}"
         jobid, dsname = next(iteritems(jobid))
         if not jobid:
             return None
         jobid = job_params(jobid, default_empty=True).datasets.get(dsname)
         if not jobid:
             return None
     if '/' in jobid:
         assert not name, "Don't pass both a separate name and jobid as jid/name"
         jobid, name = jobid.split('/', 1)
     assert jobid, "If you really meant to use yourself as a dataset, pass params.jobid explicitly."
     name = uni(name or 'default')
     assert '/' not in name
     if name == 'default':
         suffix = ''
     else:
         suffix = '/' + name
     if jobid is _new_dataset_marker:
         from g import JOBID
         fullname = JOBID + suffix
     else:
         fullname = jobid + suffix
     obj = unicode.__new__(cls, fullname)
     obj.name = uni(name or 'default')
     if jobid is _new_dataset_marker:
         obj._data = DotDict({
             'version': (
                 2,
                 2,
             ),
             'filename': None,
             'hashlabel': None,
             'caption': '',
             'columns': {},
             'parent': None,
             'previous': None,
             'lines': [],
         })
         obj.jobid = None
     else:
         obj.jobid = jobid
         obj._data = DotDict(_ds_load(obj))
         assert obj._data.version[0] == 2 and obj._data.version[1] >= 2, \
             "%s/%s: Unsupported dataset pickle version %r" % (jobid, name, obj._data.version)
         obj._data.columns = dict(obj._data.columns)
     return obj
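
Judging from the branches above, a dataset reference can arrive as a plain jobid string, a "jobid/name" string, a (jobid, name) tuple or list, or a single-entry {job: dataset} dict. A minimal, self-contained sketch of just the string handling; the helper name is made up for illustration:

def parse_ds_ref(ref, name=None):
	# 'jid/name' carries the dataset name inline; a bare jid means the default dataset.
	if '/' in ref:
		assert not name, "Don't pass both a separate name and ref as jid/name"
		ref, name = ref.split('/', 1)
	name = name or 'default'
	assert '/' not in name
	suffix = '' if name == 'default' else '/' + name
	return ref + suffix, name

print(parse_ds_ref('job-0'))        # ('job-0', 'default')
print(parse_ds_ref('job-0/extra'))  # ('job-0/extra', 'extra')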
Example #2
							def run(jobidv, tlock):
								for jobid in jobidv:
									passed_cookie = None
									# This is not a race - all higher locks are locked too.
									while passed_cookie in job_tracking:
										passed_cookie = gen_cookie()
									job_tracking[passed_cookie] = DotDict(lock=JLock(), last_error=None, last_time=0)
									try:
										self.ctrl.run_job(jobid, subjob_cookie=passed_cookie, parent_pid=setup.get('parent_pid', 0))
										# update database since a new jobid was just created
										job = self.ctrl.add_single_jobid(jobid)
										with tlock:
											link2job[jobid]['make'] = 'DONE'
											link2job[jobid]['total_time'] = job.total
									except JobError as e:
										error.append([e.jobid, e.method, e.status])
										with tlock:
											link2job[jobid]['make'] = 'FAIL'
										return
									finally:
										del job_tracking[passed_cookie]
								# everything was built ok, update symlink
								try:
									wn = self.ctrl.current_workspace
									dn = self.ctrl.workspaces[wn].path
									ln = os.path.join(dn, wn + "-LATEST_")
									try:
										os.unlink(ln)
									except OSError:
										pass
									os.symlink(jobid, ln)
									os.rename(ln, os.path.join(dn, wn + "-LATEST"))
								except Exception:
									pass # meh
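
The symlink update at the end follows the usual atomic-replace idiom: create the link under a temporary name, then rename() it over the final name, so the "-LATEST" link is never missing or dangling while it is being updated. The same idiom on its own (paths are hypothetical):

import os

def replace_symlink(target, linkname):
	# Build the link under a temporary name, then rename it into place.
	# rename() over an existing name is atomic on POSIX.
	tmp = linkname + '_'
	try:
		os.unlink(tmp)
	except OSError:
		pass
	os.symlink(target, tmp)
	os.rename(tmp, linkname)

# replace_symlink('dev-1234', '/workdirs/dev/dev-LATEST')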
Example #3
def synthesis(prepare_res, analysis_res):
	all = chain.from_iterable(analysis_res)
	if options.sort:
		all = sorted(all)
	res = md5(''.join(all)).hexdigest()
	print("%s: %s" % (datasets.source, res,))
	return DotDict(sum=int(res, 16), sort=options.sort, columns=prepare_res, source=datasets.source)
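
The checksum is the md5 of all column values joined together, optionally sorted first so the digest does not depend on which slice produced which lines. A rough equivalent that also runs on Python 3 (where hashlib wants bytes); the function name is made up:

from hashlib import md5

def checksum_lines(lines, sort=True):
	if sort:
		lines = sorted(lines)
	h = md5()
	for line in lines:
		h.update(line.encode('utf-8'))  # hashlib needs bytes on Python 3
	return h.hexdigest()

print(checksum_lines(['b', 'a', 'c']) == checksum_lines(['c', 'a', 'b']))  # True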
Example #4
def params2defaults(params):
	d = DotDict()
	for key in ('datasets', 'jobids',):
		r = {}
		for v in params[key]:
			if isinstance(v, list):
				r[v[0]] = []
			else:
				r[v] = None
		d[key] = r
	def fixup(item):
		if isinstance(item, dict):
			d = {k: fixup(v) for k, v in iteritems(item)}
			if len(d) == 1 and first_value(d) is None and first_value(item) is not None:
				return {}
			return d
		if isinstance(item, (list, tuple, set,)):
			l = [fixup(v) for v in item]
			if l == [None] and list(item) != [None]:
				l = []
			return type(item)(l)
		if isinstance(item, type):
			return None
		assert isinstance(item, (bytes, unicode, int, float, long, bool, OptionEnum, NoneType, datetime.datetime, datetime.date, datetime.time, datetime.timedelta)), type(item)
		return item
	def fixup0(item):
		if isinstance(item, RequiredOption):
			item = item.value
		if isinstance(item, OptionDefault):
			item = item.default
		return fixup(item)
	d.options = {k: fixup0(v) for k, v in iteritems(params.options)}
	return d
Example #5
 def finish(self, path, timestamp=None, caption=None):
     path = self._path(path)
     assert self._current, 'Tried to finish %s with nothing running' % (path,)
     assert path == self._current, 'Tried to finish %s while running %s' % (path, self._current)
     user, automata = path.split('/')
     self._current = None
     caption = caption or self._current_caption or ''
     timestamp = timestamp or self._current_timestamp
     assert timestamp, 'No timestamp specified in begin or finish for %s' % (path,)
     data = DotDict(
         user=user,
         automata=automata,
         joblist=self.joblist,
         deps=self._deps,
         caption=caption,
         timestamp=timestamp,
     )
     if self._update:
         data.flags = ['update']
     url = self._url + '/add'
     return self._call(url, data)
Example #6
	def run_job(self, jobid, subjob_cookie=None, parent_pid=0):
		W = self.workspaces[get_workspace_name(jobid)]
		#
		active_workspaces = {}
		for name in self.source_workdirs:
			active_workspaces[name] = self.workspaces[name].get_path()
		slices = self.workspaces[self.target_workdir].get_slices()

		t0 = time.time()
		setup = update_setup(jobid, starttime=t0)
		prof = setup.profile or DotDict()
		new_prof, files, subjobs = dispatch.launch(W.path, setup, self.config, self.Methods, active_workspaces, slices, self.debug, self.daemon_url, subjob_cookie, parent_pid)
		if self.debug:
			delete_from = Temp.TEMP
		else:
			delete_from = Temp.DEBUG
		for filename, temp in list(files.items()):
			if temp >= delete_from:
				unlink(join(W.path, jobid, filename))
				del files[filename]
		prof.update(new_prof)
		prof.total = 0
		prof.total = sum(v for v in prof.values() if isinstance(v, (float, int)))
		data = dict(
			starttime=t0,
			endtime=time.time(),
			profile=prof,
		)
		update_setup(jobid, **data)
		data['files'] = files
		data['subjobs'] = subjobs
		json_save(data, resolve_jobid_filename(jobid, 'post.json'))
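
Note the two-step total: prof.total is first set to 0, presumably so that any pre-existing total carried in the profile is not counted into the new sum of the numeric values. The same trick with a plain dict:

prof = {'prepare': 1.5, 'analysis': 2.5, 'total': 99.0}  # 'total' here is stale
prof['total'] = 0
prof['total'] = sum(v for v in prof.values() if isinstance(v, (float, int)))
print(prof['total'])  # 4.0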
Example #7
def empty_spilldata(spill_ds='default'):
	return DotDict(
		version     = 1,
		counter     = 0,
		spill_ds    = spill_ds,
		last_time   = False,
		seen_before = False,
	)
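
DotDict is the project's dict with attribute access, which is what lets these examples write spill.counter or stats.minmax instead of indexing. A simplified stand-in showing the behaviour the examples rely on (not the library's actual implementation):

class DotDictSketch(dict):
	# Attribute access maps straight to item access.
	def __getattr__(self, name):
		try:
			return self[name]
		except KeyError:
			raise AttributeError(name)
	def __setattr__(self, name, value):
		self[name] = value
	def __delattr__(self, name):
		del self[name]

spill = DotDictSketch(version=1, counter=0, spill_ds='default')
spill.counter += 1
print(spill['counter'], spill.spill_ds)  # 1 default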
Example #8
def real_synthesis(params, options, datasets, minmax_index, prepare_res, we_have_spill, save_discard=False):
	stats = DotDict(
		included_lines          = [0] * params.slices,
		discarded_lines         = [0] * params.slices,
		spilled_lines           = [0] * params.slices,
		virtually_spilled_lines = [0] * params.slices,
		split_date              = str(options.split_date) if options.split_date else None,
		discard_before_date     = str(options.discard_before_date) if options.discard_before_date else None,
	)
	minmax_per_slice = [{} for _ in range(params.slices)]
	def update_stats(data):
		for item in data.itervalues():
			stats.included_lines[sliceno] += item.counters[2]
			stats.discarded_lines[sliceno] += item.counters[1]
			if item.virtual_spill:
				stats.virtually_spilled_lines[sliceno] += item.counters[3]
			else:
				stats.spilled_lines[sliceno] += item.counters[3]
			update_minmax(minmax_per_slice[sliceno], item.minmax)
	def update_minmax(dest, src):
		for name, lst0 in src.iteritems():
			lst1 = dest.get(name, lst0)
			mins = map(min, zip(lst0[:3], lst1[:3]))
			maxs = map(max, zip(lst0[3:], lst1[3:]))
			dest[name] = mins + maxs
	for sliceno in range(params.slices):
		update_stats(blob.load('stats', sliceno=sliceno))
	minmax = {}
	for item in minmax_per_slice:
		update_minmax(minmax, item)
	def minmax_select(offset, stringify=False):
		d = {}
		for k, v in minmax.iteritems():
			mn = v[offset]
			mx = v[3 + offset]
			if mn <= mx:
				if stringify and isinstance(mn, (date, time,)):
					d[k] = [str(mn), str(mx)]
				else:
					d[k] = [mn, mx]
		return d
	dw, dw_spill = prepare_res[:2]
	dw.set_minmax(None, minmax_select(minmax_index))
	dw_spill.set_minmax(None, minmax_select(2))
	if save_discard:
		included_lines = stats.discarded_lines
	else:
		included_lines = stats.included_lines
	for sliceno in range(params.slices):
		dw.set_lines(sliceno, included_lines[sliceno])
		dw_spill.set_lines(sliceno, stats.spilled_lines[sliceno])
	if not we_have_spill:
		dw_spill.discard()
	stats.minmax_discarded = minmax_select(0, True)
	stats.minmax           = minmax_select(1, True)
	stats.minmax_spilled   = minmax_select(2, True)
	json_save(stats)
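
Each per-column minmax value is a six-element list: three minimum candidates followed by three maximum candidates, and update_minmax merges two such lists element-wise. The same merge on plain dicts, with made-up column data:

def merge_minmax(dest, src):
	# value layout: [min0, min1, min2, max0, max1, max2]
	for name, lst0 in src.items():
		lst1 = dest.get(name, lst0)
		mins = [min(a, b) for a, b in zip(lst0[:3], lst1[:3])]
		maxs = [max(a, b) for a, b in zip(lst0[3:], lst1[3:])]
		dest[name] = mins + maxs

minmax = {}
merge_minmax(minmax, {'ts': [3, 1, 5, 7, 2, 9]})
merge_minmax(minmax, {'ts': [2, 4, 4, 8, 1, 6]})
print(minmax['ts'])  # [2, 1, 4, 8, 2, 9]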
Example #9
def synthesis(prepare_res, params):
    ds = DotDict()
    # Must be finished in order (.previous must be finished when .finish is called.)
    for name, dw in sorted(prepare_res.items()):
        ds[name] = dw.finish()
    last = ds.h
    assert last.chain() == sorted(ds.values())
    ds.last = last
    test_partial_chains(ds)
    test_filters(ds)
Example #10
def synthesis(analysis_res):
    opts = DotDict(options)
    del opts.inside_filenames
    lst = analysis_res.merge_auto()
    for fn, dsn in lst:
        opts.filename = fn
        jid = subjobs.build('csvimport', options=opts)
        unlink(fn)
        Dataset(jid).link_to_here(dsn)
    if len(lst) == 1 and dsn != 'default':
        Dataset(jid).link_to_here('default')
Example #11
def _urd_typeify(d):
    if isinstance(d, str):
        d = json.loads(d)
        if not d or isinstance(d, unicode):
            return d
    res = DotDict(_default=lambda: None)
    for k, v in d.items():
        if k == 'joblist':
            v = JobList(v)
        elif isinstance(v, dict):
            v = _urd_typeify(v)
        res[k] = v
    return res
Example #12
def _v2_columntypefix(ds):
    if ds.version[0] == 3:
        return ds
    assert ds.version[0] == 2 and ds.version[1] >= 2, \
        "%s: Unsupported dataset pickle version %r" % (ds, ds.version)
    ds = DotDict(ds)
    ds.columns = {name: _dc_v2to3(dc) for name, dc in ds.columns.items()}
    if 'cache' in ds:
        ds.cache = [(k, _v2_columntypefix(v)) for k, v in ds.cache]
    ds.version = (3, 0)
    return ds
Example #13
def synthesis():
    sum = 0
    jobs = datasets.source.chain(length=options.chain_length,
                                 stop_jobid=datasets.stop)
    for src in jobs:
        jid = build('dataset_checksum',
                    options=dict(columns=options.columns, sort=options.sort),
                    datasets=dict(source=src))
        data = blob.load(jobid=jid)
        sum ^= data.sum
    print("Total: %016x" % (sum, ))
    return DotDict(sum=sum,
                   columns=data.columns,
                   sort=options.sort,
                   sources=jobs)
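
The per-source checksums are folded together with XOR, so the total is independent of the order in which the chain happens to be walked:

parts = [0x1234, 0xbeef, 0x0042]
total = 0
for part in parts:
	total ^= part
# XOR is commutative and associative, so any order gives the same result.
assert total == 0x0042 ^ 0x1234 ^ 0xbeef
print("%016x" % (total,))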
Example #14
def read_method_conf(filename):
	""" read and parse the methods.conf file """
	db = {}
	with open(filename) as fh:
		for lineno, line in enumerate(fh, 1):
			data = line.split('#')[0].split()
			if not data:
				continue
			method = data.pop(0)
			try:
				version = data.pop(0)
			except IndexError:
				version = 'py'
			if not version.startswith('py') or data:
				raise Exception('Trailing garbage on %s:%d: %s' % (filename, lineno, line,))
			db[method] = DotDict(version=version)
	return db
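
So the accepted format is one method per line, an optional interpreter tag starting with "py", and "#" comments; anything else on the line raises. A quick check against the function above, assuming DotDict's attribute access (the file name and contents are made up):

with open('methods.conf.example', 'w') as fh:
	fh.write('# comments and blank lines are ignored\n')
	fh.write('csvimport\n')
	fh.write('dataset_checksum py3  # trailing comment\n')
db = read_method_conf('methods.conf.example')
print(db['csvimport'].version, db['dataset_checksum'].version)  # py py3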
Example #15
def prepare(params):
    assert params.slices >= 2, "Hashing won't do anything with just one slice"
    dws = DotDict()
    for name, hashlabel in (
        ("unhashed_manual", None),  # manually interlaved
        ("unhashed_split", None),  # split_write interlaved
        ("up_checked", "up"),  # hashed on up using dw.hashcheck
        ("up_split", "up"),  # hashed on up using split_write
        ("down_checked", "down"),  # hashed on down using dw.hashcheck
        ("down_discarded", "down"),  # hashed on down using discarding writes
        ("down_discarded_list",
         "down"),  # hashed on down using discarding list writes
        ("down_discarded_dict",
         "down"),  # hashed on down using discarding dict writes
    ):
        dw = DatasetWriter(name=name, hashlabel=hashlabel)
        dw.add("up", "int32")
        dw.add("down", "int32")
        dws[name] = dw
    return dws
Example #16
from compat import unicode

from extras import json_encode, json_decode, DotDict
from dispatch import JobError
from status import statmsg_sink, children, print_status_stacks, status_stacks_export

DEBUG_WRITE_JSON = False


def gen_cookie(size=16):
    return ''.join(random.choice(ascii_letters) for _ in range(size))


# This contains cookie: {lock, last_error, last_time} for all jobs, main jobs have cookie None.
job_tracking = {None: DotDict(lock=JLock(), last_error=None, last_time=0)}


# This needs .ctrl to work. It is set from main()
class XtdHandler(BaseWebHandler):
    server_version = "scx/0.1"
    DEBUG = not True

    def log_message(self, format, *args):
        return

    def encode_body(self, body):
        if isinstance(body, bytes):
            return body
        if isinstance(body, unicode):
            return body.encode('utf-8')
Example #17
    def _handle_req(self, path, args):
        if path[0] == 'status':
            data = job_tracking.get(args.get('subjob_cookie') or None)
            if not data:
                self.do_response(500, 'text/plain', 'bad subjob_cookie!\n')
                return
            timeout = min(float(args.get('timeout', 0)), 128)
            status = DotDict(idle=data.lock.acquire(False))
            deadline = time.time() + timeout
            while not status.idle and time.time() < deadline:
                time.sleep(0.1)
                status.idle = data.lock.acquire(False)
            if status.idle:
                if data.last_error:
                    status.last_error = data.last_error
                    data.last_error = None
                else:
                    status.last_time = data.last_time
                data.lock.release()
            elif path == ['status', 'full']:
                status.status_stacks, status.current = status_stacks_export()
            self.do_response(200, "text/json", status)
            return

        elif path == ['list_workspaces']:
            ws = {k: v.path for k, v in self.ctrl.list_workspaces().items()}
            self.do_response(200, "text/json", ws)

        elif path == ['config']:
            self.do_response(200, "text/json", self.ctrl.config)

        elif path == ['update_methods']:
            self.do_response(200, "text/json", self.ctrl.update_methods())

        elif path == ['methods']:
            """ return a json with everything the Method object knows about the methods """
            self.do_response(200, "text/json", self.ctrl.get_methods())

        elif path[0] == 'method_info':
            method = path[1]
            self.do_response(200, "text/json", self.ctrl.method_info(method))

        elif path[0] == 'workspace_info':
            self.do_response(200, 'text/json',
                             self.ctrl.get_workspace_details())

        elif path[0] == 'abort':
            tokill = list(children)
            print('Force abort', tokill)
            for child in tokill:
                os.killpg(child, signal.SIGKILL)
            self.do_response(200, 'text/json', {'killed': len(tokill)})

        elif path == ['submit']:
            if self.ctrl.broken:
                self.do_response(
                    500, "text/json", {
                        'broken':
                        self.ctrl.broken,
                        'error':
                        'Broken methods: ' + ', '.join(
                            sorted(
                                m.split('.')[-1][2:]
                                for m in self.ctrl.broken))
                    })
            elif 'xml' in args:
                self.do_response(500, 'text/plain', 'JSON > XML!\n')
            elif 'json' in args:
                if DEBUG_WRITE_JSON:
                    with open('DEBUG_WRITE.json', 'wb') as fh:
                        fh.write(args['json'])
                setup = json_decode(args['json'])
                data = job_tracking.get(setup.get('subjob_cookie') or None)
                if not data:
                    self.do_response(500, 'text/plain', 'bad subjob_cookie!\n')
                    return
                if len(job_tracking) - 1 > 5:  # max five levels
                    print('Too deep subjob nesting!')
                    self.do_response(500, 'text/plain',
                                     'Too deep subjob nesting')
                    return
                if data.lock.acquire(False):
                    respond_after = True
                    try:
                        if self.DEBUG:
                            print('@daemon.py:  Got the lock!',
                                  file=sys.stderr)
                        jobidv, job_res = self.ctrl.initialise_jobs(setup)
                        job_res['done'] = False
                        if jobidv:
                            error = []
                            tlock = TLock()
                            link2job = {
                                j['link']: j
                                for j in job_res['jobs'].values()
                            }

                            def run(jobidv, tlock):
                                for jobid in jobidv:
                                    passed_cookie = None
                                    # This is not a race - all higher locks are locked too.
                                    while passed_cookie in job_tracking:
                                        passed_cookie = gen_cookie()
                                    job_tracking[passed_cookie] = DotDict(
                                        lock=JLock(),
                                        last_error=None,
                                        last_time=0)
                                    try:
                                        self.ctrl.run_job(
                                            jobid,
                                            subjob_cookie=passed_cookie,
                                            parent_pid=setup.get(
                                                'parent_pid', 0))
                                        # update database since a new jobid was just created
                                        job = self.ctrl.add_single_jobid(jobid)
                                        with tlock:
                                            link2job[jobid]['make'] = 'DONE'
                                            link2job[jobid][
                                                'total_time'] = job.total
                                    except JobError as e:
                                        error.append(
                                            [e.jobid, e.method, e.status])
                                        with tlock:
                                            link2job[jobid]['make'] = 'FAIL'
                                        return
                                    finally:
                                        del job_tracking[passed_cookie]
                                # everything was built ok, update symlink
                                try:
                                    wn = self.ctrl.target_workdir
                                    dn = self.ctrl.workspaces[wn].path
                                    ln = os.path.join(dn, wn + "-LATEST_")
                                    try:
                                        os.unlink(ln)
                                    except OSError:
                                        pass
                                    os.symlink(jobid, ln)
                                    os.rename(ln,
                                              os.path.join(dn, wn + "-LATEST"))
                                except OSError:
                                    traceback.print_exc()

                            t = Thread(target=run,
                                       name="job runner",
                                       args=(
                                           jobidv,
                                           tlock,
                                       ))
                            t.daemon = True
                            t.start()
                            t.join(2)  # give job two seconds to complete
                            with tlock:
                                for j in link2job.values():
                                    if j['make'] in (
                                            True,
                                            'FAIL',
                                    ):
                                        respond_after = False
                                        job_res_json = json_encode(job_res)
                                        break
                            if not respond_after:  # not all jobs are done yet, give partial response
                                self.do_response(200, "text/json",
                                                 job_res_json)
                            t.join()  # wait until actually complete
                            del tlock
                            del t
                            # verify that all jobs got built.
                            total_time = 0
                            for j in link2job.values():
                                jobid = j['link']
                                if j['make'] == True:
                                    # Well, crap.
                                    error.append([
                                        jobid, "unknown", {
                                            "INTERNAL": "Not built"
                                        }
                                    ])
                                    print("INTERNAL ERROR IN JOB BUILDING!",
                                          file=sys.stderr)
                                total_time += j.get('total_time', 0)
                            data.last_error = error
                            data.last_time = total_time
                    except Exception as e:
                        if respond_after:
                            self.do_response(500, "text/json",
                                             {'error': str(e)})
                        raise
                    finally:
                        data.lock.release()
                    if respond_after:
                        job_res['done'] = True
                        self.do_response(200, "text/json", job_res)
                    if self.DEBUG:
                        print("@daemon.py:  Process releases lock!",
                              file=sys.stderr
                              )  # note: has already done http response
                else:
                    self.do_response(200, 'text/plain',
                                     'Busy doing work for you...\n')
            else:
                self.do_response(500, 'text/plain', 'Missing json input!\n')
        else:
            self.do_response(500, 'text/plain', 'Unknown path\n')
            return
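
The status branch near the top never blocks on the job lock: it retries a non-blocking acquire in a loop until a deadline (the requested timeout, capped at 128 seconds), so the handler can answer within the client's timeout even while a job is running. That idiom in isolation (the lock and timeout here are hypothetical):

import time
from threading import Lock

def wait_for_idle(lock, timeout):
	# Poll a non-blocking acquire until it succeeds or the deadline passes.
	deadline = time.time() + min(timeout, 128)
	idle = lock.acquire(False)
	while not idle and time.time() < deadline:
		time.sleep(0.1)
		idle = lock.acquire(False)
	if idle:
		lock.release()
	return idle

print(wait_for_idle(Lock(), 1))  # True - nothing else holds the lock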
Example #18
def load_methods(data):
    res_warnings = []
    res_failed = []
    res_hashes = {}
    res_params = {}
    for package, key in data:
        filename = '%s/a_%s.py' % (
            package,
            key,
        )
        modname = '%s.a_%s' % (package, key)
        try:
            with open(filename, 'rb') as fh:
                src = fh.read()
            tar_fh = io.BytesIO()
            tar_o = tarfile.open(mode='w:gz', fileobj=tar_fh, compresslevel=1)
            tar_o.add(filename, arcname='a_%s.py' % (key, ))
            h = hashlib.sha1(src)
            hash = int(h.hexdigest(), 16)
            mod = import_module(modname)
            prefix = os.path.dirname(mod.__file__) + '/'
            likely_deps = set()
            for k in dir(mod):
                v = getattr(mod, k)
                if isinstance(v, ModuleType):
                    dep = getattr(v, '__file__', '')
                    if dep.startswith(prefix):
                        dep = os.path.basename(dep)
                        if dep[-4:] in (
                                '.pyc',
                                '.pyo',
                        ):
                            dep = dep[:-1]
                        likely_deps.add(dep)
            hash_extra = 0
            for dep in getattr(mod, 'depend_extra', ()):
                if isinstance(dep, ModuleType):
                    dep = dep.__file__
                    if dep[-4:] in (
                            '.pyc',
                            '.pyo',
                    ):
                        dep = dep[:-1]
                if isinstance(dep, str):
                    if not dep.startswith('/'):
                        dep = prefix + dep
                    with open(dep, 'rb') as fh:
                        hash_extra ^= int(
                            hashlib.sha1(fh.read()).hexdigest(), 16)
                    bn = os.path.basename(dep)
                    likely_deps.discard(bn)
                    tar_o.add(dep, arcname=bn)
                else:
                    raise Exception('Bad depend_extra in %s.a_%s: %r' % (
                        package,
                        key,
                        dep,
                    ))
            for dep in likely_deps:
                res_warnings.append(
                    '%s.a_%s should probably depend_extra on %s' % (
                        package,
                        key,
                        dep[:-3],
                    ))
            res_hashes[key] = ("%040x" % (hash ^ hash_extra, ), )
            res_params[key] = params = DotDict()
            for name, default in (('options', {}), ('datasets', ()), ('jobids', ())):
                params[name] = getattr(mod, name, default)
            equivalent_hashes = getattr(mod, 'equivalent_hashes', ())
            if equivalent_hashes:
                assert isinstance(
                    equivalent_hashes,
                    dict), 'Read the docs about equivalent_hashes'
                assert len(equivalent_hashes
                           ) == 1, 'Read the docs about equivalent_hashes'
                k, v = next(iteritems(equivalent_hashes))
                assert isinstance(k,
                                  str), 'Read the docs about equivalent_hashes'
                assert isinstance(
                    v, tuple), 'Read the docs about equivalent_hashes'
                for v in v:
                    assert isinstance(
                        v, str), 'Read the docs about equivalent_hashes'
                start = src.index('equivalent_hashes')
                end = src.index('}', start)
                h = hashlib.sha1(src[:start])
                h.update(src[end:])
                verifier = "%040x" % (int(h.hexdigest(), 16) ^ hash_extra, )
                if verifier in equivalent_hashes:
                    res_hashes[key] += equivalent_hashes[verifier]
                else:
                    res_warnings.append(
                        '%s.a_%s has equivalent_hashes, but missing verifier %s'
                        % (
                            package,
                            key,
                            verifier,
                        ))
            tar_o.close()
            tar_fh.seek(0)
            archives[key] = tar_fh.read()
        except Exception:
            print_exc()
            res_failed.append(modname)
            continue
    return res_warnings, res_failed, res_hashes, res_params
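
Each method's hash is the SHA-1 of its own source XORed with the SHA-1 of every depend_extra file, so touching any of them changes the fingerprint. The combining step on its own (file names are hypothetical):

import hashlib

def combined_hash(paths):
	# XOR together the SHA-1 digests (as integers) of all listed files.
	total = 0
	for path in paths:
		with open(path, 'rb') as fh:
			total ^= int(hashlib.sha1(fh.read()).hexdigest(), 16)
	return '%040x' % (total,)

# combined_hash(['dev/a_mymethod.py', 'dev/helper.py'])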
Example #19
def synthesis(params, analysis_res, prepare_res):
    r = report()
    res = DotDict()
    d = datasets.source
    analysis_res = list(analysis_res)
    if options.filter_bad:
        num_lines_per_split = [
            num - data[1] for num, data in zip(d.lines, analysis_res)
        ]
        res.bad_line_count_per_slice = [data[1] for data in analysis_res]
        res.bad_line_count_total = sum(res.bad_line_count_per_slice)
        r.println('Slice   Bad line count')
        for sliceno, cnt in enumerate(res.bad_line_count_per_slice):
            r.println('%5d   %d' % (
                sliceno,
                cnt,
            ))
        r.println('total   %d' % (res.bad_line_count_total, ))
        r.line()
        r.println('Slice   Bad line number')
        reported_count = 0
        for sliceno, data in enumerate(analysis_res):
            fn = 'badmap%d' % (sliceno, )
            if data[1] and reported_count < 32:
                with open(fn, 'rb') as fh:
                    badmap = mmap(fh.fileno(), 0, prot=PROT_READ)
                    for ix, v in enumerate(imap(ord, badmap)):
                        if v:
                            for jx in range(8):
                                if v & (1 << jx):
                                    r.println('%5d   %d' % (
                                        sliceno,
                                        ix * 8 + jx,
                                    ))
                                    reported_count += 1
                                    if reported_count >= 32: break
                            if reported_count >= 32: break
                    badmap.close()
            unlink(fn)
        if reported_count >= 32:
            r.println('...')
        r.line()
        res.bad_line_count_per_column = {}
        r.println('Bad line count   Column')
        for colname in sorted(analysis_res[0][0]):
            cnt = sum(data[0][colname] for data in analysis_res)
            r.println('%14d   %s' % (
                cnt,
                colname,
            ))
            res.bad_line_count_per_column[colname] = cnt
        r.line()
    else:
        num_lines_per_split = d.lines
    dw = prepare_res
    for sliceno, count in enumerate(num_lines_per_split):
        dw.set_lines(sliceno, count)
    if options.defaults:
        r.println('Defaulted values')
        res.defaulted_per_slice = {}
        res.defaulted_total = {}
        for colname in sorted(options.defaults):
            r.println('    %s:' % (colname, ))
            r.println('        Slice   Defaulted line count')
            res.defaulted_per_slice[colname] = [
                data[2][colname] for data in analysis_res
            ]
            res.defaulted_total[colname] = sum(
                res.defaulted_per_slice[colname])
            for sliceno, cnt in enumerate(res.defaulted_per_slice[colname]):
                r.println('        %5d   %d' % (
                    sliceno,
                    cnt,
                ))
            r.println('        total   %d' % (res.defaulted_total[colname], ))
        r.line()
    for sliceno, data in enumerate(analysis_res):
        dw.set_minmax(sliceno, data[3])
    d = dw.finish()
    res.good_line_count_per_slice = num_lines_per_split
    res.good_line_count_total = sum(num_lines_per_split)
    r.line()
    r.println('Total of %d lines converted' % (res.good_line_count_total, ))
    r.close()
    json_save(res)
Example #20
	def as_dep(self):
		return DotDict(timestamp=self.timestamp, joblist=self.joblist, caption=self.caption, _default=lambda: None)
Example #21
def execute_process(workdir,
                    jobid,
                    slices,
                    result_directory,
                    common_directory,
                    source_directory,
                    index=None,
                    workspaces=None,
                    daemon_url=None,
                    subjob_cookie=None,
                    parent_pid=0):
    g.JOBID = jobid
    setproctitle('launch')
    path = os.path.join(workdir, jobid)
    try:
        os.chdir(path)
    except Exception:
        print("Cannot cd to workdir", path)
        exit(1)

    g.params = params = job_params()
    method_ref = import_module(params.package + '.a_' + params.method)
    g.sliceno = -1

    if workspaces:
        jobid_module.put_workspaces(workspaces)

    def maybe_dataset(v):
        if isinstance(v, list):
            return [maybe_dataset(e) for e in v]
        if not v:
            return ''
        try:
            return dataset.Dataset(v)
        except IOError:
            return v

    datasets = DotDict(
        {k: maybe_dataset(v)
         for k, v in params.datasets.items()})

    g.options = params.options
    g.datasets = datasets
    g.jobids = params.jobids

    method_ref.options = params.options
    method_ref.datasets = datasets
    method_ref.jobids = params.jobids

    # compatibility names
    g.SLICES = slices
    g.JOBID = jobid
    g.jobid = jobid
    g.METHOD = params.method
    g.WORKSPACEPATH = workdir
    g.CAPTION = params.caption
    g.PACKAGE = params.package
    g.RESULT_DIRECTORY = result_directory
    g.COMMON_DIRECTORY = common_directory
    g.SOURCE_DIRECTORY = source_directory
    g.index = -1

    g.daemon_url = daemon_url
    g.running = 'launch'
    status._start('%s %s' % (
        jobid,
        params.method,
    ), parent_pid)

    def dummy():
        pass

    prepare_func = getattr(method_ref, 'prepare', dummy)
    analysis_func = getattr(method_ref, 'analysis', dummy)
    synthesis_func = getattr(method_ref, 'synthesis', dummy)

    synthesis_needs_analysis = 'analysis_res' in getarglist(synthesis_func)

    # A chain must be finished from the back, so sort on that.
    sortnum_cache = {}

    def dw_sortnum(name):
        if name not in sortnum_cache:
            dw = dataset._datasetwriters[name]
            if dw.previous and dw.previous.startswith(jobid + '/'):
                pname = dw.previous.split('/')[1]
                num = dw_sortnum(pname) + 1
            else:
                num = 0
            sortnum_cache[name] = num
        return sortnum_cache[name]

    prof = {}
    if prepare_func is dummy:
        prof['prepare'] = 0  # truthish!
    else:
        t = time()
        g.running = 'prepare'
        g.subjob_cookie = subjob_cookie
        setproctitle(g.running)
        with status.status(g.running):
            g.prepare_res = method_ref.prepare(**args_for(method_ref.prepare))
            to_finish = [
                dw.name for dw in dataset._datasetwriters.values()
                if dw._started
            ]
            if to_finish:
                with status.status("Finishing datasets"):
                    for name in sorted(to_finish, key=dw_sortnum):
                        dataset._datasetwriters[name].finish()
        prof['prepare'] = time() - t
    setproctitle('launch')
    from extras import saved_files
    if analysis_func is dummy:
        prof['per_slice'] = []
        prof['analysis'] = 0
    else:
        t = time()
        g.running = 'analysis'
        g.subjob_cookie = None  # subjobs are not allowed from analysis
        with status.status(
                'Waiting for all slices to finish analysis') as update:
            g.update_top_status = update
            prof['per_slice'], files, g.analysis_res = fork_analysis(
                slices, analysis_func, args_for(analysis_func),
                synthesis_needs_analysis)
            del g.update_top_status
        prof['analysis'] = time() - t
        saved_files.update(files)
    t = time()
    g.running = 'synthesis'
    g.subjob_cookie = subjob_cookie
    setproctitle(g.running)
    with status.status(g.running):
        synthesis_res = synthesis_func(**args_for(synthesis_func))
        if synthesis_res is not None:
            blob.save(synthesis_res, temp=False)
        if dataset._datasetwriters:
            with status.status("Finishing datasets"):
                for name in sorted(dataset._datasetwriters, key=dw_sortnum):
                    dataset._datasetwriters[name].finish()
    t = time() - t
    prof['synthesis'] = t

    from subjobs import _record
    status._end()
    return None, (prof, saved_files, _record)
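
Datasets written during prepare and synthesis are finished in chain order: dw_sortnum gives a writer a higher number than the writer its .previous points at within the same job, and finishing in ascending order means .previous is always finished first (the requirement noted in Example #9). A simplified stand-in for that ordering (names are made up):

def finish_order(writers):
	# writers: {name: previous_name_or_None}; a writer's previous must be
	# finished before the writer itself, so shallower chain depth goes first.
	cache = {}
	def depth(name):
		if name not in cache:
			prev = writers[name]
			cache[name] = depth(prev) + 1 if prev in writers else 0
		return cache[name]
	return sorted(writers, key=depth)

print(finish_order({'c': 'b', 'b': 'a', 'a': None}))  # ['a', 'b', 'c']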
Example #22
def statmsg_sink(logfilename, sock):
    from extras import DotDict
    print('write log to "%s".' % (logfilename, ))
    with open(logfilename, 'w', encoding='utf-8') as fh:
        ix = 0
        while True:
            data = None
            try:
                data = sock.recv(1500)
                typ, pid, msg = data.decode('utf-8').split('\0', 2)
                pid = int(pid)
                with status_stacks_lock:
                    if typ == 'push':
                        msg, t, cookie = msg.split('\0', 3)
                        t = float(t)
                        status_all[pid].stack.append((msg, t, cookie))
                    elif typ == 'pop':
                        stack, ix = _find(pid, msg)
                        if ix == len(stack) - 1:
                            stack.pop()
                        else:
                            print(
                                'POP OF WRONG STATUS: %d:%s (index %s of %d)' %
                                (pid, msg, ix, len(stack)))
                    elif typ == 'update':
                        msg, _, cookie = msg.split('\0', 3)
                        stack, ix = _find(pid, cookie)
                        if ix is None:
                            print('UPDATE TO UNKNOWN STATUS %d:%s: %s' %
                                  (pid, cookie, msg))
                        else:
                            stack[ix] = (msg, stack[ix][1], cookie)
                    elif typ == 'start':
                        parent_pid, is_analysis, msg, t = msg.split('\0', 3)
                        parent_pid = int(parent_pid)
                        t = float(t)
                        d = DotDict(_default=None)
                        d.parent_pid = parent_pid
                        d.children = {}
                        d.stack = [(msg, t, None)]
                        d.summary = (
                            t,
                            msg,
                            t,
                        )
                        if parent_pid in status_all:
                            if is_analysis:
                                msg, parent_t, _ = status_all[
                                    parent_pid].stack[0]
                                d.summary = (
                                    parent_t,
                                    msg + ' analysis',
                                    t,
                                )
                            status_all[parent_pid].children[pid] = d
                        else:
                            status_tree[pid] = d
                        status_all[pid] = d
                        del d
                    elif typ == 'end':
                        d = status_all.get(pid)
                        if d:
                            if d.parent_pid in status_all:
                                p = status_all[d.parent_pid]
                                if pid in p.children:
                                    del p.children[pid]
                                del p
                            del d
                        if pid in status_tree:
                            del status_tree[pid]
                    elif typ == 'statmsg':
                        fh.write('%s %5d: %s\n' % (
                            strftime("%Y-%m-%d %H:%M:%S"),
                            ix,
                            msg,
                        ))
                        fh.flush()
                        ix += 1
                    else:
                        print('UNKNOWN MESSAGE: %r' % (data, ))
            except Exception:
                print('Failed to process %r:' % (data, ))
                print_exc()
Example #23

from zipfile import ZipFile
from shutil import copyfileobj
from os import unlink

from compat import uni

from . import a_csvimport
from extras import DotDict, resolve_jobid_filename
import subjobs
from dataset import Dataset

depend_extra = (a_csvimport, )

options = DotDict(a_csvimport.options)
options.inside_filenames = {}  # {"filename in zip": "dataset name"} or empty to import all files


def namefix(d, name):
    ok = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.'
    name = ''.join(c if c in ok else '_' for c in uni(name))
    while name in d:
        name += '_'
    return name


def prepare(params):
    def tmpfn():
        cnt = 0
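
namefix above keeps dataset names safe and unique: every character outside [0-9A-Za-z.] becomes "_", and a trailing "_" is appended until the name no longer collides with one already used. For example, run against the function above (sample names made up, and assuming uni() passes ASCII strings through unchanged):

seen = {}
print(namefix(seen, 'daily report (v2).csv'))  # daily_report__v2_.csv
seen['daily_report__v2_.csv'] = 'taken'
print(namefix(seen, 'daily report [v2].csv'))  # daily_report__v2_.csv_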
Example #24
def process_one(sliceno, options, source, prepare_res, data=None, save_discard=False):
	# Future improvement: Look at the old minmax to determine if we will get anything from reading this data
	dw, dw_spill, column_names, column_sizes, column_types, minmax_typeidx = prepare_res
	if data:
		assert data.version == 1
		data.seen_before = True
	else:
		data = empty_spilldata()
	d = Dataset(source, data.spill_ds)
	in_files = []
	out_files = []
	offsets = []
	if not save_discard:
		out_files += [ffi.NULL] * len(column_names) # don't save "too old" lines
	minmax_files = []
	minmax_d = {}
	for colname in column_names:
		out_fn = dw.column_filename(colname, sliceno).encode('ascii')
		in_fn = d.column_filename(colname, sliceno).encode('ascii')
		offset = d.columns[colname].offsets[sliceno] if d.columns[colname].offsets else 0
		in_files.append(ffi.new('char []', in_fn))
		out_files.append(ffi.new('char []', out_fn))
		offsets.append(offset)
		minmax_fn = out_fn + '_minmax'
		minmax_files.append(ffi.new('char []', minmax_fn))
		minmax_d[colname] = minmax_fn
	if save_discard:
		out_files += [ffi.NULL] * len(column_names) # don't save "good" lines (save discard instead)
	date_coltype = column_types[options.date_column]
	def date2cfmt(dt):
		if date_coltype == 'datetime':
			date0 = (dt.year << 14) | (dt.month << 10) | (dt.day << 5) | dt.hour
			date1 = (dt.minute << 26) | (dt.second << 20) | dt.microsecond
		elif date_coltype == 'date':
			date0 = (dt.year << 9) | (dt.month << 5) | dt.day
			date1 = 0
		elif date_coltype == 'time':
			date0 = 32277536 | dt.hour
			date1 = (dt.minute << 26) | (dt.second << 20) | dt.microsecond
		else:
			raise Exception('Bad date_coltype type: ' + date_coltype)
		return date0, date1
	dates = [0, 0, 0, 0, 0xffffffff, 0xffffffff]
	stats = DotDict()
	if data.seen_before:
		dates[0:2] = date2cfmt(data.get('process_date', datetime.min))
	if (data.last_time or options.hard_spill) and not save_discard:
		for colname in column_names:
			out_fn = dw_spill.column_filename(colname, sliceno).encode('ascii')
			out_files.append(ffi.new('char []', out_fn))
		stats.virtual_spill = False
	else:
		# We still have to make sure the files exist, or we end up
		# with a broken dataset if only some slices wanted to spill.
		for colname in column_names:
			open(dw_spill.column_filename(colname, sliceno), 'ab').close()
		out_files += [ffi.NULL] * len(column_names)
		stats.virtual_spill = True
	# We are done reading `data` - update it for next iteration
	del data.seen_before
	data.process_date = datetime.min
	if options.discard_before_date:
		if options.split_date:
			assert options.discard_before_date < options.split_date
		dates[2:3] = date2cfmt(options.discard_before_date)
		data.process_date = options.discard_before_date
	if options.split_date:
		dates[4:6] = date2cfmt(options.split_date)
		data.process_date = max(data.process_date, options.split_date)
	counters = ffi.new('uint64_t [4]') # one for each class-enum
	res = backend.filter(len(in_files), in_files, offsets, out_files, minmax_files, column_sizes, counters, dates, minmax_typeidx, d.lines[sliceno])
	assert not res, "cffi converter returned error on data from " + source
	stats.version = 0
	stats.counters = list(counters)
	stats.minmax = {}
	for colname, fn in minmax_d.iteritems():
		if exists(fn):
			with type2iter[column_types[colname]](fn) as it:
				stats.minmax[colname] = list(it)
			unlink(fn)
	# If there is at most 2% left, spill it next time.
	# Or if there is at most 10% left and we have read it at least 8 times.
	# Or if there is at most 20% left and we have read it at least 16 times.
	# A reasonable balance between re-reading and re-writing, one hopes.
	data.counter += 1
	total_lines = sum(counters)
	data.last_time = (counters[3] <= total_lines / 50 or
		(data.counter >= 8 and counters[3] <= total_lines / 10) or
		(data.counter >= 16 and counters[3] <= total_lines / 5)
	)
	# If no lines were spilled we will not need this dataset again,
	# nor if we wrote the spill in this dataset.
	if not counters[3] or not stats.virtual_spill:
		data = None
	return data, stats
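
date2cfmt packs the split/discard dates into the bit layout the C filter compares against; for a plain date column that is (year << 9) | (month << 5) | day. A worked check of that case (the sample date is arbitrary):

import datetime

def packed_date(dt):
	# 'date' layout from date2cfmt: 5 bits of day, 4 bits of month, year above.
	return (dt.year << 9) | (dt.month << 5) | dt.day

p = packed_date(datetime.date(2024, 3, 7))
print(p)                              # 1036391
print(p & 31, (p >> 5) & 15, p >> 9)  # 7 3 2024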