Example #1
 def _save(self):
     if not os.path.exists(self.name):
         os.mkdir(self.name)
     blob.save(self._data, self._name('pickle'), temp=False)
     with open(self._name('txt'), 'w', encoding='utf-8') as fh:
         nl = False
         if self.hashlabel:
             fh.write('hashlabel %s\n' % (self.hashlabel, ))
             nl = True
         if self.previous:
             fh.write('previous %s\n' % (self.previous, ))
             nl = True
         if nl:
             fh.write('\n')
         col_list = sorted((
             k,
             c.type,
             c.location,
         ) for k, c in self.columns.items())
         lens = tuple(
             max(minlen, max(len(t[i]) for t in col_list))
             for i, minlen in ((0, 4), (1, 4), (2, 8)))
         template = '%%%ds  %%%ds  %%-%ds\n' % lens
         fh.write(template % ('name', 'type', 'location'))
         fh.write(template % tuple('=' * l for l in lens))
         for t in col_list:
             fh.write(template % t)
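
The width calculation and the dynamically built %-template in _save are plain Python and can be tried on their own. A minimal standalone sketch, with a made-up columns dict standing in for self.columns (the names, types and locations are invented for illustration):

# Made-up stand-in for self.columns: name -> (type, location).
columns = {'ts': ('datetime', 'jid-0/ts'), 'user': ('ascii', 'jid-0/user')}

col_list = sorted((name, typ, loc) for name, (typ, loc) in columns.items())
# Each column is at least as wide as its header ('name', 'type', 'location').
lens = tuple(
    max(minlen, max(len(t[i]) for t in col_list))
    for i, minlen in ((0, 4), (1, 4), (2, 8)))
# '%%%ds' % 4 gives '%4s': the doubled %% survives, only the width is filled in.
template = '%%%ds  %%%ds  %%-%ds\n' % lens
print(template % ('name', 'type', 'location'), end='')
print(template % tuple('=' * l for l in lens), end='')
for t in col_list:
    print(template % t, end='')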
Example #2
def analysis(sliceno, prepare_res):
    stats = {}
    prev_spilldata = blob.load('spilldata',
                               jobid=datasets.source,
                               sliceno=sliceno)
    source_params = job_params(datasets.source)
    for source, data in prev_spilldata:
        _, stats[source] = a_dataset_datesplit.process_one(
            sliceno,
            source_params.options,
            source,
            prepare_res,
            data,
            save_discard=True)
    prev_params = job_params(source_params.datasets.previous,
                             default_empty=True)
    for source in Dataset(source_params.datasets.source).chain(
            stop_ds=prev_params.datasets.source):
        _, stats[source] = a_dataset_datesplit.process_one(
            sliceno,
            source_params.options,
            source,
            prepare_res,
            save_discard=True)
    blob.save(stats, 'stats', sliceno=sliceno, temp=False)
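
This analysis and the ones below follow the same per-slice persistence pattern: each slice saves its own pickle under a name plus its sliceno, and the matching slice of a later job loads it back. A minimal sketch of that round trip, using only the blob calls already shown in these examples; `blob`, `sliceno` and `previous_jobid` are assumed to be supplied by the framework, and the stats contents are invented:

# Hedged sketch: requires the framework's blob module.
stats = {'some_source': {'rows': 123}}   # whatever this slice computed
blob.save(stats, 'stats', sliceno=sliceno, temp=False)

# A later job's analysis, running in the same slice, reads it back:
prev_stats = blob.load('stats', jobid=previous_jobid, sliceno=sliceno)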
Example #3
def analysis(sliceno, params, prepare_res):
    spilldata = {}
    stats = {}
    we_have_spill = False
    if datasets.previous:
        prev_spilldata = blob.load('spilldata',
                                   jobid=datasets.previous,
                                   sliceno=sliceno)
        for source, data in prev_spilldata:
            spilldata[source], stats[source] = process_one(
                sliceno, options, source, prepare_res, data)
            we_have_spill |= not stats[source].virtual_spill
    if datasets.source:
        prev_params = job_params(datasets.previous, default_empty=True)
        for source in datasets.source.chain(
                stop_ds=prev_params.datasets.source):
            spilldata[source], stats[source] = process_one(
                sliceno, options, source, prepare_res)
            we_have_spill |= not stats[source].virtual_spill
    spilldata = [(k, v) for k, v in spilldata.iteritems() if v]
    if we_have_spill:
        spilldata.append((params.jobid, empty_spilldata('SPILL')))
    blob.save(spilldata, 'spilldata', sliceno=sliceno, temp=False)
    blob.save(stats, 'stats', sliceno=sliceno, temp=False)
    return we_have_spill
Example #4
def call_analysis(analysis_func, sliceno_, q, preserve_result, parent_pid, **kw):
	try:
		status._start('analysis(%d)' % (sliceno_,), parent_pid, 't')
		os.close(_prof_fd)
		for stupid_inconsistent_name in ('sliceno', 'index'):
			if stupid_inconsistent_name in kw:
				kw[stupid_inconsistent_name] = sliceno_
			setattr(g, stupid_inconsistent_name, sliceno_)
		for dw in dataset._datasetwriters.values():
			if dw._for_single_slice is None:
				dw._set_slice(sliceno_)
		res = analysis_func(**kw)
		if preserve_result:
			# Remove defaultdicts until we find one with a picklable default_factory.
			# (This is what you end up doing manually anyway.)
			def picklable(v):
				try:
					pickle.dumps(v, pickle.HIGHEST_PROTOCOL)
					return True
				except Exception:
					return False
			def fixup(d):
				if isinstance(d, defaultdict) and not picklable(d.default_factory):
					if not d:
						return {}
					_, v = next(iteritems(d))  # peek at one value to check the nesting
					if isinstance(v, defaultdict) and not picklable(v.default_factory):
						return {k: fixup(v) for k, v in iteritems(d)}
					else:
						return dict(d)
				else:
					return d
			def save(item, name):
				blob.save(fixup(item), name, sliceno=sliceno_, temp=True)
			if isinstance(res, tuple):
				if sliceno_ == 0:
					blob.save(len(res), "Analysis.tuple", temp=True)
				for ix, item in enumerate(res):
					save(item, "Analysis.%d." % (ix,))
			else:
				if sliceno_ == 0:
					blob.save(False, "Analysis.tuple", temp=True)
				save(res, "Analysis.")
		from extras import saved_files
		dw_lens = {}
		dw_minmax = {}
		for name, dw in dataset._datasetwriters.items():
			if dw._for_single_slice in (None, sliceno_,):
				dw.close()
				dw_lens[name] = dw._lens
				dw_minmax[name] = dw._minmax
		status._end()
		q.put((sliceno_, time(), saved_files, dw_lens, dw_minmax, None,))
	except:
		status._end()
		q.put((sliceno_, time(), {}, {}, {}, fmt_tb(1),))
		print_exc()
		sleep(5) # give launcher time to report error (and kill us)
		exitfunction()
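
The picklable/fixup pair above exists because pickle cannot serialize a defaultdict whose default_factory is a lambda or other non-importable callable; converting to a plain dict sidesteps that. A small standalone demonstration of the failure and the fix:

import pickle
from collections import defaultdict

d = defaultdict(lambda: set())   # lambda default_factory: not picklable
d['a'].add(1)

try:
    pickle.dumps(d, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('defaultdict with lambda factory fails:', e)

# dict(d) drops the default_factory, which is what fixup() falls back to.
fixed = pickle.loads(pickle.dumps(dict(d), pickle.HIGHEST_PROTOCOL))
print(fixed)   # {'a': {1}}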
Example #5
def one_slice(sliceno):
    first = True
    updater = globals()['upd_' + options.flavour]
    for pickle in options.pickles:
        tmp = load(pickle, sliceno=sliceno)
        if first:
            res = tmp
            first = False
        else:
            updater(res, tmp)
    save(res, options.resultname, sliceno=sliceno)
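
one_slice resolves its merge function by name ('upd_' + options.flavour) and folds it over the loaded pickles. The real flavours are not part of this listing, so the updater below is purely hypothetical, just to illustrate the calling convention:

# Hypothetical flavour: sum per-key counts in place (not from the real method).
def upd_sum(res, tmp):
    for k, v in tmp.items():
        res[k] = res.get(k, 0) + v

# one_slice() would pick it up via globals()['upd_sum'] when flavour is 'sum'.
acc = {'a': 1}
upd_sum(acc, {'a': 2, 'b': 3})
print(acc)   # {'a': 3, 'b': 3}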
Example #6
def synthesis(prepare_res, analysis_res, params):
    from math import sqrt

    separator, filename, orig_filename, labels, dw = prepare_res
    labels = [n for n in labels if n not in options.discard]

    if filename != orig_filename:
        os.unlink(filename)

    # aggregate typing and statistics
    res = {}
    res['num_broken_lines'] = 0
    res['num_lines'] = 0
    res['lines_per_slice'] = []
    for sliceno, tmp in enumerate(analysis_res):
        res['num_broken_lines'] += tmp['num_broken_lines']
        res['num_lines'] += tmp['num_lines']
        res['lines_per_slice'].append(tmp['num_lines'])
        dw.set_lines(sliceno, tmp['num_lines'])

    blob.save(res, 'import')

    # write report
    r = report.report()
    if not res['num_lines']:
        r.println('No lines read - empty file!')
        r.close()
        return
    r.println('Number of rows read\n')
    r.println('  slice                            lines')
    for sliceno, nlines in enumerate(res['lines_per_slice']):
        # res['num_lines'] is known to be non-zero here (the empty case returned above).
        r.println('    %2d                         %9d  (%6.2f%%)' %
                  (sliceno, nlines, 100 * nlines / res['num_lines']))
    r.println('  total                        %9d' % (res['num_lines'], ))
    stdev = sqrt(
        sum((x - res['num_lines'] / params.slices)**2
            for x in res['lines_per_slice']) / params.slices)
    r.println('\n  hash stdev                   %9d  (%6.2f%%)' %
              (stdev, round(100 * stdev / res['num_lines'])))
    r.line()

    r.println('Number of columns              %9d' % len(labels, ))
    r.close()

    if res['num_broken_lines'] and not options.allow_bad:
        raise Exception('%d bad lines without options.allow_bad' %
                        (res['num_broken_lines'], ))
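
The 'hash stdev' line above is the population standard deviation of the per-slice line counts, using num_lines / params.slices as the mean. A standalone check with made-up slice counts:

from math import sqrt

# Made-up per-slice line counts.
lines_per_slice = [1000, 980, 1020, 1000]
slices = len(lines_per_slice)
num_lines = sum(lines_per_slice)

mean = num_lines / slices
stdev = sqrt(sum((x - mean) ** 2 for x in lines_per_slice) / slices)
print('stdev %.1f lines (%.2f%% of total)' % (stdev, 100 * stdev / num_lines))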
Example #7
 def save_datastore(self):
     if self._key:
         return
     value = self._f.read()
     value = base64.b64encode(value)
     value = value + " " * (16 - len(value) % 16)
     value = self.aes.encrypt(value)
     self._key = blob.save(value)
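
The space padding in save_datastore rounds the base64 text up to the 16-byte AES block size; note that a length already on a block boundary gets one full extra block of spaces. The arithmetic on its own:

# Padded length for a few input lengths; 16 -> 32 shows the extra-block case.
for n in (10, 16, 20, 32):
    print(n, '->', n + (16 - n % 16))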
Example #8
def analysis(sliceno, prepare_res):
    key_filter, value_filter = prepare_res
    d = blob.load(jobid=jobids.previous,
                  sliceno=sliceno,
                  default=defaultdict(set))
    if options.key_filter:
        d = {k: v for k, v in d.iteritems() if k in key_filter}
    iterator = datasets.source.iterate_chain(
        sliceno,
        (
            options.key_column,
            options.value_column,
        ),
        stop_jobid={jobids.previous: 'source'},
    )
    # The loops below are split into four versions (key/value filter on or off) for shorter runtime
    if options.value_filter:
        # Remove anything that's not in the filter
        for k, v in d.items():
            v = v & value_filter
            if v:
                d[k] = v
            else:
                del d[k]
        # This lets us reuse the same str object for the same value (smaller pickles)
        value_filter = {v: v for v in value_filter}
        if options.key_filter:
            for k, v in iterator:
                if k in key_filter and v in value_filter:
                    d[k].add(value_filter[v])
        else:
            for k, v in iterator:
                if v in value_filter:
                    d[k].add(value_filter[v])
    else:
        reuse = {}
        if options.key_filter:
            for k, v in iterator:
                if k in key_filter:
                    d[k].add(reuse.setdefault(v, v))
        else:
            for k, v in iterator:
                d[k].add(reuse.setdefault(v, v))
    blob.save(d, sliceno=sliceno, temp=False)
    blob.save(set(d), 'keyset', sliceno=sliceno, temp=False)
    blob.save(Counter(len(v) for v in d.itervalues()),
              'setsizehist',
              sliceno=sliceno,
              temp=False)
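
Both the {v: v for v in value_filter} mapping and reuse.setdefault(v, v) implement the trick mentioned in the comment above: every set ends up holding one shared object per distinct value instead of many equal copies, which makes the per-slice pickles smaller. A standalone illustration:

reuse = {}
values = ['spam', ''.join(['sp', 'am'])]    # equal strings, distinct objects
print(values[0] is values[1])               # False: two copies in memory

shared = [reuse.setdefault(v, v) for v in values]
print(shared[0] is shared[1])               # True: one object kept and pickled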
Example #9
 def save(item, name):
     blob.save(fixup(item), name, sliceno=sliceno_, temp=True)
Example #10
def execute_process(workdir,
                    jobid,
                    slices,
                    result_directory,
                    common_directory,
                    source_directory,
                    index=None,
                    workspaces=None,
                    daemon_url=None,
                    subjob_cookie=None,
                    parent_pid=0):
    g.JOBID = jobid
    setproctitle('launch')
    path = os.path.join(workdir, jobid)
    try:
        os.chdir(path)
    except Exception:
        print("Cannot cd to workdir", path)
        exit(1)

    g.params = params = job_params()
    method_ref = import_module(params.package + '.a_' + params.method)
    g.sliceno = -1

    if workspaces:
        jobid_module.put_workspaces(workspaces)

    def maybe_dataset(v):
        if isinstance(v, list):
            return [maybe_dataset(e) for e in v]
        if not v:
            return ''
        try:
            return dataset.Dataset(v)
        except IOError:
            return v

    datasets = DotDict(
        {k: maybe_dataset(v)
         for k, v in params.datasets.items()})

    g.options = params.options
    g.datasets = datasets
    g.jobids = params.jobids

    method_ref.options = params.options
    method_ref.datasets = datasets
    method_ref.jobids = params.jobids

    # compatibility names
    g.SLICES = slices
    g.JOBID = jobid
    g.jobid = jobid
    g.METHOD = params.method
    g.WORKSPACEPATH = workdir
    g.CAPTION = params.caption
    g.PACKAGE = params.package
    g.RESULT_DIRECTORY = result_directory
    g.COMMON_DIRECTORY = common_directory
    g.SOURCE_DIRECTORY = source_directory
    g.index = -1

    g.daemon_url = daemon_url
    g.running = 'launch'
    status._start('%s %s' % (
        jobid,
        params.method,
    ), parent_pid)

    def dummy():
        pass

    prepare_func = getattr(method_ref, 'prepare', dummy)
    analysis_func = getattr(method_ref, 'analysis', dummy)
    synthesis_func = getattr(method_ref, 'synthesis', dummy)

    synthesis_needs_analysis = 'analysis_res' in getarglist(synthesis_func)

    # A chain must be finished from the back, so sort on that.
    sortnum_cache = {}

    def dw_sortnum(name):
        if name not in sortnum_cache:
            dw = dataset._datasetwriters[name]
            if dw.previous and dw.previous.startswith(jobid + '/'):
                pname = dw.previous.split('/')[1]
                num = dw_sortnum(pname) + 1
            else:
                num = 0
            sortnum_cache[name] = num
        return sortnum_cache[name]

    prof = {}
    if prepare_func is dummy:
        prof['prepare'] = 0  # truthish!
    else:
        t = time()
        g.running = 'prepare'
        g.subjob_cookie = subjob_cookie
        setproctitle(g.running)
        with status.status(g.running):
            g.prepare_res = method_ref.prepare(**args_for(method_ref.prepare))
            to_finish = [
                dw.name for dw in dataset._datasetwriters.values()
                if dw._started
            ]
            if to_finish:
                with status.status("Finishing datasets"):
                    for name in sorted(to_finish, key=dw_sortnum):
                        dataset._datasetwriters[name].finish()
        prof['prepare'] = time() - t
    setproctitle('launch')
    from extras import saved_files
    if analysis_func is dummy:
        prof['per_slice'] = []
        prof['analysis'] = 0
    else:
        t = time()
        g.running = 'analysis'
        g.subjob_cookie = None  # subjobs are not allowed from analysis
        with status.status(
                'Waiting for all slices to finish analysis') as update:
            g.update_top_status = update
            prof['per_slice'], files, g.analysis_res = fork_analysis(
                slices, analysis_func, args_for(analysis_func),
                synthesis_needs_analysis)
            del g.update_top_status
        prof['analysis'] = time() - t
        saved_files.update(files)
    t = time()
    g.running = 'synthesis'
    g.subjob_cookie = subjob_cookie
    setproctitle(g.running)
    with status.status(g.running):
        synthesis_res = synthesis_func(**args_for(synthesis_func))
        if synthesis_res is not None:
            blob.save(synthesis_res, temp=False)
        if dataset._datasetwriters:
            with status.status("Finishing datasets"):
                for name in sorted(dataset._datasetwriters, key=dw_sortnum):
                    dataset._datasetwriters[name].finish()
    t = time() - t
    prof['synthesis'] = t

    from subjobs import _record
    status._end()
    return None, (prof, saved_files, _record)
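
dw_sortnum gives each dataset writer its depth along the previous-chain inside the current job, so that sorted(..., key=dw_sortnum) finishes chains from the back. A standalone sketch of the same memoised sort key over a made-up previous mapping (the jobid and writer names are invented):

jobid = 'test-0'
# Made-up writers: name -> previous dataset ('jobid/name' or None).
previous = {'a': None, 'b': 'test-0/a', 'c': 'test-0/b', 'x': 'other-3/y'}

sortnum_cache = {}
def dw_sortnum(name):
    if name not in sortnum_cache:
        prev = previous[name]
        if prev and prev.startswith(jobid + '/'):
            sortnum_cache[name] = dw_sortnum(prev.split('/')[1]) + 1
        else:
            sortnum_cache[name] = 0
    return sortnum_cache[name]

print(sorted(previous, key=dw_sortnum))   # ['a', 'x', 'b', 'c']: chain roots first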
Example #11
def synthesis(params):
    setsizehist = Counter()
    for sliceno in range(params.slices):
        setsizehist.update(blob.load('setsizehist', sliceno=sliceno))
    blob.save(setsizehist, 'setsizehist')
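
This synthesis merges the per-slice 'setsizehist' counters (saved as in Example #8); Counter.update adds counts rather than replacing them. A standalone equivalent with made-up counters:

from collections import Counter

# Made-up per-slice counters: set size -> how many keys had that size.
per_slice = [Counter({1: 4, 2: 1}), Counter({1: 2, 3: 5})]
setsizehist = Counter()
for c in per_slice:
    setsizehist.update(c)      # adds counts, unlike dict.update
print(setsizehist)             # Counter({1: 6, 3: 5, 2: 1})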
Example #12
 def save_datastore(self):
     if self._key:
         return
     self.file.seek(0)
     self._key = blob.save(self.file.read())