Code Example #1
def analysis(sliceno, prepare_res):
    stats = {}
    prev_spilldata = blob.load('spilldata',
                               jobid=datasets.source,
                               sliceno=sliceno)
    source_params = job_params(datasets.source)
    for source, data in prev_spilldata:
        _, stats[source] = a_dataset_datesplit.process_one(
            sliceno,
            source_params.options,
            source,
            prepare_res,
            data,
            save_discard=True)
    source_params = job_params(datasets.source)
    prev_params = job_params(source_params.datasets.previous,
                             default_empty=True)
    for source in Dataset(source_params.datasets.source).chain(
            stop_ds=prev_params.datasets.source):
        _, stats[source] = a_dataset_datesplit.process_one(
            sliceno,
            source_params.options,
            source,
            prepare_res,
            save_discard=True)
    blob.save(stats, 'stats', sliceno=sliceno, temp=False)
Code Example #2
def prepare(params):
    d = datasets.source
    caption = options.caption % dict(caption=d.caption,
                                     hashlabel=options.hashlabel)
    prev_p = job_params(datasets.previous, default_empty=True)
    prev_source = prev_p.datasets.source
    if len(d.chain(stop_jobid=prev_source, length=options.length)) == 1:
        filename = d.filename
    else:
        filename = None
    dws = []
    previous = datasets.previous
    for sliceno in range(params.slices):
        if options.as_chain and sliceno == params.slices - 1:
            name = "default"
        else:
            name = str(sliceno)
        dw = DatasetWriter(
            caption="%s (slice %d)" % (caption, sliceno),
            hashlabel=options.hashlabel,
            filename=filename,
            previous=previous,
            name=name,
            for_single_slice=sliceno,
        )
        previous = (params.jobid, name)
        dws.append(dw)
    names = []
    for n, c in d.columns.items():
        # names has to be in the same order as the add calls
        # so the iterator returns the same order the writer expects.
        names.append(n)
        for dw in dws:
            dw.add(n, c.type)
    return dws, names, prev_source, caption, filename
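Several of these examples pass default_empty=True when the referenced job may not exist, e.g. for the first job in a chain that has no previous. A minimal sketch of that pattern, assuming (as the examples above and below suggest) that job_params() then returns empty params rather than raising:

def previous_source(previous_jobid):
    # Hypothetical helper, not from the project: look up which dataset the
    # previous job consumed, or nothing at all on the first job of a chain.
    prev_params = job_params(previous_jobid, default_empty=True)
    return prev_params.datasets.source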
Code Example #3
def analysis(sliceno, params, prepare_res):
    spilldata = {}
    stats = {}
    we_have_spill = False
    if datasets.previous:
        prev_spilldata = blob.load('spilldata',
                                   jobid=datasets.previous,
                                   sliceno=sliceno)
        for source, data in prev_spilldata:
            spilldata[source], stats[source] = process_one(
                sliceno, options, source, prepare_res, data)
            we_have_spill |= not stats[source].virtual_spill
    if datasets.source:
        prev_params = job_params(datasets.previous, default_empty=True)
        for source in datasets.source.chain(
                stop_ds=prev_params.datasets.source):
            spilldata[source], stats[source] = process_one(
                sliceno, options, source, prepare_res)
            we_have_spill |= not stats[source].virtual_spill
    spilldata = [(k, v) for k, v in spilldata.iteritems() if v]
    if we_have_spill:
        spilldata.append((params.jobid, empty_spilldata('SPILL')))
    blob.save(spilldata, 'spilldata', sliceno=sliceno, temp=False)
    blob.save(stats, 'stats', sliceno=sliceno, temp=False)
    return we_have_spill
Code Example #4
File: database.py Project: zmyer/accelerator
def _get_params(jobid):
	try:
		return jobid, job_params(jobid)
	except:
		from traceback import print_exc
		print_exc()
		raise
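Taken together, these examples show the shape of the object job_params() returns: a params/setup mapping whose options, datasets and jobids sections can be read both as attributes and with dict indexing. A short illustrative sketch, with the jobid and field names inferred from the examples on this page rather than from any library documentation:

setup = job_params(some_jobid)           # some_jobid: any existing jobid (made up here)
caption = setup.options.caption          # per-job options (cf. Code Example #7)
source = setup.datasets.get('source')    # datasets the job was built from (cf. #1-#3, #6)
previous = setup.jobids.get('previous')  # other jobids it referenced (cf. Code Example #12)
same = setup['options']['caption']       # dict-style indexing also works (cf. Code Example #5)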
Code Example #5
def _job_candidates_options(candidates):
    for jobid, remset in iteritems(candidates):
        setup = job_params(jobid)
        optdiff = defaultdict(dict)
        for thing in remset:
            section, name = thing.split('-', 1)
            optdiff[section][name] = setup[section][name]
        yield jobid, optdiff
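For context, a hypothetical invocation of the generator above: candidates maps each jobid to a set of 'section-name' keys, and each yielded optdiff regroups the corresponding values by section (the jobid and key names below are invented for illustration):

candidates = {'example-42': {'options-caption', 'datasets-source'}}
for jobid, optdiff in _job_candidates_options(candidates):
    print(jobid, dict(optdiff))
    # -> example-42 {'options': {'caption': ...}, 'datasets': {'source': ...}}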
Code Example #6
 def __new__(cls, jobid, name=None):
     if isinstance(jobid, (tuple, list)):
         jobid = _dsid(jobid)
     elif isinstance(jobid, dict):
         assert not name, "Don't pass both a separate name and jobid as {job: dataset}"
         assert len(jobid) == 1, "Only pass a single {job: dataset}"
         jobid, dsname = next(iteritems(jobid))
         if not jobid:
             return None
         jobid = job_params(jobid, default_empty=True).datasets.get(dsname)
         if not jobid:
             return None
     if '/' in jobid:
         assert not name, "Don't pass both a separate name and jobid as jid/name"
         jobid, name = jobid.split('/', 1)
     assert jobid, "If you really meant to use yourself as a dataset, pass params.jobid explicitly."
     name = uni(name or 'default')
     assert '/' not in name
     if name == 'default':
         suffix = ''
     else:
         suffix = '/' + name
     if jobid is _new_dataset_marker:
         from g import JOBID
         fullname = JOBID + suffix
     else:
         fullname = jobid + suffix
     obj = unicode.__new__(cls, fullname)
     obj.name = uni(name or 'default')
     if jobid is _new_dataset_marker:
         obj._data = DotDict({
             'version': (
                 2,
                 2,
             ),
             'filename': None,
             'hashlabel': None,
             'caption': '',
             'columns': {},
             'parent': None,
             'previous': None,
             'lines': [],
         })
         obj.jobid = None
     else:
         obj.jobid = jobid
         obj._data = DotDict(_ds_load(obj))
         assert obj._data.version[0] == 2 and obj._data.version[
             1] >= 2, "%s/%s: Unsupported dataset pickle version %r" % (
                 jobid,
                 name,
                 obj._data.version,
             )
         obj._data.columns = dict(obj._data.columns)
     return obj
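The assertions in __new__ spell out which argument forms are accepted. As a rough guide (job and dataset names below are invented), these calls map onto the branches above:

Dataset('example-0')             # the job's 'default' dataset
Dataset('example-0/other')       # a named dataset; stored as 'example-0/other'
Dataset('example-0', 'other')    # the same dataset, name passed separately
Dataset({'example-0': 'other'})  # whatever job example-0 received as its 'other'
                                 # dataset, via job_params(...).datasets.get('other')
Dataset(('example-0', 'other'))  # tuple/list form, resolved through _dsid()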
Code Example #7
def synthesis(params, prepare_res):
	source_params = job_params(datasets.source)
	source_params.options.caption = options.caption
	a_dataset_datesplit.real_synthesis(params, source_params.options, source_params.datasets, 0, prepare_res, False, save_discard=True)
	stats = json_load()
	json_save(dict(
		minmax              = stats.minmax_discarded,
		included_lines      = stats.discarded_lines,
		split_date          = stats.split_date,
		discard_before_date = stats.discard_before_date,
	))
Code Example #8
def csvexport(sliceno, filename, labelsonfirstline):
	assert len(options.separator) == 1
	assert options.quote_fields in ('', "'", '"',)
	d = datasets.source[0]
	if not options.labels:
		options.labels = sorted(d.columns)
	if options.chain_source:
		if jobids.previous:
			prev_source = job_params(jobids.previous).datasets.source
			assert len(datasets.source) == len(prev_source)
		else:
			prev_source = [None] * len(datasets.source)
		lst = []
		for src, stop in zip(datasets.source, prev_source):
			lst.extend(src.chain(stop_jobid=stop))
		datasets.source = lst
	if filename.lower().endswith('.gz'):
		mkwrite = GzWrite
	elif filename.lower().endswith('.csv'):
		def mkwrite(filename):
			return open(filename, "wb")
	else:
		raise Exception("Filename should end with .gz for compressed or .csv for uncompressed")
	iters = []
	first = True
	for label in options.labels:
		it = d.iterate_list(sliceno, label, datasets.source, status_reporting=first)
		first = False
		t = d.columns[label].type
		if t == 'unicode':
			it = imap(lambda s: s.encode('utf-8'), it)
		elif t in ('float32', 'float64', 'number'):
			it = imap(repr, it)
		elif t == 'json':
			it = imap(dumps, it)
		elif t not in ('ascii', 'bytes'):
			it = imap(str, it)
		iters.append(it)
	it = izip(*iters)
	with mkwrite(filename) as fh:
		q = options.quote_fields
		sep = options.separator
		if q:
			qq = q + q
			if labelsonfirstline:
				fh.write((sep.join(q + n.replace(q, qq) + q for n in options.labels) + '\n').encode('utf-8'))
			for data in it:
				fh.write(sep.join(q + n.replace(q, qq) + q for n in data) + '\n')
		else:
			if labelsonfirstline:
				fh.write((sep.join(options.labels) + '\n').encode('utf-8'))
			for data in it:
				fh.write(sep.join(data) + '\n')
Code Example #9
def execute_process(workdir,
                    jobid,
                    slices,
                    result_directory,
                    common_directory,
                    source_directory,
                    index=None,
                    workspaces=None,
                    daemon_url=None,
                    subjob_cookie=None,
                    parent_pid=0):
    g.JOBID = jobid
    setproctitle('launch')
    path = os.path.join(workdir, jobid)
    try:
        os.chdir(path)
    except Exception:
        print("Cannot cd to workdir", path)
        exit(1)

    g.params = params = job_params()
    method_ref = import_module(params.package + '.a_' + params.method)
    g.sliceno = -1

    if workspaces:
        jobid_module.put_workspaces(workspaces)

    def maybe_dataset(v):
        if isinstance(v, list):
            return [maybe_dataset(e) for e in v]
        if not v:
            return ''
        try:
            return dataset.Dataset(v)
        except IOError:
            return v

    datasets = DotDict(
        {k: maybe_dataset(v)
         for k, v in params.datasets.items()})

    g.options = params.options
    g.datasets = datasets
    g.jobids = params.jobids

    method_ref.options = params.options
    method_ref.datasets = datasets
    method_ref.jobids = params.jobids

    # compatibility names
    g.SLICES = slices
    g.JOBID = jobid
    g.jobid = jobid
    g.METHOD = params.method
    g.WORKSPACEPATH = workdir
    g.CAPTION = params.caption
    g.PACKAGE = params.package
    g.RESULT_DIRECTORY = result_directory
    g.COMMON_DIRECTORY = common_directory
    g.SOURCE_DIRECTORY = source_directory
    g.index = -1

    g.daemon_url = daemon_url
    g.running = 'launch'
    status._start('%s %s' % (
        jobid,
        params.method,
    ), parent_pid)

    def dummy():
        pass

    prepare_func = getattr(method_ref, 'prepare', dummy)
    analysis_func = getattr(method_ref, 'analysis', dummy)
    synthesis_func = getattr(method_ref, 'synthesis', dummy)

    synthesis_needs_analysis = 'analysis_res' in getarglist(synthesis_func)

    # A chain must be finished from the back, so sort on that.
    sortnum_cache = {}

    def dw_sortnum(name):
        if name not in sortnum_cache:
            dw = dataset._datasetwriters[name]
            if dw.previous and dw.previous.startswith(jobid + '/'):
                pname = dw.previous.split('/')[1]
                num = dw_sortnum(pname) + 1
            else:
                num = 0
            sortnum_cache[name] = num
        return sortnum_cache[name]

    prof = {}
    if prepare_func is dummy:
        prof['prepare'] = 0  # truthish!
    else:
        t = time()
        g.running = 'prepare'
        g.subjob_cookie = subjob_cookie
        setproctitle(g.running)
        with status.status(g.running):
            g.prepare_res = method_ref.prepare(**args_for(method_ref.prepare))
            to_finish = [
                dw.name for dw in dataset._datasetwriters.values()
                if dw._started
            ]
            if to_finish:
                with status.status("Finishing datasets"):
                    for name in sorted(to_finish, key=dw_sortnum):
                        dataset._datasetwriters[name].finish()
        prof['prepare'] = time() - t
    setproctitle('launch')
    from extras import saved_files
    if analysis_func is dummy:
        prof['per_slice'] = []
        prof['analysis'] = 0
    else:
        t = time()
        g.running = 'analysis'
        g.subjob_cookie = None  # subjobs are not allowed from analysis
        with status.status(
                'Waiting for all slices to finish analysis') as update:
            g.update_top_status = update
            prof['per_slice'], files, g.analysis_res = fork_analysis(
                slices, analysis_func, args_for(analysis_func),
                synthesis_needs_analysis)
            del g.update_top_status
        prof['analysis'] = time() - t
        saved_files.update(files)
    t = time()
    g.running = 'synthesis'
    g.subjob_cookie = subjob_cookie
    setproctitle(g.running)
    with status.status(g.running):
        synthesis_res = synthesis_func(**args_for(synthesis_func))
        if synthesis_res is not None:
            blob.save(synthesis_res, temp=False)
        if dataset._datasetwriters:
            with status.status("Finishing datasets"):
                for name in sorted(dataset._datasetwriters, key=dw_sortnum):
                    dataset._datasetwriters[name].finish()
    t = time() - t
    prof['synthesis'] = t

    from subjobs import _record
    status._end()
    return None, (prof, saved_files, _record)
Code Example #10
def prepare():
    source_params = job_params(datasets.source)
    return a_dataset_datesplit.real_prepare(datasets.source, datasets.previous,
                                            source_params.options)
Code Example #11
def main(urd):
    resetlocale()

    if False:
        # One BILLION rows in total
        # This takes about half an hour on a fast machine
        num_rows = int(1e7)
        num_datasets = 100
    else:
        # Ten MILLION rows in total
        num_rows = int(1e6)
        num_datasets = 10

    # Create datasets
    print("\x1b[1m(1) Create chain of datasets.\x1b[m")
    jid = None
    for _ in range(num_datasets):
        jid = urd.build('example_perf_gendata',
                        options=dict(num_rows=num_rows),
                        datasets=dict(previous=jid))

    # Export chain of datasets to CSV-file.
    print("\x1b[1m(2) Export dataset chain to CSV file.\x1b[m")
    jid = urd.build('csvexport',
                    datasets=dict(source=jid),
                    options=dict(filename='out.csv.gz', chain_source=True))

    filename = resolve_jobid_filename(jid, 'out.csv.gz')
    print('Exported file stored in \"%s\"' % (filename, ))

    # Import and type previously exported CSV-file.
    print("\x1b[1m(3) Import dataset from CVS file.\x1b[m")
    jid = urd.build('csvimport', options=dict(filename=filename))
    opts = dict(
        column2type={
            'a string': 'ascii',
            'large number': 'number',
            'small number': 'number',
            'small integer': 'int32_10',  # you must specify base for integers
            'gauss number': 'number',
            'gauss float': 'float64',
        }, )
    print("\x1b[1m(4) Type imported dataset.\x1b[m")
    jid = urd.build('dataset_type', datasets=dict(source=jid), options=opts)

    # Sum all values in a column.  Repeat for a set of columns with different types.
    print("\x1b[1m(5) Run some methods on the typed dataset.\x1b[m")
    jid_single = jid
    source = jid_single
    for colname in ('small number', 'small integer', 'large number',
                    'gauss number', 'gauss float'):
        print(colname)
        jid = urd.build('example_perf_sum',
                        datasets=dict(source=source),
                        options=dict(colname=colname),
                        name='sum ' + colname)
        jid = urd.build('example_perf_sum_positive',
                        datasets=dict(source=source),
                        options=dict(colname=colname),
                        name='sum positive ' + colname)

    # Compute histograms of a column
    print('histogram')
    jid = urd.build('example_perf_histogram',
                    datasets=dict(source=source),
                    options=dict(colname='gauss number'),
                    name='histogram_number')
    jid = urd.build('example_perf_histogram',
                    datasets=dict(source=source),
                    options=dict(colname='gauss float'),
                    name='histogram_float')

    # Find string
    print('find string')
    jid = urd.build('example_perf_find_string',
                    datasets=dict(source=source),
                    options=dict(colname='a string', text='ExAx'),
                    name='find_string')
    print(
        "Number of lines containing string \"%s\" is %d." %
        (job_params(jid).options['text'], blob.load(jobid=jid)), )

    # Print resulting profiling information
    from automata_common import profile_jobs
    print()

    def pl(text, time):
        print("%-30s %10.3f %14s" % (
            text,
            time,
            '{0:n}'.format(round(num_rows * num_datasets / time)),
        ))

    print()
    print('-' * 56)
    print("operation                       exec time         rows/s")
    print()
    pl('csvexport', profile_jobs(urd.joblist.find('csvexport')))
    print()
    pl(
        'reimport total',
        profile_jobs(
            urd.joblist.find('csvimport') + urd.joblist.find('dataset_type')))
    pl("   csvimport         ", profile_jobs(urd.joblist.find('csvimport')))
    pl("   type              ", profile_jobs(urd.joblist.find('dataset_type')))
    print()
    print("sum")
    pl("  small number       ",
       profile_jobs(urd.joblist.find('sum small number')))
    pl("  small integer      ",
       profile_jobs(urd.joblist.find('sum small integer')))
    pl("  large number       ",
       profile_jobs(urd.joblist.find('sum large number')))
    pl("  gauss number       ",
       profile_jobs(urd.joblist.find('sum gauss number')))
    pl("  gauss float        ",
       profile_jobs(urd.joblist.find('sum gauss float')))
    print()
    print("sum positive")
    pl("  small number       ",
       profile_jobs(urd.joblist.find('sum positive small number')))
    pl("  small integer      ",
       profile_jobs(urd.joblist.find('sum positive small integer')))
    pl("  large number       ",
       profile_jobs(urd.joblist.find('sum positive large number')))
    pl("  gauss number       ",
       profile_jobs(urd.joblist.find('sum positive gauss number')))
    pl("  gauss float        ",
       profile_jobs(urd.joblist.find('sum positive gauss float')))
    print()
    print("histogram")
    pl("  number             ",
       profile_jobs(urd.joblist.find('histogram_number')))
    pl("  float              ",
       profile_jobs(urd.joblist.find('histogram_float')))
    print()
    pl("find string          ", profile_jobs(urd.joblist.find('find_string')))
    print()
    print("Total test time                %10.3f" %
          (profile_jobs(urd.joblist), ))
    print()
    print('Example size is %s lines.' %
          ('{0:n}'.format(num_datasets * num_rows), ))
    print('Number of slices is %d.' % (urd.info.slices, ))
    print('-' * 56)
Code Example #12
 def x2opt(jobid, optname="previous"):
     params = job_params(jobid)
     return params.jobids.get(optname) or params.datasets.get(optname)
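A possible use of the helper above, purely illustrative and not taken from the project: following 'previous' references backwards through a chain of jobs.

def walk_chain(jobid):
    # Assumes each reference x2opt() returns is itself a jobid that
    # job_params() accepts; stops at the first job with no 'previous' set.
    while jobid:
        yield jobid
        jobid = x2opt(jobid, 'previous')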