Example #1
def column_filename(self, colname, sliceno=None):
    dc = self.columns[colname]
    jid, name = dc.location.split('/', 1)
    if dc.offsets:
        return resolve_jobid_filename(jid, name)
    else:
        if sliceno is None:
            sliceno = '%s'
        return resolve_jobid_filename(jid, name % (sliceno, ))
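A hedged usage sketch of the method above (ds and 'mycol' are hypothetical names, and the framework is assumed importable): for a column without offsets the stored name contains a '%s' per-slice placeholder, so an explicit sliceno is substituted in, while sliceno=None leaves the placeholder in the returned path for the caller to fill.

path = ds.column_filename('mycol', sliceno=0)  # per-slice path for slice 0
template = ds.column_filename('mycol')         # path still containing '%s'
path_for_slice_3 = template % (3,)             # caller fills in the slice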
Example #2
	def run_job(self, jobid, subjob_cookie=None, parent_pid=0):
		W = self.workspaces[get_workspace_name(jobid)]
		#
		active_workspaces = {}
		for name in self.source_workdirs:
			active_workspaces[name] = self.workspaces[name].get_path()
		slices = self.workspaces[self.target_workdir].get_slices()

		t0 = time.time()
		setup = update_setup(jobid, starttime=t0)
		prof = setup.profile or DotDict()
		new_prof, files, subjobs = dispatch.launch(W.path, setup, self.config, self.Methods, active_workspaces, slices, self.debug, self.daemon_url, subjob_cookie, parent_pid)
		if self.debug:
			delete_from = Temp.TEMP
		else:
			delete_from = Temp.DEBUG
		for filename, temp in list(files.items()):
			if temp >= delete_from:
				unlink(join(W.path, jobid, filename))
				del files[filename]
		prof.update(new_prof)
		prof.total = 0  # reset first so a previous total is not included in the sum
		prof.total = sum(v for v in prof.values() if isinstance(v, (float, int)))
		data = dict(
			starttime=t0,
			endtime=time.time(),
			profile=prof,
		)
		update_setup(jobid, **data)
		data['files'] = files
		data['subjobs'] = subjobs
		json_save(data, resolve_jobid_filename(jobid, 'post.json'))
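The cleanup loop above only works if Temp is an ordered enum where a higher value means "more temporary": a debug run deletes from Temp.TEMP (keeping DEBUG files), a normal run deletes from Temp.DEBUG. A self-contained sketch of that idea, with assumed member names and values (the real definition may differ):

from enum import IntEnum

class Temp(IntEnum):  # assumed ordering, not the real definition
	PERMANENT = 0
	DEBUG = 1
	TEMP = 2

files = {'result.pickle': Temp.PERMANENT, 'trace.log': Temp.DEBUG, 'scratch': Temp.TEMP}
for delete_from, expected in ((Temp.TEMP, ['scratch']), (Temp.DEBUG, ['trace.log', 'scratch'])):
	deleted = [name for name, temp in files.items() if temp >= delete_from]
	assert deleted == expected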
Example #3
def main(urd):
	urd.build("test_json")

	print()
	print("Testing dataset creation, export, import")
	source = urd.build("test_datasetwriter")
	urd.build("test_datasetwriter_verify", datasets=dict(source=source))
	ds = Dataset(source, "passed")
	csvname = "out.csv.gz"
	csv = urd.build("csvexport", options=dict(filename=csvname, separator="\t"), datasets=dict(source=ds))
	csv_quoted = urd.build("csvexport", options=dict(filename=csvname, quote_fields='"'), datasets=dict(source=ds))
	reimp_csv = urd.build("csvimport", options=dict(filename=resolve_jobid_filename(csv, csvname), separator="\t"))
	reimp_csv_quoted = urd.build("csvimport", options=dict(filename=resolve_jobid_filename(csv_quoted, csvname), quote_support=True))
	urd.build("test_compare_datasets", datasets=dict(a=reimp_csv, b=reimp_csv_quoted))
	urd.build("test_dataset_column_names")

	print()
	print("Testing csvimport with more difficult files")
	urd.build("test_csvimport_corner_cases")
	urd.build("test_csvimport_separators")

	print()
	print("Testing subjobs and dataset typing")
	urd.build("test_subjobs_type", datasets=dict(typed=ds, untyped=reimp_csv))
	urd.build("test_dataset_old_columns")

	print()
	print("Testing dataset chaining, filtering, callbacks and rechaining")
	selfchain = urd.build("test_selfchain")
	urd.build("test_rechain", jobids=dict(selfchain=selfchain))

	print()
	print("Testing dataset sorting (with subjobs again)")
	urd.build("test_sorting")
	urd.build("test_sort_stability")

	print()
	print("Test hashlabels")
	urd.build("test_hashlabel")
Example #4
def full_filename(filename, ext, sliceno=None, jobid=None):
	if not filename or not filename[0]:
		# Fallback to default in calling function
		return None
	if isinstance(filename, JobWithFile):
		if jobid:
			raise Exception("Don't specify a jobid when passing a JobWithFile as filename")
		if sliceno is None:
			assert not filename.sliced, "A sliced file requires a sliceno"
		else:
			assert filename.sliced, "An unsliced file can not have a sliceno"
		jobid, filename = filename[:2]
	if not filename.endswith(ext):
		filename += ext
	if sliceno is not None:
		filename = filename.replace(ext, '%02d' % (int(sliceno),)) + ext
	if jobid is not None:
		filename = resolve_jobid_filename(jobid, filename)
	return filename
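Tracing the code above gives this hedged usage sketch (file names invented): a missing extension is appended, and a sliceno splices a two-digit slice number in before the extension.

full_filename('data', '.csv')             # -> 'data.csv'
full_filename('data', '.csv', sliceno=3)  # -> 'data03.csv'
full_filename('', '.csv')                 # -> None, caller falls back to its default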
Example #5
def analysis(sliceno, params, prepare_res):
	dw, jobs, sort_idx = prepare_res
	single_job = (len(jobs) == 1)
	if options.sort_across_slices:
		columniter = partial(Dataset.iterate_list, None, jobids=jobs)
		per_slice = len(sort_idx) // params.slices
		if sliceno + 1 == params.slices:
			sort_idx = sort_idx[per_slice * sliceno:]
		else:
			sort_idx = sort_idx[per_slice * sliceno:per_slice * (sliceno + 1)]
	else:
		columniter = partial(Dataset.iterate_list, sliceno, jobids=jobs)
		sort_idx = sort(columniter)
	if single_job and not options.sort_across_slices and sort_idx == sorted(sort_idx):
		# this slice is fully sorted as is.
		slice_dir = '%02d' % (sliceno,)
		symlink(resolve_jobid_filename(datasets.source, slice_dir), slice_dir)
		return len(sort_idx)
	for column in datasets.source.columns:
		lst = list(columniter(column))
		w = dw.writers[column].write
		for idx in sort_idx:
			w(lst[idx])
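The sort_across_slices branch above gives each slice a contiguous run of the global sort order, with the last slice absorbing the remainder when the length is not evenly divisible. A self-contained sketch of just that partitioning:

sort_idx = list(range(10))  # stand-in for the global sort order
slices = 3
per_slice = len(sort_idx) // slices
parts = []
for sliceno in range(slices):
	if sliceno + 1 == slices:
		parts.append(sort_idx[per_slice * sliceno:])
	else:
		parts.append(sort_idx[per_slice * sliceno:per_slice * (sliceno + 1)])
assert parts == [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]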
Example #6
def main(urd):
    urd.build("test_report")
    urd.build("test_json")

    print()
    print("Testing dataset creation, export, import")
    source = urd.build("test_datasetwriter")
    urd.build("test_datasetwriter_verify", datasets=dict(source=source))
    ds = Dataset(source, "passed")
    csvname = "out.csv.gz"
    csvname_uncompressed = "out.csv"
    csv = urd.build("csvexport",
                    options=dict(filename=csvname, separator="\t"),
                    datasets=dict(source=ds))
    csv_uncompressed = urd.build("csvexport",
                                 options=dict(filename=csvname_uncompressed,
                                              separator="\t"),
                                 datasets=dict(source=ds))
    csv_quoted = urd.build("csvexport",
                           options=dict(filename=csvname, quote_fields='"'),
                           datasets=dict(source=ds))
    reimp_csv = urd.build("csvimport",
                          options=dict(filename=resolve_jobid_filename(
                              csv, csvname),
                                       separator="\t"))
    reimp_csv_uncompressed = urd.build(
        "csvimport",
        options=dict(filename=resolve_jobid_filename(csv_uncompressed,
                                                     csvname_uncompressed),
                     separator="\t"))
    reimp_csv_quoted = urd.build("csvimport",
                                 options=dict(filename=resolve_jobid_filename(
                                     csv_quoted, csvname),
                                              quotes=True))
    urd.build("test_compare_datasets",
              datasets=dict(a=reimp_csv, b=reimp_csv_uncompressed))
    urd.build("test_compare_datasets",
              datasets=dict(a=reimp_csv, b=reimp_csv_quoted))
    urd.build("test_dataset_column_names")

    print()
    print("Testing csvimport with more difficult files")
    urd.build("test_csvimport_corner_cases")
    urd.build("test_csvimport_separators")

    print()
    print("Testing subjobs and dataset typing")
    urd.build("test_subjobs_type", datasets=dict(typed=ds, untyped=reimp_csv))
    urd.build("test_dataset_old_columns")
    # This one is so you get a more useful error message if numeric_comma is broken.
    urd.build("dataset_type",
              datasets=dict(source=source),
              options=dict(numeric_comma=True,
                           column2type=dict(b="float64"),
                           defaults=dict(b="0")))
    urd.build("test_dataset_type_corner_cases")

    print()
    print("Testing dataset chaining, filtering, callbacks and rechaining")
    selfchain = urd.build("test_selfchain")
    urd.build("test_rechain", jobids=dict(selfchain=selfchain))

    print()
    print("Testing dataset sorting and rehashing (with subjobs again)")
    urd.build("test_sorting")
    urd.build("test_sort_stability")
    urd.build("test_rehash")

    print()
    print("Test hashlabels")
    urd.build("test_hashlabel")

    print()
    print("Test dataset_checksum")
    urd.build("test_dataset_checksum")

    print()
    print("Test csvimport_zip")
    urd.build("test_csvimport_zip")
Example #7
def main(urd):

    # Example 1.  Create a chain of datasets containing random data.
    jid_prev = None
    for n in range(5):
        jid_ds = urd.build(
            'example1_create_dataset',
            datasets=dict(previous=jid_prev),
            options=dict(approx_rows=100000, seed=n),
            name='Created_number_%s' % (n, ),
        )
        jid_prev = jid_ds

    # Example 2.  Export the last dataset in the chain to a tab
    #             separated textfile.
    jid_exp = urd.build(
        'csvexport',
        datasets=dict(source=jid_ds),
        options=dict(filename='random.tsv', separator='\t'),
    )
    filename = resolve_jobid_filename(jid_exp, 'random.tsv')
    print('Exported file stored in "%s"' % (filename, ))

    # Example 3.  Import the tab separated textfile and type it
    jid_imp = urd.build(
        'csvimport',
        options=dict(filename=filename, separator='\t',
                     labelsonfirstline=True),
    )
    jid_typ = urd.build(
        'dataset_type',
        datasets=dict(source=jid_imp),
        options=dict(column2type=dict(rflt='number', rint='number')),
    )

    # Example 4.  Run a method computing the average of a column, in a
    #             loop, one column at a time.  The column name is an
    #             input parameter.
    for column in Dataset(jid_typ).columns:
        jid_avg = urd.build(
            'example1_calc_average',
            datasets=dict(source=jid_typ),
            options=dict(column=column),
        )
        (s, n) = blob.load(jobid=jid_avg)
        print("Column %s:  sum=%f, length=%d, average=%f" %
              (column, s, n, s / n))

    # Example 5.  Create a new column that is the product of two
    #             existing columns.
    jid_add = urd.build(
        'example1_add_column',
        datasets=dict(source=jid_typ),
    )

    # Example 6.  Export a dataset with named columns in specified
    #             order.
    jid_add_exp = urd.build(
        'csvexport',
        datasets=dict(source=jid_add),
        options=dict(filename='prod.csv', labels=(
            'prod',
            'rflt',
            'rint',
        )),
    )

    print(urd.joblist.pretty)
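Common to all the examples above is that resolve_jobid_filename(jobid, filename) returns a path to a file inside that job's directory in its workdir; Example #2 joins W.path, jobid and filename the same way when deleting temp files. A minimal conceptual sketch, with an assumed directory layout rather than the real configuration lookup:

import os.path

def resolve_jobid_filename_sketch(jobid, filename, workdir_root='/workdirs/dev'):
	# Hypothetical stand-in: the real function finds the job's workdir
	# via the daemon configuration instead of taking it as a parameter.
	return os.path.join(workdir_root, jobid, filename)

print(resolve_jobid_filename_sketch('dev-42', 'random.tsv'))
# -> /workdirs/dev/dev-42/random.tsv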