def check_one(params, line_sep, sep, data, want_res=None, prefix="", quotes=False, leave_bad=False):
    sep_c = chr(sep)
    # Can't have separator character in unquoted values
    if not quotes and not leave_bad:
        data = [[el.replace(sep_c, "") for el in line] for line in data]
    if not want_res:
        want_res = [tuple(s.encode("ascii") for s in line) for line in data[1:]]
    filename = "%s_csv.%d.%r.txt" % (prefix, sep, line_sep)
    with open(filename, "w") as fh:
        for line in data:
            if quotes:
                line = [quotes + el.replace(quotes, quotes + quotes) + quotes for el in line]
            fh.write(sep_c.join(line))
            fh.write(line_sep)
    try:
        jid = subjobs.build("csvimport", options=dict(
            filename=resolve_jobid_filename(params.jobid, filename),
            separator=sep_c,
            quote_support=bool(quotes),
        ))
    except JobError as e:
        raise CSVImportException("Failed to csvimport for separator %d with line separator %r, csvimport error was:\n%s" % (sep, line_sep, e.format_msg()))
    ds = Dataset(jid)
    labels = sorted(ds.columns)
    if labels != data[0]:
        raise WrongLabelsException("csvimport gave wrong labels for separator %d with line separator %r: %r (expected %r)" % (sep, line_sep, labels, data[0]))
    res = list(ds.iterate(None, data[0]))
    if res != want_res:
        raise WrongDataException("csvimport gave wrong data for separator %d with line separator %r: %r (expected %r)" % (sep, line_sep, res, want_res))
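# A minimal usage sketch (assumption: run inside a method where "params" is
# available; the values are made up). The first row of "data" carries the
# labels and the remaining rows the values; 44 is ord(","), so this exercises
# comma-separated import with "\n" line endings.
example_data = [
    ["a", "b", "c"],
    ["1", "2", "3"],
    ["foo", "bar", "baz"],
]
check_one(params, "\n", 44, example_data)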
def verify(zipname, inside_filenames, want_ds, **kw):
    opts = dict(
        filename=resolve_jobid_filename(g.jobid, zipname),
        inside_filenames=inside_filenames,
    )
    opts.update(kw)
    jid = subjobs.build('csvimport_zip', options=opts)
    for dsn, want_data in want_ds.items():
        got_data = list(Dataset(jid, dsn).iterate(None, '0'))
        assert got_data == want_data, "%s/%s from %s didn't contain %r, instead contained %r" % (jid, dsn, zipname, want_data, got_data)
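# A hedged usage sketch (zip name, file names and values are made up;
# assumption: "inside_filenames" maps a file name inside the zip to the
# dataset name it is imported as). "want_ds" maps each dataset name to the
# values expected back from column '0'.
verify(
    "two_files.zip",
    {"a.csv": "a", "b.csv": "b"},
    {"a": [b"1", b"2"], "b": [b"3", b"4"]},
)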
def _printlist(self, returndict):
    # print (return list) in neat format
    for method, item in sorted(returndict.items(), key=lambda x: x[1].link):
        if item.make == True:
            make_msg = 'MAKE'
        else:
            make_msg = item.make or 'link'
        print(' - %44s' % method.ljust(44), end=' ')
        print(' %s' % (make_msg,), end=' ')
        if self.print_full_jobpath:
            print(' %s' % resolve_jobid_filename(item.link, ''), end=' ')
        else:
            print(' %s' % item.link, end=' ')
        if item.make != True:
            print(' %s' % fmttime(item.total_time), end=' ')
        print()
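# Hypothetical shape of what _printlist consumes, inferred from the
# attributes used above (an assumption, not the project's actual type):
# returndict maps method name -> item with .make, .link and .total_time.
from collections import namedtuple

_Item = namedtuple('_Item', 'make link total_time')
_example = {
    'csvimport': _Item(make=True, link='TEST-42', total_time=None),
    'dataset_type': _Item(make=False, link='TEST-17', total_time=3.14),
}
# printer._printlist(_example) would list dataset_type first (sorted on
# .link), mark csvimport as 'MAKE', and show the cached job's runtime via
# fmttime().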
def check_no_separator(params):
    # write() appends one value (plus line separator) to the currently open
    # file and records it in wrote_c; when a quote character is in use it
    # also writes a quoted line whose parsed value is the quote-wrapped
    # data, and records that value as well.
    def write(data):
        fh.write(data + nl_b)
        wrote_c[data] += 1
        if q_b:
            data = q_b + data + q_b
            fh.write(q_b + data.replace(q_b, q_b + q_b) + q_b + nl_b)
            wrote_c[data] += 1
    for nl in (10, 0, 255):
        for q in (None, 0, 34, 13, 10, 228):
            if nl == q:
                continue
            filename = "no separator.%r.%r.txt" % (nl, q)
            nl_b = bytechr(nl)
            q_b = bytechr(q) if q else b''
            wrote_c = Counter()
            with openx(filename) as fh:
                for splitpoint in range(256):
                    write(byteline(0, splitpoint, nl, q))
                    write(byteline(splitpoint, 256, nl, q))
            try:
                jid = subjobs.build("csvimport", options=dict(
                    filename=resolve_jobid_filename(params.jobid, filename),
                    quotes=q_b.decode("iso-8859-1"),
                    newline=nl_b.decode("iso-8859-1"),
                    separator='',
                    labelsonfirstline=False,
                    labels=["data"],
                ))
            except JobError:
                raise Exception("Importing %r failed" % (filename,))
            got_c = Counter(Dataset(jid).iterate(None, "data"))
            assert got_c == wrote_c, "Importing %r (%s) gave wrong contents" % (filename, jid)
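# The helpers used above are not shown in this excerpt. Plausible sketches
# (assumptions, not the project's actual definitions): bytechr() turns an
# int into a single byte, openx() opens a new file for binary writing, and
# byteline() builds a line of all byte values in [lo, hi) that could not be
# misread as line breaks or quotes.
def bytechr(i):
    return bytes(bytearray([i]))

def openx(filename):
    return open(filename, "xb")

def byteline(lo, hi, nl, q):
    skip = {nl, q, 13}  # line separator, quote char and \r
    return b"".join(bytechr(i) for i in range(lo, hi) if i not in skip)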
def check_array(params, lines, filename, bad_lines=(), **options):
    d = {}
    with openx(filename) as fh:
        for ix, data in enumerate(bad_lines, 1):
            ix = str(ix).encode("ascii")
            fh.write(ix + b"," + data + b"," + data + b"\n")
        for ix, data in enumerate(lines, len(bad_lines) + 1):
            if isinstance(data, tuple):
                data, d[ix] = data
            else:
                d[ix] = data
            ix = str(ix).encode("ascii")
            fh.write(ix + b"," + data + b"," + data + b"\n")
    options.update(
        filename=resolve_jobid_filename(params.jobid, filename),
        allow_bad=bool(bad_lines),
        labelsonfirstline=False,
        labels=["ix", "0", "1"],
    )
    verify_ds(options, d, filename)
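# A usage sketch (file name and values made up): plain entries are written
# and expected back verbatim; a (written, expected) tuple is used where
# parsing should transform the value, e.g. when quoting is enabled.
check_array(params, [b"a", b"b" * 500, (b'"q"', b"q")], "arrays.txt", quotes='"')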
def tmpfn():
    cnt = 0
    while True:
        cnt += 1
        yield resolve_jobid_filename(params.jobid, str(cnt))
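# Usage: tmpfn() yields an endless supply of unique paths inside the current
# job's directory, handy when a test needs many scratch files.
fn = tmpfn()
first = next(fn)   # <job directory>/1
second = next(fn)  # <job directory>/2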
def check_good_file(params, name, data, d, d_bad={}, d_skipped={}, **options):
    filename = name + ".txt"
    with openx(filename) as fh:
        fh.write(data)
    options.update(filename=resolve_jobid_filename(params.jobid, filename))
    verify_ds(options, d, d_bad, d_skipped, filename)
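# A hedged call sketch (contents made up; assumes verify_ds keys expected
# rows by 1-based data line number, the same way check_array builds its "d"):
check_good_file(params, "simple", b"a,b\n1,2\n3,4\n", {1: (b"1", b"2"), 2: (b"3", b"4")})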
def check_bad_file(params, name, data):
    filename = name + ".txt"
    with openx(filename) as fh:
        fh.write(data)
    options = dict(filename=resolve_jobid_filename(params.jobid, filename))
    require_failure(name, options)
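# A hedged call sketch (contents made up): with labels on the first line and
# allow_bad off, a data line with the wrong field count should make the
# csvimport subjob fail, which require_failure presumably asserts.
check_bad_file(params, "wrong field count", b"a,b\n1,2,3\n")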
def main(urd):
    resetlocale()

    if False:
        # One BILLION rows
        # This takes about half an hour on a fast machine
        num_rows = int(1e7)
        num_datasets = 100
    else:
        # Ten MILLION rows
        num_rows = int(1e6)
        num_datasets = 10

    # Create datasets
    print("\x1b[1m(1) Create chain of datasets.\x1b[m")
    jid = None
    for _ in range(num_datasets):
        jid = urd.build('example_perf_gendata', options=dict(num_rows=num_rows), datasets=dict(previous=jid))

    # Export chain of datasets to CSV-file.
    print("\x1b[1m(2) Export dataset chain to CSV file.\x1b[m")
    jid = urd.build('csvexport', datasets=dict(source=jid), options=dict(filename='out.csv.gz', chain_source=True))
    filename = resolve_jobid_filename(jid, 'out.csv.gz')
    print('Exported file stored in "%s"' % (filename,))

    # Import and type previously exported CSV-file.
    print("\x1b[1m(3) Import dataset from CSV file.\x1b[m")
    jid = urd.build('csvimport', options=dict(filename=filename))
    opts = dict(
        column2type={
            'a string': 'ascii',
            'large number': 'number',
            'small number': 'number',
            'small integer': 'int32_10',  # you must specify base for integers
            'gauss number': 'number',
            'gauss float': 'float64',
        },
    )
    print("\x1b[1m(4) Type imported dataset.\x1b[m")
    jid = urd.build('dataset_type', datasets=dict(source=jid), options=opts)

    # Sum all values in a column. Repeat for a set of columns with different types.
    print("\x1b[1m(5) Run some methods on the typed dataset.\x1b[m")
    jid_single = jid
    source = jid_single
    for colname in ('small number', 'small integer', 'large number', 'gauss number', 'gauss float'):
        print(colname)
        jid = urd.build('example_perf_sum', datasets=dict(source=source), options=dict(colname=colname), name='sum ' + colname)
        jid = urd.build('example_perf_sum_positive', datasets=dict(source=source), options=dict(colname=colname), name='sum positive ' + colname)

    # Compute histograms of a column
    print('histogram')
    jid = urd.build('example_perf_histogram', datasets=dict(source=source), options=dict(colname='gauss number'), name='histogram_number')
    jid = urd.build('example_perf_histogram', datasets=dict(source=source), options=dict(colname='gauss float'), name='histogram_float')

    # Find string
    print('find string')
    jid = urd.build('example_perf_find_string', datasets=dict(source=source), options=dict(colname='a string', text='ExAx'), name='find_string')
    print("Number of lines containing string \"%s\" is %d." % (job_params(jid).options['text'], blob.load(jobid=jid)))
    # Print resulting profiling information
    from automata_common import profile_jobs
    print()
    def pl(text, time):
        print("%-30s %10.3f %14s" % (text, time, '{0:n}'.format(round(num_rows * num_datasets / time))))
    print()
    print('-' * 56)
    print("operation                       exec time         rows/s")
    print()
    pl('csvexport', profile_jobs(urd.joblist.find('csvexport')))
    print()
    pl('reimport total', profile_jobs(urd.joblist.find('csvimport') + urd.joblist.find('dataset_type')))
    pl("   csvimport", profile_jobs(urd.joblist.find('csvimport')))
    pl("   type", profile_jobs(urd.joblist.find('dataset_type')))
    print()
    print("sum")
    pl("   small number", profile_jobs(urd.joblist.find('sum small number')))
    pl("   small integer", profile_jobs(urd.joblist.find('sum small integer')))
    pl("   large number", profile_jobs(urd.joblist.find('sum large number')))
    pl("   gauss number", profile_jobs(urd.joblist.find('sum gauss number')))
    pl("   gauss float", profile_jobs(urd.joblist.find('sum gauss float')))
    print()
    print("sum positive")
    pl("   small number", profile_jobs(urd.joblist.find('sum positive small number')))
    pl("   small integer", profile_jobs(urd.joblist.find('sum positive small integer')))
    pl("   large number", profile_jobs(urd.joblist.find('sum positive large number')))
    pl("   gauss number", profile_jobs(urd.joblist.find('sum positive gauss number')))
    pl("   gauss float", profile_jobs(urd.joblist.find('sum positive gauss float')))
    print()
    print("histogram")
    pl("   number", profile_jobs(urd.joblist.find('histogram_number')))
    pl("   float", profile_jobs(urd.joblist.find('histogram_float')))
    print()
    pl("find string", profile_jobs(urd.joblist.find('find_string')))
    print()
    print("Total test time                %10.3f" % (profile_jobs(urd.joblist),))
    print()
    print('Example size is %s lines.' % ('{0:n}'.format(num_datasets * num_rows),))
    print('Number of slices is %d.' % (urd.info.slices,))
    print('-' * 56)
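# For reference, a plausible shape for one of the methods built above, e.g.
# example_perf_sum, following this framework's method conventions. This is a
# sketch under assumptions, not the shipped source, and it only runs inside
# the framework (which injects "datasets" and "options" and drives
# analysis/synthesis).

datasets = ('source',)
options = dict(colname='')

def analysis(sliceno):
    # each slice sums its share of the column in parallel
    return sum(datasets.source.iterate(sliceno, options.colname))

def synthesis(analysis_res):
    # combine the per-slice partial sums
    return sum(analysis_res)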