def link_to_here(self, name='default', column_filter=None):
    """Use this to expose a subjob as a dataset in your job:
    Dataset(subjid).link_to_here() will allow access to the subjob
    dataset under your jid.
    Specify column_filter as an iterable of columns to include
    if you don't want all of them."""
    if column_filter:
        column_filter = set(column_filter)
        filtered_columns = {k: v for k, v in self._data.columns.items() if k in column_filter}
        left_over = column_filter - set(filtered_columns)
        assert not left_over, "Columns in filter not available in dataset: %r" % (left_over,)
        assert filtered_columns, "Filter produced no desired columns."
        self._data.columns = filtered_columns
    from g import JOBID
    self._data.parent = '%s/%s' % (self.jobid, self.name,)
    self.jobid = uni(JOBID)
    self.name = uni(name)
    self._save()
def append(self, columns, filenames, lines, minmax={}, filename=None, hashlabel=None, hashlabel_override=False, caption=None, previous=None, name='default'):
    if hashlabel:
        hashlabel = uni(hashlabel)
        if not hashlabel_override:
            assert self.hashlabel == hashlabel, 'Hashlabel mismatch %s != %s' % (self.hashlabel, hashlabel,)
    assert self._linefixup(lines) == self.lines, "New columns don't have the same number of lines as parent columns"
    columns = {uni(k): uni(v) for k, v in columns.items()}
    self._append(columns, filenames, minmax, filename, caption, previous, name)
def __new__(cls, jobid, name=None):
    if isinstance(jobid, (tuple, list)):
        jobid = _dsid(jobid)
    elif isinstance(jobid, dict):
        assert not name, "Don't pass both a separate name and jobid as {job: dataset}"
        assert len(jobid) == 1, "Only pass a single {job: dataset}"
        jobid, dsname = next(iteritems(jobid))
        if not jobid:
            return None
        jobid = job_params(jobid, default_empty=True).datasets.get(dsname)
        if not jobid:
            return None
    if '/' in jobid:
        assert not name, "Don't pass both a separate name and jobid as jid/name"
        jobid, name = jobid.split('/', 1)
    assert jobid, "If you really meant to use yourself as a dataset, pass params.jobid explicitly."
    name = uni(name or 'default')
    assert '/' not in name
    if name == 'default':
        suffix = ''
    else:
        suffix = '/' + name
    if jobid is _new_dataset_marker:
        from g import JOBID
        fullname = JOBID + suffix
    else:
        fullname = jobid + suffix
    obj = unicode.__new__(cls, fullname)
    obj.name = uni(name or 'default')
    if jobid is _new_dataset_marker:
        obj._data = DotDict({
            'version': (2, 2,),
            'filename': None,
            'hashlabel': None,
            'caption': '',
            'columns': {},
            'parent': None,
            'previous': None,
            'lines': [],
        })
        obj.jobid = None
    else:
        obj.jobid = jobid
        obj._data = DotDict(_ds_load(obj))
        assert obj._data.version[0] == 2 and obj._data.version[1] >= 2, "%s/%s: Unsupported dataset pickle version %r" % (jobid, name, obj._data.version,)
        obj._data.columns = dict(obj._data.columns)
    return obj
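# Illustrative sketch, not part of the original source: the jobid forms
# accepted by Dataset(), based on __new__ above. 'jid-0' is a placeholder
# job id and 'source' a placeholder datasets key.
#
#   Dataset('jid-0')               # the 'default' dataset of that job
#   Dataset('jid-0/other')         # a named dataset, given as a single "jid/name" string
#   Dataset('jid-0', 'other')      # the same, with the name passed separately
#   Dataset(('jid-0', 'other'))    # or as a (jobid, name) tuple/list
#   Dataset({'jid-0': 'source'})   # resolved through that job's datasets.source parameter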
def _dsid(t):
    if not t:
        return None
    if isinstance(t, (tuple, list)):
        jid, name = t
        if not jid:
            return None
        t = '%s/%s' % (jid.split('/')[0], uni(name) or 'default')
    if '/' not in t:
        t += '/default'
    return uni(t)
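# Illustrative sketch, not part of the original source: how _dsid normalizes
# the accepted dataset-id forms ('jid-0' is a placeholder job id).
#
#   _dsid(None)                  # -> None
#   _dsid('jid-0')               # -> 'jid-0/default'
#   _dsid('jid-0/other')         # -> 'jid-0/other'
#   _dsid(('jid-0', 'other'))    # -> 'jid-0/other'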
def new(columns, filenames, lines, minmax={}, filename=None, hashlabel=None, caption=None, previous=None, name='default'):
    """columns = {"colname": "type"}, lines = [n, ...] or {sliceno: n}"""
    columns = {uni(k): uni(v) for k, v in columns.items()}
    if hashlabel:
        hashlabel = uni(hashlabel)
        assert hashlabel in columns, hashlabel
    res = Dataset(_new_dataset_marker, name)
    res._data.lines = list(Dataset._linefixup(lines))
    res._data.hashlabel = hashlabel
    res._append(columns, filenames, minmax, filename, caption, previous, name)
    return res
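# Hedged usage sketch, not part of the original source: Dataset.new() is the
# low-level constructor; all names below are placeholders, and a 3-slice
# setup is assumed. It registers one "number" column whose per-slice files
# are named "x", with 3 lines in slice 0 and none in the others.
#
#   ds = new(
#       columns={"x": "number"},
#       filenames={"x": "x"},
#       lines={0: 3, 1: 0, 2: 0},
#   )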
def __new__(cls, columns={}, filename=None, hashlabel=None, hashlabel_override=False, caption=None, previous=None, name='default', parent=None, meta_only=False, for_single_slice=None):
    """columns can be {'name': 'type'} or {'name': DatasetColumn}
    to simplify basing your dataset on another."""
    name = uni(name)
    assert '/' not in name, name
    from g import running
    if running == 'analysis':
        assert name in _datasetwriters, 'Dataset with name "%s" not created' % (name,)
        assert not columns and not filename and not hashlabel and not caption and not parent and for_single_slice is None, "Don't specify any arguments (except optionally name) in analysis"
        return _datasetwriters[name]
    else:
        assert name not in _datasetwriters, 'Duplicate dataset name "%s"' % (name,)
        os.mkdir(name)
        obj = object.__new__(cls)
        obj._running = running
        obj.filename = uni(filename)
        obj.hashlabel = uni(hashlabel)
        obj.hashlabel_override = hashlabel_override
        obj.caption = uni(caption)
        obj.previous = _dsid(previous)
        obj.name = uni(name)
        obj.parent = _dsid(parent)
        obj.columns = {}
        obj.meta_only = meta_only
        obj._for_single_slice = for_single_slice
        obj._clean_names = {}
        if parent:
            obj._pcolumns = Dataset(parent).columns
            obj._seen_n = set(c.name for c in obj._pcolumns.values())
        else:
            obj._pcolumns = {}
            obj._seen_n = set()
        obj._started = False
        obj._lens = {}
        obj._minmax = {}
        obj._order = []
        for k, v in sorted(columns.items()):
            if isinstance(v, tuple):
                v = v.type
            obj.add(k, v)
        _datasetwriters[name] = obj
        return obj
def add(self, colname, coltype, default=_nodefault):
    from g import running
    assert running == self._running, "Add all columns in the same step as creation"
    assert not self._started, "Add all columns before setting slice"
    colname = uni(colname)
    coltype = uni(coltype)
    assert colname not in self.columns, colname
    assert colname
    typed_writer(coltype)  # gives error for unknown types
    self.columns[colname] = (coltype, default)
    self._order.append(colname)
    if colname in self._pcolumns:
        self._clean_names[colname] = self._pcolumns[colname].name
    else:
        self._clean_names[colname] = _clean_name(colname, self._seen_n)
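# Hedged usage sketch, not part of the original source: the intended
# DatasetWriter call pattern inside a method, inferred from __new__/add above
# and the set_slice/write_dict/finish calls in the test code further down.
# Column names and types are placeholders.
#
#   def synthesis(params):
#       dw = DatasetWriter(name="example", columns={"ix": "number"})
#       dw.add("comment", "unicode")   # more columns may be added before set_slice
#       for sliceno in range(params.slices):
#           dw.set_slice(sliceno)
#           dw.write_dict({"ix": sliceno, "comment": "hello"})
#       dw.finish()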
def _append(self, columns, filenames, minmax, filename, caption, previous, name):
    from sourcedata import type2iter
    from g import JOBID
    jobid = uni(JOBID)
    name = uni(name)
    filenames = {uni(k): uni(v) for k, v in filenames.items()}
    assert set(columns) == set(filenames), "columns and filenames don't have the same keys"
    if self.jobid and (self.jobid != jobid or self.name != name):
        self._data.parent = '%s/%s' % (self.jobid, self.name,)
    self.jobid = jobid
    self.name = name
    self._data.filename = uni(filename) or self._data.filename or None
    self._data.caption = uni(caption) or self._data.caption or jobid
    self._data.previous = _dsid(previous)
    for n in ('cache', 'cache_distance'):
        if n in self._data:
            del self._data[n]
    minmax = self._minmax_merge(minmax)
    for n, t in sorted(columns.items()):
        if t not in type2iter:
            raise Exception('Unknown type %s on column %s' % (t, n,))
        mm = minmax.get(n, (None, None,))
        self._data.columns[n] = DatasetColumn(
            type=uni(t),
            name=filenames[n],
            location='%s/%s/%%s.%s' % (jobid, self.name, filenames[n]),
            min=mm[0],
            max=mm[1],
            offsets=None,
        )
        self._maybe_merge(n)
    self._update_caches()
    self._save()
def typefix(e):
    if isinstance(e, dict):
        return dict_type((typefix(k), typefix(v)) for k, v in iteritems(e))
    elif isinstance(e, (list, tuple, set,)):
        return [typefix(v) for v in e]
    elif PY2 and isinstance(e, bytes):
        return uni(e)
    else:
        return e
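# Illustrative sketch, not part of the original source: typefix() rebuilds
# dicts through dict_type, flattens tuples/sets into plain lists, and on
# Python 2 decodes bytes to unicode.
#
#   typefix((1, (2, 3)))   # -> [1, [2, 3]]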
def _options(self, optionsdict, title='Options'):
    if not optionsdict:
        return
    self.println(title)
    maxlen = max(len(k) for k in optionsdict)
    for k, v in sorted(optionsdict.items()):
        k = uni(k).ljust(maxlen)
        if isinstance(v, (list, tuple)):
            self.println(' %s :' % (k,))
            for t in v:
                self.println(' %s %s' % (' ' * maxlen, uni(t),))
        else:
            self.println(" %s : %s " % (k, uni(v),))
def printvec(self, vec, columns):
    spacing = 80 // columns - 6
    for ix, x in enumerate(vec):
        self.write(' %3d %s' % (ix, uni(x).ljust(spacing),))
        if ix % columns == columns - 1:
            self.write('\n')
    if ix % columns != columns - 1:
        self.write('\n')
def link_to_here(self, name='default', column_filter=None, override_previous=_no_override):
    """Use this to expose a subjob as a dataset in your job:
    Dataset(subjid).link_to_here() will allow access to the subjob
    dataset under your jid.
    Specify column_filter as an iterable of columns to include
    if you don't want all of them.
    Use override_previous to rechain (or unchain) the dataset.
    """
    d = Dataset(self)
    if column_filter:
        column_filter = set(column_filter)
        filtered_columns = {k: v for k, v in d._data.columns.items() if k in column_filter}
        left_over = column_filter - set(filtered_columns)
        assert not left_over, "Columns in filter not available in dataset: %r" % (left_over,)
        assert filtered_columns, "Filter produced no desired columns."
        d._data.columns = filtered_columns
    from g import JOBID
    if override_previous is not _no_override:
        override_previous = _dsid(override_previous)
        if override_previous:
            # make sure it's valid
            Dataset(override_previous)
        d._data.previous = override_previous
        d._update_caches()
    d._data.parent = '%s/%s' % (d.jobid, d.name,)
    d.jobid = uni(JOBID)
    d.name = uni(name)
    d._save()
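# Hedged usage sketch, not part of the original source: expose a subjob's
# dataset under the current job, keeping two columns and rechaining it onto
# an earlier dataset. 'subjid' and 'prev_ds' are placeholders.
#
#   Dataset(subjid).link_to_here(
#       name="filtered",
#       column_filter=("ix", "value"),
#       override_previous=prev_ds,
#   )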
def __exit__(self, type, value, tb):
    # We don't care if an exception occurred, we still want to save
    # the report.
    # But if saving the report produces an exception we want to
    # ignore that and re-raise the original exception (or raise
    # our own exception if no original exception exists).
    try:
        if tb is None:
            self.line()
        with open('report.txt', 'w', encoding='utf-8') as F:
            F.write(uni(self.s))
        if self.stdout:
            print(self.s)
    except Exception:
        # This logic looks backwards, but it isn't
        if tb is None:
            raise
    finally:
        self._closed = True
def close(self):
    self.line()
    with open('report.txt', 'w', encoding='utf-8') as F:
        F.write(uni(self.s))
    if self.stdout:
        print(self.s)
def write(self, s):
    assert not self._closed, 'Closed.'
    self.s += uni(s)
def synthesis(params):
    check_good_file(params, "mixed line endings",
        b"ix,0,1\r\n1,a,a\n2,b,b\r\n3,c,c",
        {1: b"a", 2: b"b", 3: b"c"})
    check_good_file(params, "ignored quotes",
        b"ix,0,1\n1,'a,'a\n2,'b','b'\n3,\"c\",\"c\"\n4,d',d'\n",
        {1: b"'a", 2: b"'b'", 3: b'"c"', 4: b"d'"})
    check_good_file(params, "ignored quotes and extra fields",
        b"ix,0,1\n1,\"a,\"a\n2,'b,c',d\n3,d\",d\"\n",
        {1: b'"a', 3: b'd"'},
        allow_bad=True, d_bad={3: b"2,'b,c',d"})
    check_good_file(params, "spaces and quotes",
        b"ix,0,1\none,a,a\ntwo, b, b\n three,c,c\n4,\"d\"\"\",d\"\n5, 'e',\" 'e'\"\n",
        {b"one": b"a", b"two": b" b", b" three": b"c", 4: b'd"', 5: b" 'e'"},
        quotes=True)
    check_good_file(params, "empty fields",
        b"ix,0,1\n1,,''\n2,,\n3,'',\n4,\"\",",
        {1: b"", 2: b"", 3: b"", 4: b""},
        quotes=True)
    check_good_file(params, "renamed fields",
        b"0,1,2\n0,foo,foo",
        {0: b"foo"},
        rename={"0": "ix", "2": "0"})
    check_good_file(params, "discarded field",
        b"ix,0,no,1\n0,yes,no,yes\n1,a,'foo,bar',a",
        {0: b"yes", 1: b"a"},
        quotes=True, discard={"no"})
    check_good_file(params, "bad quotes",
        b"""ix,0,1\n1,a,a\n2,"b,"b\n\n3,'c'c','c'c'\n4,"d",'d'\n""",
        {1: b"a", 4: b"d"},
        quotes=True, allow_bad=True,
        d_bad={3: b'2,"b,"b', 4: b"", 5: b"3,'c'c','c'c'"})
    check_good_file(params, "comments",
        b"""# blah\nix,0,1\n1,a,a\n2,b,b\n#3,c,c\n4,#d,#d\n""",
        {1: b"a", 2: b"b", 4: b"#d"},
        comment="#",
        d_skipped={1: b"# blah", 5: b"#3,c,c"})
    check_good_file(params, "not comments",
        b"""ix,0,1\n1,a,a\n2,b,b\n#3,c,c\n4,#d,#d\n""",
        {1: b"a", 2: b"b", b"#3": b"c", 4: b"#d"})
    check_good_file(params, "a little of everything",
        b""";not,1,labels\na,2,1\n;a,3,;a\n";b",4,;b\n'c,5,c'\r\n d,6,' d'\ne,7,e,\n,8,""",
        {4: b";b", 6: b" d", 8: b""},
        allow_bad=True,
        rename={"a": "0", "2": "ix"},
        quotes=True, comment=";",
        d_bad={5: b"'c,5,c'", 7: b"e,7,e,"},
        d_skipped={1: b";not,1,labels", 3: b";a,3,;a"})
    check_good_file(params, "skipped lines",
        b"""just some text\n\nix,0,1\n1,a,a\n2,b,b""",
        {1: b"a", 2: b"b"},
        skip_lines=2,
        d_skipped={1: b"just some text", 2: b""})
    check_good_file(params, "skipped and bad lines",
        b"""not data here\nnor here\nix,0,1\n1,a,a\n2,b\n3,c,c""",
        {1: b"a", 3: b"c"},
        skip_lines=2, allow_bad=True,
        d_bad={5: b"2,b"},
        d_skipped={1: b"not data here", 2: b"nor here"})
    check_good_file(params, "override labels",
        b"""a,b,c\n0,foo,foo""",
        {0: b"foo"},
        labels=["ix", "0", "1"])
    check_good_file(params, "only labels", b"""ix,0,1""", {})
    check_good_file(params, "empty file", b"", {}, labels=["ix", "0", "1"])
    bad_lines = [
        b"bad,bad",
        b",",
        b"bad,",
        b",bad",
        b"',',",
        b"'lo there broken line",
        b"'nope\"",
        b"'bad quotes''",
        b'"bad quote " inside"',
        b'"more ""bad"" quotes """ inside"',
    ]
    good_lines = [
        b"\x00",
        (b"'good, good'", b"good, good"),
        (b'"also good, yeah!"', b"also good, yeah!"),
        (b"'single quote''s inside'", b"single quote's inside"),
        (b"'single quote at end: '''", b"single quote at end: '"),
        (b'"""double quotes around"""', b'"double quotes around"'),
        (b'"double quote at end: """', b'double quote at end: "'),
        (b'" I\'m special "', b" I'm special "),
        b"I'm not",
        b" unquoted but with spaces around ",
        (b"','", b","),
        b"\x00\xff",
        b"\xff\x00\x08\x00",
        (b"'lot''s of ''quotes'' around here: '''''''' '", b"lot's of 'quotes' around here: '''' "),
    ]
    check_array(params, good_lines, "strange values.txt", bad_lines, quotes=True)
    # The lines will be 2 * length + 3 bytes (plus lf)
    long_lines = [
        b"a" * length
        for length in (64 * 1024 - 2, 999, 999, 1999, 3000, 65000, 8 * 1024 * 1024 - 99)
    ]
    check_array(params, long_lines, "long lines.txt")
    check_bad_file(params, "extra field", b"foo,bar\nwith,extra,field\nok,here\n")
    check_bad_file(params, "missing field", b"foo,bar\nmissing\nok,here\n")
    check_bad_file(params, "no valid lines", b"foo\nc,\n")
    # let's also check some really idiotic combinations
    for combo in permutations([0, 10, 13, 255], 3):
        name = "idiotic.%d.%d.%d" % combo
        sep, newline, comment = (uni(chr(x)) for x in combo)
        data = [
            comment,
            sep.join(["ix", "0", "1"]),
            sep.join(["0", "a", "a"]),
            sep.join([comment + "1", "b", "b"]),
            sep.join(["2", "", ""]),
            comment + sep,
            sep.join(["", "", ""]),
            sep.join(["4", ",", ","]),
            comment,
        ]
        check_good_file(
            params, name,
            data=newline.join(data).encode("iso-8859-1"),
            d={0: b"a", 2: b"", b"": b"", 4: b","},
            d_skipped={k: data[k - 1].encode("iso-8859-1") for k in (1, 4, 6, 9)},
            separator=sep,
            newline=newline,
            comment=comment,
        )
    check_no_separator(params)
def do_one(params, name, data):
    dw = DatasetWriter(name=name, columns=columns)
    dw.set_slice(0)
    for v in data:
        if v is None:
            d = dict(
                ascii_new=None,
                ascii_old=None,
                bytes_new=None,
                bytes_old=None,
                unicode_new=None,
                unicode_old=None,
            )
        else:
            d = dict(
                ascii_new=v,
                ascii_old=v,
                bytes_new=uni(v).encode("ascii"),
                bytes_old=uni(v).encode("ascii"),
                unicode_new=uni(v),
                unicode_old=uni(v),
            )
        dw.write_dict(d)
    # We don't really want the other slices, but write one thing to
    # each, to make sure it doesn't show up in slice 0.
    # (Small slice merging will put it in the same file, so this is
    # a real risk.)
    for sliceno in range(1, params.slices):
        dw.set_slice(sliceno)
        dw.write_dict(d)
    dw.finish()

    # verify we got what we asked for
    me_ds = Dataset(params.jobid, name)
    for colname, coltype in columns.items():
        col = me_ds.columns[colname]
        assert col.type == coltype.split("_")[-1], colname
        assert col.backing_type == coltype, colname
        for want, got in zip(data, me_ds.iterate(0, colname)):
            if want is not None:
                if PY2 and "unicode" in coltype:
                    want = uni(want)
                if PY3 and "bytes" in coltype:
                    want = want.encode("ascii")
            assert want == got, "%s in %s did not contain the expected value. Wanted %r but got %r." % (colname, me_ds, want, got)

    # check that both types of bytes filter correctly through typing
    jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
        column2type=dict(
            ascii_new="bytes",
            ascii_old="number",  # fails on the string, so that gets filtered out everywhere
            bytes_new="bytes",
            bytes_old="bytes",
        ),
        filter_bad=True,
    ))
    ds = Dataset(jid)
    # verify the number first
    data_it = iter(raw_data)
    next(data_it)  # skip the filtered out string
    for got in ds.iterate(0, "ascii_old"):
        want = next(data_it)
        if want is None:
            # Becomes 0 because the typer (unfortunately) sees it as an empty string
            want = 0
        assert want == got, "ascii_old in %s did not type correctly as number. Wanted %r but got %r." % (ds, want, got)
    # now verify all the bytes ones are ok, no longer containing the string.
    for colname in ("ascii_new", "bytes_new", "bytes_old",):
        data_it = iter(data)
        next(data_it)  # skip the filtered out string
        for got in ds.iterate(0, colname):
            want = next(data_it)
            if want is not None:
                want = want.encode("ascii")
            assert want == got, "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)

    # and now check that the Nones are ok after making bytes from ascii and unicode from bytes.
    jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
        column2type=dict(
            ascii_new="bytes",
            ascii_old="bytes",
            bytes_new="unicode:ascii",
            bytes_old="unicode:ascii",
        ),
    ))
    ds = Dataset(jid)
    for colname in ("ascii_new", "ascii_old", "bytes_new", "bytes_old",):
        for want, got in ds.iterate(0, ["unicode_new", colname]):
            assert uni(want) == uni(got), "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)
def _ds_load(obj):
    n = unicode(obj)
    if n not in _ds_cache:
        _ds_cache[n] = _v2_columntypefix(blob.load(obj._name('pickle'), obj.jobid))
        _ds_cache.update(_ds_cache[n].get('cache', ()))
    return _ds_cache[n]

_type_v2to3backing = dict(
    ascii="_v2_ascii",
    bytes="_v2_bytes",
    unicode="_v2_unicode",
    json="_v2_json",
)
_type_v2compattov3t = {v: uni(k) for k, v in _type_v2to3backing.items()}

def _dc_v2to3(dc):
    return _DatasetColumn_3_0(
        type=dc.type,
        backing_type=_type_v2to3backing.get(dc.type, dc.type),
        name=dc.name,
        location=dc.location,
        min=dc.min,
        max=dc.max,
        offsets=dc.offsets,
    )

def _v2_columntypefix(ds):
def namefix(d, name):
    ok = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.'
    name = ''.join(c if c in ok else '_' for c in uni(name))
    while name in d:
        name += '_'
    return name
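# Illustrative sketch, not part of the original source: namefix() replaces
# characters outside [0-9A-Za-z.] with '_' and appends '_' until the name is
# unique among the keys of d.
#
#   namefix({}, 'my col/1')               # -> 'my_col_1'
#   namefix({'my_col_1': 0}, 'my col/1')  # -> 'my_col_1_'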
def check_one(params, newline, sep, data, want_res=None, prefix="", quotes=False, leave_bad=False):
    sep_c = uni(chr(sep))
    # Can't have separator character in unquoted values
    if not quotes and not leave_bad:
        data = [[el.replace(sep_c, "") for el in line] for line in data]
    if not want_res:
        want_res = [tuple(s.encode("ascii") for s in line) for line in data[1:]]
    filename = "%s_csv.%d.%s.txt" % (prefix, sep, "CRLF" if newline == "\r\n" else ord(newline))
    newline = uni(newline)
    with open(filename, "w", encoding="iso-8859-1") as fh:
        for line in data:
            if quotes:
                line = [quotes + el.replace(quotes, quotes + quotes) + quotes for el in line]
            fh.write(sep_c.join(line))
            fh.write(newline)
    try:
        jid = subjobs.build("csvimport", options=dict(
            filename=resolve_jobid_filename(params.jobid, filename),
            separator=sep_c,
            quotes=quotes,
            newline='' if "\n" in newline else newline,
        ))
    except JobError as e:
        raise CSVImportException("Failed to csvimport for separator %d with newline %r, csvimport error was:\n%s" % (sep, newline, e.format_msg()))
    ds = Dataset(jid)
    labels = sorted(ds.columns)
    if labels != data[0]:
        raise WrongLabelsException("csvimport gave wrong labels for separator %d with newline %r: %r (expected %r)" % (sep, newline, labels, data[0],))
    res = list(ds.iterate(None, data[0]))
    if res != want_res:
        raise WrongDataException("csvimport gave wrong data for separator %d with newline %r: %r (expected %r)" % (sep, newline, res, want_res,))