def compare(self, other, *by, ignore_columns=(), max_changed=inf):
    """
    Find differences against another data frame.

    `by` are identifier columns which are used to uniquely identify rows
    and match them between `self` and `other`. `compare` will not work if
    your data lacks suitable identifiers.

    `ignore_columns` is an optional sequence of columns, differences in
    which to ignore.

    `compare` returns three data frames: added rows, removed rows and
    changed values. The first two are basically subsets of the rows of
    `self` and `other`, respectively. Changed values are returned as a
    data frame with one row per differing value (not per differing row).
    Listing changes will terminate once `max_changed` is reached.

    Raises ``ValueError`` if either data frame is not unique by `by`.

    .. warning:: `compare` is experimental, do not rely on it reporting
                 all of the differences correctly. Do not try to give it
                 two huge data frames with very little in common, unless
                 also giving some sensible value for `max_changed`.

    >>> old = di.read_csv("data/vehicles.csv")
    >>> new = old.modify(hwy=lambda x: np.minimum(100, x.hwy))
    >>> added, removed, changed = new.compare(old, "id")
    >>> changed
    """
    # Matching rows between the frames is only well-defined if the
    # identifier columns are actually unique in both.
    if self.unique(*by).nrow < self.nrow:
        raise ValueError(f"self not unique by {by}")
    if other.unique(*by).nrow < other.nrow:
        raise ValueError(f"other not unique by {by}")
    added = self.anti_join(other, *by)
    removed = other.anti_join(self, *by)
    # Tag each frame with its row indices so that the join below tells
    # us which row of x corresponds to which row of y.
    x = self.modify(_i_=range(self.nrow))
    y = other.modify(_j_=range(other.nrow))
    z = x.inner_join(y.select("_j_", *by), *by)
    colnames = util.unique_keys(self.colnames + other.colnames)
    # Note: use a distinct comprehension variable so as not to be
    # confused with the data frame x above.
    colnames = [c for c in colnames if c not in ignore_columns]
    changed = []
    for i, j in zip(z._i_, z._j_):
        if len(changed) >= max_changed:
            print(f"max_changed={max_changed} reached, terminating")
            break
        for colname in colnames:
            if len(changed) >= max_changed:
                break
            # XXX: How to make a distinction between
            # a missing column and a missing value?
            xvalue = x[colname][i] if colname in x else None
            yvalue = y[colname][j] if colname in y else None
            # Treat two missing values as equal regardless of NA type.
            if (xvalue != yvalue and
                not Vector([xvalue, yvalue]).is_na().all()):
                # XXX: We could have a name clash here.
                byrow = {k: x[k][i] for k in by}
                changed.append(dict(**byrow,
                                    column=colname,
                                    xvalue=xvalue,
                                    yvalue=yvalue))
    # Empty results are returned as None rather than empty frames.
    added = added if added.nrow > 0 else None
    removed = removed if removed.nrow > 0 else None
    changed = self.from_json(changed) if changed else None
    return added, removed, changed
def print_na_counts(self):
    """
    Print counts of missing values by key.

    Both keys entirely missing and keys with a value of ``None``
    are considered missing.

    >>> data = di.read_json("data/listings.json")
    >>> data.print_na_counts()
    """
    print("Missing counts:")
    total = len(self)
    for key in util.unique_keys(itertools.chain(*self)):
        # A key counts as missing when absent or explicitly None.
        count = sum(1 for item in self if item.get(key) is None)
        if not count:
            continue
        percent = 100 * count / total
        print(f"... {key}: {count} ({percent:.1f}%)")
def fill_missing_keys(self, **key_value_pairs): """ Return list with missing keys added. If `key_value_pairs` not given, fill all missing keys with ``None``. >>> data = di.read_json("data/listings.json") >>> data = data.fill_missing_keys(price=None) >>> data = data.fill_missing_keys() """ if not key_value_pairs: keys = util.unique_keys(itertools.chain(*self)) key_value_pairs = dict.fromkeys(keys, None) key_value_pairs = key_value_pairs.items() for item in self: for key, value in key_value_pairs: if key not in item: item[key] = value yield item
def from_json(cls, string, *, columns=(), dtypes=None, **kwargs):
    """
    Return a new data frame from JSON `string`.

    `string` may also be an already-parsed list of dicts, in which
    case JSON decoding is skipped.

    `columns` is an optional sequence of columns to limit to. `dtypes`
    is an optional dict mapping column names to NumPy datatypes.
    `kwargs` are passed to ``json.loads``.

    Raises ``TypeError`` if the decoded data is not a list.
    """
    data = string
    if isinstance(data, str):
        data = json.loads(data, **kwargs)
    if not isinstance(data, list):
        raise TypeError("Not a list")
    keys = util.unique_keys(itertools.chain(*data))
    if columns:
        keys = [k for k in keys if k in columns]
    # Pivot the list of row dicts into column lists, filling
    # missing keys with None.
    data = {k: [item.get(k, None) for item in data] for k in keys}
    for name, dtype in (dtypes or {}).items():
        data[name] = DataFrameColumn(data[name], dtype)
    return cls(**data)
def rbind(self, *others):
    """
    Return data frame with rows from `others` added.

    >>> data = di.read_csv("data/listings.csv")
    >>> data.rbind(data)
    """
    data_frames = [self] + list(others)
    colnames = util.unique_keys(itertools.chain(*data_frames))
    def get_part(data, colname):
        # Use the frame's own column if present; otherwise build an
        # NA-filled column, taking NA value and dtype from the first
        # frame that does have the column.
        if colname in data:
            return data[colname]
        for ref in data_frames:
            if colname in ref:
                value = ref[colname].na_value
                dtype = ref[colname].na_dtype
                return Vector.fast([value], dtype).repeat(data.nrow)
    for colname in colnames:
        parts = [get_part(frame, colname) for frame in data_frames]
        yield colname, DataFrameColumn(np.concatenate(parts))
def write_csv(self, path, *, encoding="utf-8", header=True, sep=","):
    """
    Write list to CSV file `path`.

    Will automatically compress if `path` ends in ``.bz2|.gz|.xz``.

    Raises ``ValueError`` if the list is empty.
    """
    if not self:
        raise ValueError("Cannot write empty CSV file")
    # Take a superset of all keys.
    keys = util.unique_keys(itertools.chain(*self))
    util.makedirs_for_file(path)
    with util.xopen(path, "wt", encoding=encoding) as f:
        writer = csv.DictWriter(f, keys, dialect="unix", delimiter=sep,
                                quoting=csv.QUOTE_MINIMAL)
        # Plain statement instead of the side-effect-only conditional
        # expression `writer.writeheader() if header else None`.
        if header:
            writer.writeheader()
        for item in self:
            # Fill in missing as None.
            item = {**dict.fromkeys(keys), **item}
            writer.writerow(item)
def test_unique_keys(self):
    # Order is preserved, later duplicates are dropped.
    cases = [
        ([1, 2, 3], [1, 2, 3]),
        ([1, 2, 3, 1], [1, 2, 3]),
    ]
    for given, expected in cases:
        assert util.unique_keys(given) == expected