def _get_head_tail(self):
    if self._empty:
        return pd.DataFrame()
    self.update_size()
    maxrows = pd.get_option('display.max_rows')
    if self._size <= maxrows:
        newdf = aku.DataFrame()
        for col in self._columns:
            if isinstance(self[col], ak.Categorical):
                newdf[col] = self[col].categories[self[col].codes]
            else:
                newdf[col] = self[col]
        return newdf.to_pandas(retain_index=True)
    # Being 1 above the threshold causes the pandas formatter to split the
    # data frame vertically
    idx = ak.array(list(range(maxrows // 2 + 1)) +
                   list(range(self._size - (maxrows // 2), self._size)))
    newdf = aku.DataFrame()
    # Skip the 'index' column (column 0) here; it is added last from the
    # sliced index below
    for col in self._columns[1:]:
        if isinstance(self[col], ak.Categorical):
            newdf[col] = self[col].categories[self[col].codes[idx]]
        else:
            newdf[col] = self[col][idx]
    newdf['index'] = self['index'][idx]
    return newdf.to_pandas(retain_index=True)
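
# A minimal sketch of how the head/tail path is exercised (assumes a running
# arkouda server and that aku.DataFrame's printing/repr routes through
# _get_head_tail; column names are illustrative):
#
#     import arkouda as ak
#     ak.connect()
#     df = aku.DataFrame({'userID': ak.arange(10**6),
#                         'amount': ak.randint(0, 100, 10**6)})
#     print(df)  # only ~maxrows // 2 rows from each end move to the client
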
def register_all(data, prefix, overwrite=True):
    """Register all arkouda objects in `data` under `prefix`, recursing into
    dicts, DataFrames, lists, tuples, and GroupBys. If `overwrite` is True,
    any objects previously registered under `prefix` are unregistered first.
    """
    def sanitize(k):
        return str(k).replace(' ', '_')

    if overwrite:
        att = attach_all(prefix)
        for k in data:
            ksan = sanitize(k)
            if ksan in att:
                att[ksan].unregister()

    if isinstance(data, dict):
        return {k: register(v, f'{prefix}{sanitize(k)}')
                for k, v in data.items()}
    elif isinstance(data, aku.DataFrame):
        return aku.DataFrame({k: register(v, f'{prefix}{sanitize(k)}')
                              for k, v in data.items()})
    elif isinstance(data, list):
        return [register(v, f'{prefix}{i}') for i, v in enumerate(data)]
    elif isinstance(data, tuple):
        return tuple(register(v, f'{prefix}{i}') for i, v in enumerate(data))
    elif isinstance(data, ak.GroupBy):
        data.permutation = register(data.permutation, f'{prefix}permutation')
        data.segments = register(data.segments, f'{prefix}segments')
        data.unique_keys = register_all(data.unique_keys, f'{prefix}unique_keys_')
        return data
    else:
        raise TypeError(f"Cannot register objects of type {type(data)}")
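
# A sketch of the register/attach round trip (names are illustrative; this
# uses only register_all above and the attach_all it already relies on):
#
#     data = {'src': ak.arange(10), 'dst': ak.arange(10)}
#     data = register_all(data, prefix='myproj_')
#     # ... client exits or crashes; registered arrays survive on the server ...
#     recovered = attach_all('myproj_')  # {'src': pdarray, 'dst': pdarray}
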
def copy(self, deep=True):
    """
    Make a copy of this object's data.

    When `deep=True` (default), a new object is created with a copy of the
    calling object's data. Modifications to the data of the copy will not
    be reflected in the original object.

    When `deep=False`, a new object is created without copying the calling
    object's data. Any changes to the data of the original object will be
    reflected in the shallow copy, and vice versa.

    Parameters
    ----------
    deep : bool (default=True)
        When True, return a deep copy. Otherwise, return a shallow copy.

    Returns
    -------
    aku.DataFrame
        A deep or shallow copy according to caller specification.
    """
    if deep:
        res = DataFrame()
        res._size = self._size
        res._bytes = self._bytes
        res._empty = self._empty
        # Copy the column list itself so later column additions or renames
        # on one frame do not leak into the other
        res._columns = self._columns[:]
        for key, val in self.items():
            res[key] = val[:]
        return res
    else:
        return aku.DataFrame(self.data)
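
# Deep vs. shallow semantics, sketched (assumes a connected arkouda server;
# data values are illustrative):
#
#     df = aku.DataFrame({'a': ak.arange(5)})
#     deep = df.copy()               # deep=True: independent server-side data
#     shallow = df.copy(deep=False)  # shares the underlying arrays
#     df['a'][0] = 99
#     # deep['a'][0] is still 0; shallow['a'][0] is now 99
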
def cluster(self, min_cluster_size=5):
    cluster_data = {}
    last_level_delta = self.level_data[0].delta

    # Initial setup; all levels are the same size
    num_nodes = self.level_data[0].size

    # This dataframe holds extraction data
    selection_data = aku.DataFrame({
        'stability': ak.zeros(1, dtype=ak.float64),
        'parent': ak.zeros(1, dtype=ak.int64),
    })

    # Create an initial cluster dataframe
    labels = ak.arange(num_nodes)
    sizes = ak.ones(num_nodes, dtype=ak.int64)
    stability = ak.zeros(num_nodes, dtype=ak.float64)
    selected = ak.zeros(num_nodes, dtype=ak.bool)
    df = aku.DataFrame({
        'cc': self.level_data[0].cc,
        'labels': labels,
        'sizes': sizes,
        'stability': stability,
    })

    # The result should have all the same keys as the deltas
    cluster_data[self.level_data[0].delta] = df

    # We don't process level 0 in the loop; it gets passed through as-is.
    for level in tqdm(self.level_data[1:]):
        bylevel = ak.GroupBy(level.cc)
        perm = bylevel.permutation

        # Save for later analysis
        old_labels = labels[:]

        # Count the number of nodes in each group
        _, c = bylevel.count()
        # Find the most negative label value in each group
        _, max_group_labels = bylevel.aggregate(labels, 'min')
        # Find the maximum of existing cluster sizes from the last iteration
        _, max_group_size = bylevel.aggregate(sizes, 'max')
        # Find the maximum stability in each group
        _, max_group_stability = bylevel.aggregate(stability, 'max')

        # Find the number of sub-clusters in each group, for purposes of
        # creating new cluster labels
        clusters_and_zeros = ak.where(labels < 0, labels, 0)
        _, num_unique_labels = bylevel.aggregate(clusters_and_zeros, 'nunique')
        _, min_group_label = bylevel.aggregate(labels, 'max')
        num_sub_clusters = num_unique_labels - ak.where(min_group_label >= 0, 1, 0)

        # Update sizes
        count_bc = bylevel.broadcast(c, permute=False)
        sizes = ak.zeros(num_nodes, dtype=ak.int64)
        sizes[perm] = count_bc

        # Update labels to the most negative in each group
        labels_bc = bylevel.broadcast(max_group_labels, permute=False)
        labels = ak.zeros(num_nodes, dtype=ak.int64)
        labels[perm] = labels_bc

        # Update stability
        stability_bc = bylevel.broadcast(max_group_stability, permute=False)
        stability = ak.zeros(num_nodes, dtype=ak.float64)
        stability[perm] = stability_bc

        # Create and update labels as needed; baseline size is 1.
        # Only need to test if there are at least two cluster labels in a group.
        new_clusters_join = (num_sub_clusters > 1)
        new_clusters_form = ((c >= min_cluster_size) & (max_group_labels >= 0))
        condition = (new_clusters_join | new_clusters_form)
        num_new_labels = int(condition.sum())
        new_labels_positioned = ak.zeros(c.size, dtype=np.int64)
        if num_new_labels > 0:
            # Set up selection_data
            mn = abs(int(labels.min()))
            new_label_values = ak.arange(mn + 1, mn + num_new_labels + 1, 1) * (-1)
            new_labels_positioned = ak.zeros(c.size, dtype=np.int64)
            new_labels_positioned[condition] = new_label_values

            # Update selection_data
            update_df = aku.DataFrame({
                'parent': ak.zeros(num_new_labels, dtype=ak.int64),
                'stability': ak.zeros(num_new_labels, dtype=ak.float64),
            })
            selection_data.append(update_df)

            # Update the labels
            labels_bc = bylevel.broadcast(new_labels_positioned, permute=False)
            new_labels = ak.zeros(num_nodes, dtype=ak.int64)
            new_labels[perm] = labels_bc
            labels = ak.where(new_labels < 0, new_labels, labels)

            # When clusters become absorbed into new clusters, add their
            # parent labels and update stability
            mask = ((labels < 0) & (old_labels < 0) & (labels < old_labels))
            if mask.sum() > 0:
                t1 = old_labels[mask]
                t2 = labels[mask]
                t3 = stability[mask]
                bychangedlabels = ak.GroupBy([t1, t2])
                [old, new] = bychangedlabels.unique_keys
                # I don't remember the purpose of this line, but it's never used.
                # stabby = t3[aku.invert_permutation(bychangedlabels.permutation)][bychangedlabels.segments]
                selection_data['parent'][-1 * old] = -1 * new

            # Set new cluster stability to 0
            new_label_bc = bylevel.broadcast(new_labels_positioned, permute=False)
            tmp = ak.zeros(labels.size, dtype=np.int64)
            tmp[perm] = new_label_bc
            stability[tmp < 0] = 0

        # Update stability
        added_stability = sizes / (level.delta - last_level_delta)
        last_level_delta = level.delta
        stability = ak.where(sizes >= min_cluster_size,
                             stability + added_stability, stability)

        # Save this information after processing
        df = aku.DataFrame({
            'cc': level.cc,
            'labels': labels,
            'sizes': sizes,
            'stability': stability,
        })
        cluster_data[level.delta] = df

        # Update cluster selection information
        bylabel = ak.GroupBy(labels)
        keys = labels[bylabel.permutation][bylabel.segments]
        stab = stability[bylabel.permutation][bylabel.segments]
        indx = (keys[keys < 0]) * (-1)
        vals = stab[keys < 0]
        selection_data['stability'][indx] = vals

    # Set up data for next steps
    self.cluster_data = cluster_data
    self.selection_data = selection_data

    # Select and extract
    self.select_clusters()
    self.extract_clusters()
    print("Clustering is complete!")
    return self.extracted_clusters
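
# A usage sketch (hedged: the owning class and its construction are
# hypothetical; only cluster()'s inputs and outputs as shown above are
# assumed):
#
#     tree = ConnectedComponentsHierarchy(...)  # hypothetical owner of cluster()
#     extracted = tree.cluster(min_cluster_size=10)
#     # Afterwards, tree.cluster_data maps each level's delta to a per-node
#     # DataFrame with columns 'cc', 'labels', 'sizes', 'stability', and
#     # tree.selection_data holds the parent/stability info consumed by
#     # select_clusters() and extract_clusters().
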
def concat(arrays, axis=0, index_labels=None, value_labels=None):
    """Concatenate a list of arkouda Series or grouped arkouda arrays,
    either vertically or horizontally.

    If a list of grouped arkouda arrays is passed, they are converted to
    Series. Each grouping is a 2-tuple with the first item being the key(s)
    and the second being the values.

    If horizontal, each series or grouping must have the same length and
    the same index. The index of the series is converted to a column in the
    dataframe. If it is a multi-index, each level is converted to a column.

    Parameters
    ----------
    arrays : list
        The list of series/groupings to concat.
    axis : int
        Whether to do a vertical (axis=0) or horizontal (axis=1)
        concatenation.
    index_labels : list of str
        Column name(s) to label the index.
    value_labels : list of str
        Column names to label the values of each series.

    Returns
    -------
    An arkouda Series (axis=0) or an arkouda DataFrame (axis=1).
    """
    if len(arrays) == 0:
        raise IndexError("Array length must be non-zero")

    if isinstance(next(iter(arrays)), tuple):
        arrays = [Series(i) for i in arrays]

    if axis == 1:
        # Horizontal concat
        if value_labels is None:
            value_labels = ["val_{}".format(i) for i in range(len(arrays))]

        if Series._all_aligned(arrays):
            data = next(iter(arrays)).index.to_dict(index_labels)
            for col, label in zip(arrays, value_labels):
                data[str(label)] = col.values
        else:
            aitor = iter(arrays)
            idx = next(aitor).index
            idx = idx._merge_all([i.index for i in aitor])
            data = idx.to_dict(index_labels)
            for col, label in zip(arrays, value_labels):
                data[str(label)] = aku.lookup(col.index.index, col.values,
                                              idx.index, fillvalue=0)
        retval = aku.DataFrame(data)
    else:
        # Vertical concat
        idx = arrays[0].index
        v = arrays[0].values
        for other in arrays[1:]:
            idx = idx.concat(other.index)
            v = ak.concatenate([v, other.values], ordered=True)
        retval = aku.Series((idx, v))
    return retval
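
# Concatenation sketch (values are illustrative; the axis=1 case assumes the
# indices are aligned):
#
#     s1 = aku.Series((ak.arange(3), ak.array([1, 2, 3])))
#     s2 = aku.Series((ak.arange(3), ak.array([4, 5, 6])))
#     stacked = concat([s1, s2], axis=0)        # a 6-element Series
#     frame = concat([s1, s2], axis=1,
#                    value_labels=['x', 'y'])   # DataFrame with an index column
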
def read_checkpointed(files, filterfunc=None, prior_data=None, prefix=None,
                      chunksize=1000, checkpoint=0, clear=False, asframe=True,
                      converters={}, strictTypes=False, **kwargs):
    '''Read files in chunks in a recoverable fashion, optionally appending
    to existing data and/or performing aggressive memory conservation.

    If a prior data argument is supplied, any chunks read will be available
    to the user regardless of errors in later chunks. The function can be
    called again from a checkpoint to resume reading.

    Parameters
    ----------
    files : list
        List of filenames to read
    filterfunc : function
        Function that accepts a data dictionary and returns a boolean array
        indicating which rows of data to keep. By default, no filtering is
        performed.
    prior_data : dict of pdarray
        Initial data dictionary, to which new data will be appended
        in-place. Even if errors occur, chunks that are successfully read
        will exist in it.
    prefix : str
        Prefix with which to register data arrays in arkouda. Can be used
        with ak.attach_pdarray() to recover data.
    chunksize : int
        Number of files to read in each chunk
    checkpoint : int
        Index in the files list at which to restart. If an error occurs,
        the message will specify the checkpoint value to use.
    clear : bool
        If True (default: False), call ak.clear() after reading each chunk.
        This aggressively conserves memory by deleting all arrays that have
        not been registered. WARNING: before using this option, be sure to
        register all non-temporary arrays!
    asframe : bool
        If False, return a dictionary of arkouda arrays. By default, return
        a DataFrame.
    converters : dict-like
        A mapping of column name to a function that will be called on that
        column after it is read. If a column is not present, no error is
        raised.
    strictTypes : bool
        Passed through to the chunk reader.
    kwargs
        Passed to ak.read_all()

    Returns
    -------
    data : aku.DataFrame or dict of pdarray
        All data in files that passes the filter
    '''
    if prefix is None and clear:
        raise ValueError("Must supply a registration prefix (prefix=) with clear=True")
    if prior_data is None:
        data = {}
    else:
        data = prior_data
    if len(data) == 0:
        size = 0
    else:
        size = list(data.values())[0].size
    for i in range(checkpoint, len(files), chunksize):
        try:
            print(f'Reading files {i}:{min((len(files), i + chunksize))}')
            chunk = _read_chunk(files, i, i + chunksize, filterfunc=filterfunc,
                                strictTypes=strictTypes, **kwargs)
            s = list(chunk.values())[0].size
            print(f'{s:,} records read')
        except Exception as e:
            raise RuntimeError(f'Error encountered: restart with checkpoint={i}') from e
        if len(data) > 0:
            if set(chunk.keys()) != set(data.keys()):
                raise ValueError(f"Incompatible chunk: mismatched columns: "
                                 f"{chunk.keys()} vs. {data.keys()}")
            for k in chunk:
                # Append to the data dict in-place; the in-place update
                # ensures data survives any errors raised in later chunks
                data[k] = ak.concatenate((data[k], chunk[k]))
        else:
            for k in chunk:
                # Update the data dict in-place
                data[k] = chunk[k]
        if prefix is not None:
            data = aku.register_all(data, prefix=prefix)
        size += s
        if clear:
            # Clear to stay under the memory ceiling
            ak.clear()
        if i > checkpoint:
            print(f'{size:,} total records')
    for col, convert in converters.items():
        if col in data:
            data[col] = convert(data[col])
    if asframe:
        data = aku.DataFrame(data)
    return data
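
# Checkpointed-ingest sketch (file paths are hypothetical; only parameters
# documented above are used):
#
#     import glob
#     files = sorted(glob.glob('/data/part_*.hdf'))
#     data = read_checkpointed(files, prefix='ingest_', chunksize=100,
#                              asframe=False)
#     # If a chunk fails, the RuntimeError names the restart point, K:
#     # data = read_checkpointed(files, prior_data=data, prefix='ingest_',
#     #                          checkpoint=K, asframe=False)
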