Example #1
    def _get_head_tail(self):
        if self._empty:
            return pd.DataFrame()
        self.update_size()
        maxrows = pd.get_option('display.max_rows')
        if self._size <= maxrows:
            newdf = aku.DataFrame()
            for col in self._columns:
                if isinstance(self[col], ak.Categorical):
                    newdf[col] = self[col].categories[self[col].codes]
                else:
                    newdf[col] = self[col]
            return newdf.to_pandas(retain_index=True)
        # Being 1 above the threshold causes the pandas formatter to split the data frame vertically
        idx = ak.array(
            list(range(maxrows // 2 + 1)) +
            list(range(self._size - (maxrows // 2), self._size)))
        newdf = aku.DataFrame()
        for col in self._columns[1:]:
            if isinstance(self[col], ak.Categorical):
                newdf[col] = self[col].categories[self[col].codes[idx]]
            else:
                newdf[col] = self[col][idx]
        newdf['index'] = self['index'][idx]
        return newdf.to_pandas(retain_index=True)
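For illustration, with display.max_rows = 10 and 100 rows, the index gather built above selects the first six and last five row positions (a small standalone sketch with hypothetical numbers):

maxrows = 10
size = 100
idx = list(range(maxrows // 2 + 1)) + list(range(size - (maxrows // 2), size))
# [0, 1, 2, 3, 4, 5, 95, 96, 97, 98, 99] -- 11 positions, one above maxrows,
# which causes pandas to render a head/tail view with an ellipsis row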
Example #2
def register_all(data, prefix, overwrite=True):
    def sanitize(k):
        return str(k).replace(' ', '_')

    if overwrite:
        att = attach_all(prefix)
        for k in data:
            ksan = sanitize(k)
            if ksan in att:
                att[ksan].unregister()
    if isinstance(data, dict):
        return {
            k: register(v, f'{prefix}{sanitize(k)}')
            for k, v in data.items()
        }
    elif isinstance(data, aku.DataFrame):
        return aku.DataFrame({
            k: register(v, f'{prefix}{sanitize(k)}')
            for k, v in data.items()
        })
    elif isinstance(data, list):
        return [register(v, f'{prefix}{i}') for i, v in enumerate(data)]
    elif isinstance(data, tuple):
        return tuple([register(v, f'{prefix}{i}') for i, v in enumerate(data)])
    elif isinstance(data, ak.GroupBy):
        data.permutation = register(data.permutation, f'{prefix}permutation')
        data.segments = register(data.segments, f'{prefix}segments')
        data.unique_keys = register_all(data.unique_keys,
                                        f'{prefix}unique_keys_')
        return data
    else:
        raise TypeError(f"Cannot register objects of type {type(data)}")
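A minimal usage sketch for the helper above (assuming a connected arkouda server and that `register` and `attach_all` come from the same module; the prefix and column names are illustrative):

import arkouda as ak

ak.connect()  # assumes a running arkouda server
data = {'ids': ak.arange(10), 'vals': ak.randint(0, 100, 10)}

# Register every array under a common prefix so it survives ak.clear()
registered = register_all(data, prefix='myjob_')

# Later (or from another client) the arrays can be re-attached by name,
# e.g. ak.attach_pdarray('myjob_ids'), as long as the server still holds them.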
Example #3
    def copy(self, deep=True):
        """
        Make a copy of this object's data.

        When `deep = True` (default), a new object will be created with a copy of
        the calling object's data. Modifications to the data of the copy will not
        be reflected in the original object.


        When `deep = False` a new object will be created without copying the
        calling object's data. Any changes to the data of the original object will
        be reflected in the shallow copy, and vice versa.

        Parameters
        ----------
        deep : bool (default=True)
            When True, return a deep copy. Otherwise, return a shallow copy.

        Returns
        -------
        aku.DataFrame
            A deep or shallow copy according to caller specification.
        """

        if deep:
            res = DataFrame()
            res._size = self._size
            res._bytes = self._bytes
            res._empty = self._empty
            res._columns = self._columns

            for key, val in self.items():
                res[key] = val[:]

            return res
        else:
            return aku.DataFrame(self.data)
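A short sketch of the deep vs. shallow behavior described in the docstring (assuming a connected arkouda server; column names and values are illustrative):

import arkouda as ak

df = aku.DataFrame({'a': ak.arange(5), 'b': ak.ones(5)})

deep = df.copy()              # deep=True by default; column data is copied via slicing
deep['a'][0] = 99             # df['a'] is unchanged

shallow = df.copy(deep=False)
shallow['a'][0] = 99          # per the docstring, df['a'][0] now reflects the change too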
Example #4
    def cluster(self, min_cluster_size=5):
        cluster_data = {}
        last_level_delta = self.level_data[0].delta

        # Initial setup; all levels are the same size
        num_nodes = self.level_data[0].size

        # This dataframe holds extraction data
        selection_data = aku.DataFrame({
                'stability': ak.zeros(1, dtype=ak.float64),
                'parent': ak.zeros(1, dtype=ak.int64),
            })

        # Create an initial cluster dataframe
        labels = ak.arange(num_nodes)
        sizes = ak.ones(num_nodes, dtype=ak.int64)
        stability = ak.zeros(num_nodes, dtype=ak.float64)
        selected = ak.zeros(num_nodes, dtype=ak.bool)

        df = aku.DataFrame({
            'cc':self.level_data[0].cc,
            'labels':labels,
            'sizes':sizes,
            'stability':stability,
        })
        # The result should have all the same keys as the deltas
        cluster_data[self.level_data[0].delta] = df

        # We don't start with level 0; it gets passed through as-is.
        for level in tqdm(self.level_data[1:]):
            bylevel = ak.GroupBy(level.cc)
            perm = bylevel.permutation
            # Save for later analysis
            old_labels = labels[:]
            # Count number of nodes in each group
            _,c = bylevel.count()
            # Find the largest (negative) label value in each group
            _, max_group_labels = bylevel.aggregate(labels, 'min')
            # Find maximum of existing cluster sizes from last iteration.
            _, max_group_size = bylevel.aggregate(sizes, 'max')
            # Find the maximum stability in each group
            _, max_group_stability = bylevel.aggregate(stability, 'max')
            # Find the number of sub-clusters in each group for purposes of creating new cluster labels
            clusters_and_zeros = ak.where(labels < 0, labels, 0)
            _, num_unique_labels = bylevel.aggregate(clusters_and_zeros, 'nunique')
            _, min_group_label = bylevel.aggregate(labels, 'max')
            num_sub_clusters = num_unique_labels - ak.where(min_group_label >= 0, 1, 0)

            # Update sizes
            count_bc = bylevel.broadcast(c, permute=False)
            sizes = ak.zeros(num_nodes, dtype=ak.int64)
            sizes[perm] = count_bc

            # Update labels to max (negative) in group
            labels_bc = bylevel.broadcast(max_group_labels, permute=False)
            labels = ak.zeros(num_nodes, dtype=ak.int64)
            labels[perm] = labels_bc

            # Update stability
            stability_bc = bylevel.broadcast(max_group_stability, permute=False)
            stability = ak.zeros(num_nodes, dtype=ak.float64)
            stability[perm] = stability_bc

            # Create and update labels as needed, baseline size is 1
            # Only need to test if there are at least two cluster labels in a group.
            new_clusters_join = (num_sub_clusters > 1)
            new_clusters_form = ((c >= min_cluster_size) & (max_group_labels >= 0))
            condition = (new_clusters_join | new_clusters_form)
            num_new_labels = int(condition.sum())

            new_labels_positioned = ak.zeros(c.size, dtype=np.int64)
            if num_new_labels > 0:
                # Set up selection_data 
                mn = abs(int(labels.min()))
                new_label_values = ak.arange(mn+1, mn+num_new_labels+1, 1) * (-1)
                new_labels_positioned = ak.zeros(c.size, dtype=np.int64)
                new_labels_positioned[condition] = new_label_values

                # Update selection_data
                update_df = aku.DataFrame({
                    'parent': ak.zeros(num_new_labels, dtype=ak.int64),
                    'stability': ak.zeros(num_new_labels, dtype=ak.float64),
                })
                selection_data.append(update_df)

                # Update the labels
                labels_bc = bylevel.broadcast(new_labels_positioned, permute=False)
                new_labels = ak.zeros(num_nodes, dtype=ak.int64)
                new_labels[perm] = labels_bc
                tmp = ak.where(new_labels < 0, new_labels, labels)
                labels = tmp

                # When clusters become absorbed into new clusters, add their parent labels and update stability
                mask = ((labels < 0) & (old_labels < 0) & (labels < old_labels))
                if mask.sum() > 0:
                    t1 = old_labels[mask]
                    t2 = labels[mask]
                    t3 = stability[mask]
                    bychangedlabels = ak.GroupBy([t1, t2])
                    [old,new] = bychangedlabels.unique_keys
                    # I don't remember the purpose of this line, but it's never used.
                    #stabby = t3[aku.invert_permutation(bychangedlabels.permutation)][bychangedlabels.segments]
                    selection_data['parent'][-1 * old] = -1 * new

            # Set new cluster stability to 0
            new_label_bc = bylevel.broadcast(new_labels_positioned, permute=False)
            tmp = ak.zeros(labels.size, dtype=np.int64)
            tmp[perm] = new_label_bc
            stability[tmp < 0] = 0

            # Update stability
            added_stability = sizes / (level.delta - last_level_delta)
            last_level_delta = level.delta
            tmp = ak.where(sizes >= min_cluster_size, stability + added_stability, stability)
            stability = tmp

            # Save this information after processing
            df = aku.DataFrame({
                'cc':level.cc,
                'labels':labels,
                'sizes':sizes,
                'stability':stability,
            })
            cluster_data[level.delta] = df

            # Update cluster selection information
            bylabel = ak.GroupBy(labels)
            keys = labels[bylabel.permutation][bylabel.segments]
            stab = stability[bylabel.permutation][bylabel.segments]
            indx = (keys[keys < 0])*(-1)
            vals = stab[keys < 0]
            selection_data['stability'][indx] = vals

        # Set up data for next steps
        self.cluster_data = cluster_data
        self.selection_data = selection_data

        # Select and extract
        self.select_clusters()
        self.extract_clusters()

        print("Clustering is complete!")

        return self.extracted_clusters
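The loop above repeatedly uses one arkouda idiom: aggregate per group, broadcast the per-group result in grouped order (permute=False), then scatter it back to element order through the GroupBy permutation. A minimal standalone sketch of that pattern, with made-up keys and values:

import arkouda as ak

keys = ak.array([0, 1, 0, 2, 1, 0])
vals = ak.array([5, 3, 7, 1, 9, 2])

g = ak.GroupBy(keys)
_, group_max = g.aggregate(vals, 'max')          # one value per group

grouped = g.broadcast(group_max, permute=False)  # values in grouped (permuted) order
per_element = ak.zeros(keys.size, dtype=ak.int64)
per_element[g.permutation] = grouped             # scatter back to original order
# per_element[i] is now the max of vals within the group of keys[i]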
Example #5
    def concat(arrays, axis=0, index_labels=None, value_labels=None):
        """Concatenate a list of arkouda Series or grouped arkouda arrays vertically or horizontally.

        If a list of grouped arkouda arrays is passed, they are converted to Series. Each grouping is a 2-tuple
        with the first item being the key(s) and the second being the values.

        If horizontal, each Series or grouping must have the same length and the same index. The index of the Series is
        converted to a column in the dataframe. If it is a multi-index, each level is converted to a column.

        Parameters
        ----------
        arrays : The list of Series/groupings to concat.
        axis : Whether to do a vertical (axis=0) or horizontal (axis=1) concatenation.
        index_labels : Column name(s) to label the index.
        value_labels : Column names to label the values of each Series.

        Returns
        -------
        axis=0: an arkouda Series.
        axis=1: an arkouda DataFrame.
        """

        if len(arrays) == 0:
            raise IndexError("Array length must be non-zero")

        if type(next(iter(arrays))) == tuple:
            arrays = [ Series(i) for i in arrays]

        if axis == 1:
            # Horizontal concat
            if value_labels is None:
                value_labels = ["val_{}".format(i) for i in range(len(arrays))]

            if Series._all_aligned(arrays):

                data = next(iter(arrays)).index.to_dict(index_labels)

                for col, label in zip(arrays, value_labels):
                    data[str(label)] = col.values

            else:
                aitor = iter(arrays)
                idx = next(aitor).index
                idx = idx._merge_all([i.index for i in aitor])

                data = idx.to_dict(index_labels)

                for col, label in zip(arrays, value_labels):
                    data[str(label)] = aku.lookup(col.index.index, col.values, idx.index, fillvalue=0)

            retval = aku.DataFrame(data)
        else:
            # Vertical concat
            idx = arrays[0].index
            v = arrays[0].values
            for other in arrays[1:]:
                idx = idx.concat(other.index)
                v = ak.concatenate([v, other.values], ordered=True)
            retval = aku.Series((idx, v))

        return retval
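A hedged usage sketch of both axes (assuming a connected arkouda server and that `concat` is exposed as a static method on the Series class shown here; values are illustrative):

import arkouda as ak

s1 = aku.Series((ak.arange(3), ak.array([10, 20, 30])))
s2 = aku.Series((ak.arange(3, 6), ak.array([40, 50, 60])))

stacked = Series.concat([s1, s2], axis=0)    # one longer Series
wide = Series.concat([s1, s1], axis=1,       # DataFrame with an index column
                     value_labels=['a', 'b'])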
Example #6
def read_checkpointed(files,
                      filterfunc=None,
                      prior_data=None,
                      prefix=None,
                      chunksize=1000,
                      checkpoint=0,
                      clear=False,
                      asframe=True,
                      converters={},
                      strictTypes=False,
                      **kwargs):
    '''Read files in chunks in a recoverable fashion, optionally appending to existing
    data and/or performing aggressive memory conservation. If a prior_data argument is
    supplied, any chunks read successfully will be available to the user regardless of
    errors in later chunks. The function can be called again from a checkpoint to resume reading.

    Parameters
    ----------
    files : list
        List of filenames to read
    filterfunc : function
        Function that accepts a data dictionary and returns a boolean array indicating
        which rows of data to keep. By default, no filtering is performed.
    prior_data : dict of pdarray
        Initial data dictionary, to which new data will be appended in place. Even if
        errors occur, chunks that are successfully read will remain in this dictionary.
    prefix : str
        Prefix with which to register data arrays in arkouda. Can be used with
        ak.attach_pdarray() to recover data.
    chunksize : int
        Number of files to read in each chunk
    checkpoint : int
        Index in files list for restarting. If an error occurs, the message will
        specify the checkpoint value to use.
    clear : bool
        If True (default: False), call ak.clear() after reading each chunk. This will
        aggressively conserve memory by deleting all arrays that have not been
        registered. WARNING: before using this option, be sure to register all
        non-temporary arrays!
    asframe : bool
        If False, return a dictionary of arkouda arrays. By default, return 
        a DataFrame.
    converters : dict-like
        A mapping of column name to a function that will be called on that column
        after it is read. If a column is not present, no error is raised.
    strictTypes : bool
        Passed through to the underlying chunked read along with kwargs.
    kwargs
        Passed to ak.read_all()

    Returns
    -------
    data : aku.DataFrame or dict of pdarray
        All data in files that passes the filter; an aku.DataFrame by default, or a
        dictionary of pdarrays when asframe=False.
    '''
    if prefix is None and clear:
        raise ValueError(
            "Must supply a registration prefix (prefix=) with clear=True")
    if prior_data is None:
        data = {}
    else:
        data = prior_data
    if len(data) == 0:
        size = 0
    else:
        size = list(data.values())[0].size
    for i in range(checkpoint, len(files), chunksize):
        try:
            print(f'Reading files {i}:{min(len(files), i + chunksize)}')
            chunk = _read_chunk(files,
                                i,
                                i + chunksize,
                                filterfunc=filterfunc,
                                strictTypes=strictTypes,
                                **kwargs)
            s = list(chunk.values())[0].size
            print(f'{s:,} records read')
        except Exception as e:
            raise RuntimeError(
                f'Error encountered: restart with checkpoint={i}') from e
        if len(data) > 0:
            if (set(chunk.keys()) != set(data.keys())):
                raise ValueError(
                    f"Incompatible chunk: mismatched columns: {chunk.keys()} vs. {data.keys()}"
                )
            for k in chunk:
                # Append to the data dict in-place
                # In-place update ensures data survives any errors raised
                data[k] = ak.concatenate((data[k], chunk[k]))
        else:
            for k in chunk:
                # Update the data dict in-place
                data[k] = chunk[k]
        if prefix is not None:
            data = aku.register_all(data, prefix=prefix)
        size += s
        if clear:
            # Clear to stay under memory ceiling
            ak.clear()
        if i > checkpoint:
            print(f'{size:,} total records')
    for col, convert in converters.items():
        if col in data:
            data[col] = convert(data[col])
    if asframe:
        data = aku.DataFrame(data)
    return data
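A hedged usage sketch of the checkpoint/recovery flow described in the docstring (file paths, prefix, and the checkpoint number are illustrative):

import glob
import arkouda as ak

files = sorted(glob.glob('/data/day*.hdf'))   # hypothetical input files
prior = {}

try:
    df = read_checkpointed(files, prior_data=prior, prefix='ingest_',
                           chunksize=500, clear=True)
except RuntimeError:
    # Chunks already read remain in `prior`; the error message names the checkpoint,
    # e.g. "Error encountered: restart with checkpoint=1500". After fixing the
    # problem, resume with:
    # df = read_checkpointed(files, prior_data=prior, prefix='ingest_',
    #                        chunksize=500, clear=True, checkpoint=1500)
    raise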