Code example #1
File: where_test.py  Project: shamisp/arkouda
import warnings
from itertools import product

import numpy as np
import arkouda as ak

SIZE = 10**4  # assumed value; SIZE is set at module level in the original test

# Requires a running arkouda server and ak.connect() before calling
def run_tests():
    npA = {'int64': np.random.randint(0, 10, SIZE),
           'float64': np.random.randn(SIZE),
           'bool': np.random.randint(0, 2, SIZE, dtype='bool')}
    akA = {k: ak.array(v) for k, v in npA.items()}
    npB = {'int64': np.random.randint(10, 20, SIZE),
           'float64': np.random.randn(SIZE)+10,
           'bool': np.random.randint(0, 2, SIZE, dtype='bool')}
    akB = {k: ak.array(v) for k, v in npB.items()}
    npCond = np.random.randint(0, 2, SIZE, dtype='bool')
    akCond = ak.array(npCond)
    scA = {'int64': 42, 'float64': 2.71828, 'bool': True}
    scB = {'int64': -1, 'float64': 3.14159, 'bool': False}
    dtypes = set(npA.keys())
    failures = 0
    tests = 0
    for dtype in dtypes:
        for (ak1, ak2), (np1, np2) in zip(product((akA, scA), (akB, scB)),
                                          product((npA, scA), (npB, scB))):
            tests += 1
            akres = ak.where(akCond, ak1[dtype], ak2[dtype]).to_ndarray()
            npres = np.where(npCond, np1[dtype], np2[dtype])
            if not np.allclose(akres, npres, equal_nan=True):
                warnings.warn("{} !=\n{}".format(akres, npres))
                failures += 1
    print("{} failures in {} tests".format(failures, tests))
Code example #2
File: hdbscan.py  Project: zhihuidu/arkouda
    def extract_clusters(self):
        # List all the time window keys
        deltas = list(self.cluster_data.keys())
        # Reverse them so we can start with the last clustering data
        deltas.reverse()
        # Drop delta 0 (the last entry after the reversal); it is an artifact
        # of the clustering that isn't used
        deltas = deltas[:-1]

        print("Extracting clusters from each time delta: ".format(deltas))

        # This is a list of cluster labels which we will update at each time
        # delta, where a value of 0 indicates an unclustered node
        final_cluster_info = ak.zeros_like(
            self.cluster_data[deltas[0]]['index'])
        # A list of cluster labels that are selected
        selected_clusters = self.selection_data['index'][
            self.selection_data['selected']]
        selected_clusters = selected_clusters[selected_clusters > 0]

        for delta in tqdm(deltas):
            cluster = self.cluster_data[delta]['labels']
            cluster_positive = ak.where(cluster < 0, -cluster, 0)

            # The cluster labels found in this delta
            labels_this_delta = cluster_positive[cluster_positive > 0]

            # A boolean array indicating which "selected" clusters appear as labels in this delta
            m = ak.in1d(selected_clusters, labels_this_delta)

            # A list of clusters selected for this delta
            extract_this_delta = selected_clusters[m]

            # A boolean array indicating which nodes are in clusters that are extracted this delta
            m2 = ak.in1d(cluster_positive, extract_this_delta)

            # Record the cluster label for every node in a cluster extracted this delta
            final_cluster_info[m2] = cluster_positive[m2]
            # Tally the extracted labels (values/counts are not used further here)
            v, c = ak.value_counts(cluster_positive[m2])
            # Drop the clusters extracted this delta from the remaining set
            selected_clusters = selected_clusters[~m]

        self.extracted_clusters = final_cluster_info

        if selected_clusters.size > 0:
            print("Failed. {} of the selected clusters remain.".format(
                selected_clusters.size))
            print("Failing cluster labels: {}".format(selected_clusters))

            # Keep the unextracted cluster labels around for later inspection
            self.unextracted = selected_clusters
        else:
            print("Extraction completed succesfully.")
Code example #3
    def filter_by_range(self, keys, low=1, high=None):
        """
        Find all rows where the value count of the items in a given set of
        columns (keys) is within the range [low, high].

        To filter by a specific value, set low == high.

        Parameters
        ----------
        keys : list or str
            The names of the columns to group by
        low : int (default=1)
            The lowest value count.
        high : int (default=None)
            The highest value count; defaults to unlimited.

        Returns
        -------
        pdarray
            An array of boolean values for qualified rows in this DataFrame.

        See Also
        --------
        filter_by_count
        """

        if isinstance(keys, str):
            keys = [keys]
        gb = self.GroupBy(keys, use_series=False)
        vals, cts = gb.count()
        if high is None:
            positions = ak.where(cts >= low, 1, 0)
        else:
            positions = ak.where(((cts >= low) & (cts <= high)), 1, 0)

        broadcast = gb.broadcast(positions, permute=False)
        broadcast = (broadcast == 1)
        return broadcast[aku.invert_permutation(gb.permutation)]
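A hypothetical usage sketch; the column name and data are made up, and the DataFrame class defining this method is assumed to be importable as `akutil` (aliased `aku` elsewhere on this page):

import arkouda as ak
import akutil as aku

ak.connect()  # assumes a local arkouda server

# Made-up data: value 1 occurs three times, 2 twice, 3 once
df = aku.DataFrame({'user': ak.array([1, 1, 1, 2, 2, 3])})

# Boolean mask of rows whose 'user' value occurs at most twice
mask = df.filter_by_range('user', low=1, high=2)
# mask is True only on the rows for users 2 and 3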
Code example #4
    def _convert_strings(self, s):
        '''
        Convert string field names to binary vectors.
        '''
        # Initialize to zero
        values = ak.zeros(s.size, dtype=ak.int64)
        if self.separator == '':
            # When separator is empty, field names are guaranteed to be single characters
            for name, shift in zip(self.names, self.shifts):
                # Check if name exists in each string
                bit = s.contains(name)
                values = values | ak.where(bit, 1 << shift, 0)
        else:
            # When separator is non-empty, split on it
            sf, segs = s.flatten(self.separator, return_segments=True)
            # Create a grouping to map split fields back to originating string
            orig = ak.broadcast(segs, ak.arange(segs.size), sf.size)
            g = ak.GroupBy(orig)
            for name, shift in zip(self.names, self.shifts):
                # Check if name matches one of the split fields from originating string
                bit = g.any(sf == name)[1]
                values = values | ak.where(bit, 1 << shift, 0)
        return values
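Each `values | ak.where(bit, 1 << shift, 0)` step packs one boolean flag per field name into an integer bit vector. The same idea in plain Python, with made-up field names and shifts:

# Made-up fields: each (name, shift) pair owns one bit of the result
names = ['A', 'B', 'C']
shifts = [0, 1, 2]

s = 'AC'   # one input string whose fields we want to encode
value = 0
for name, shift in zip(names, shifts):
    if name in s:           # stand-in for s.contains(name) / g.any(...)
        value |= 1 << shift

print(bin(value))  # 0b101: bits 0 and 2 set, for fields 'A' and 'C'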
Code example #5
File: hdbscan.py  Project: Bears-R-Us/arkouda
    def cluster(self, min_cluster_size=5):
        cluster_data = {}
        last_level_delta = self.level_data[0].delta

        # Initial setup; all levels are the same size
        num_nodes = self.level_data[0].size

        # This dataframe holds extraction data
        selection_data = aku.DataFrame({
                'stability': ak.zeros(1, dtype=ak.float64),
                'parent': ak.zeros(1, dtype=ak.int64),
            })

        # Create an initial cluster dataframe
        labels = ak.arange(num_nodes)
        sizes = ak.ones(num_nodes, dtype=ak.int64)
        stability = ak.zeros(num_nodes, dtype=ak.float64)
        selected = ak.zeros(num_nodes, dtype=ak.bool)

        df = aku.DataFrame({
            'cc':self.level_data[0].cc,
            'labels':labels,
            'sizes':sizes,
            'stability':stability,
        })
        # The result should have all the same keys as the deltas
        cluster_data[self.level_data[0].delta] = df

        # We don't start with the level 0, it gets passed through as is.
        for level in tqdm(self.level_data[1:]):
            bylevel = ak.GroupBy(level.cc)
            perm = bylevel.permutation
            # Save for later analysis
            old_labels = labels[:]
            # Count number of nodes in each group
            _,c = bylevel.count()
            # Find the largest-magnitude (most negative) label in each group;
            # cluster labels are negative, so 'min' selects it
            _, max_group_labels = bylevel.aggregate(labels, 'min')
            # Find maximum of existing cluster sizes from last iteration.
            _, max_group_size = bylevel.aggregate(sizes, 'max')
            # Find the maximum stability in each group
            _, max_group_stability = bylevel.aggregate(stability, 'max')
            # Find the number of sub-clusters in each group for purposes of creating new cluster labels
            clusters_and_zeros = ak.where(labels < 0, labels, 0)
            _, num_unique_labels = bylevel.aggregate(clusters_and_zeros, 'nunique')
            # 'max' of the labels is nonnegative iff the group contains an unclustered node
            _, min_group_label = bylevel.aggregate(labels, 'max')
            # Discount the 0 that unclustered nodes contribute to the nunique count above
            num_sub_clusters = num_unique_labels - ak.where(min_group_label >= 0, 1, 0)

            # Update sizes
            count_bc = bylevel.broadcast(c, permute=False)
            sizes = ak.zeros(num_nodes, dtype=ak.int64)
            sizes[perm] = count_bc

            # Update labels to max (negative) in group
            labels_bc = bylevel.broadcast(max_group_labels, permute=False)
            labels = ak.zeros(num_nodes, dtype=ak.int64)
            labels[perm] = labels_bc

            # Update stability
            stability_bc = bylevel.broadcast(max_group_stability, permute=False)
            stability = ak.zeros(num_nodes, dtype=ak.float64)
            stability[perm] = stability_bc

            # Create and update labels as needed, baseline size is 1
            # Only need to test if there are at least two cluster labels in a group.
            new_clusters_join = (num_sub_clusters > 1)
            new_clusters_form = ((c >= min_cluster_size) & (max_group_labels >= 0))
            condition = (new_clusters_join | new_clusters_form)
            num_new_labels = int(condition.sum())

            new_labels_positioned = ak.zeros(c.size, dtype=np.int64)
            if num_new_labels > 0:
                # Set up selection_data: new (negative) labels continue past the current minimum
                mn = abs(int(labels.min()))
                new_label_values = ak.arange(mn+1, mn+num_new_labels+1, 1) * (-1)
                new_labels_positioned[condition] = new_label_values

                # Update selection_data
                update_df = aku.DataFrame({
                    'parent': ak.zeros(num_new_labels, dtype=ak.int64),
                    'stability': ak.zeros(num_new_labels, dtype=ak.float64),
                })
                selection_data.append(update_df)

                # Update the labels
                labels_bc = bylevel.broadcast(new_labels_positioned, permute=False)
                new_labels = ak.zeros(num_nodes, dtype=ak.int64)
                new_labels[perm] = labels_bc
                tmp = ak.where(new_labels < 0, new_labels, labels)
                labels = tmp

                # When clusters become absorbed into new clusters, add their parent labels and update stability
                mask = ((labels < 0) & (old_labels < 0) & (labels < old_labels))
                if mask.sum() > 0:
                    t1 = old_labels[mask]
                    t2 = labels[mask]
                    t3 = stability[mask]
                    bychangedlabels = ak.GroupBy([t1, t2])
                    [old,new] = bychangedlabels.unique_keys
                    # I don't remember the purpose of this line, but it's never used.
                    #stabby = t3[aku.invert_permutation(bychangedlabels.permutation)][bychangedlabels.segments]
                    selection_data['parent'][-1 * old] = -1 * new

            # Set new cluster stability to 0
            new_label_bc = bylevel.broadcast(new_labels_positioned, permute=False)
            tmp = ak.zeros(labels.size, dtype=np.int64)
            tmp[perm] = new_label_bc
            stability[tmp < 0] = 0

            # Update stability
            added_stability = sizes / (level.delta - last_level_delta)
            last_level_delta = level.delta
            tmp = ak.where(sizes >= min_cluster_size, stability + added_stability, stability)
            stability = tmp

            # Save this information after processing
            df = aku.DataFrame({
                'cc':level.cc,
                'labels':labels,
                'sizes':sizes,
                'stability':stability,
            })
            cluster_data[level.delta] = df

            # Update cluster selection information
            bylabel = ak.GroupBy(labels)
            keys = labels[bylabel.permutation][bylabel.segments]
            stab = stability[bylabel.permutation][bylabel.segments]
            indx = (keys[keys < 0])*(-1)
            vals = stab[keys < 0]
            selection_data['stability'][indx] = vals

        # Set up data for next steps
        self.cluster_data = cluster_data
        self.selection_data = selection_data

        # Select and extract
        self.select_clusters()
        self.extract_clusters()

        print("Clustering is complete!")

        return self.extracted_clusters
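The stability bookkeeping inside the loop adds size / (delta - last_delta) to each node's stability, but only where the cluster meets min_cluster_size. A toy illustration of that arithmetic with made-up numbers:

import arkouda as ak

ak.connect()  # assumes a local arkouda server

min_cluster_size = 5
sizes = ak.array([3, 8, 12])
stability = ak.array([0.0, 1.5, 2.0])

delta, last_delta = 1.0, 0.5
added = sizes / (delta - last_delta)  # [6.0, 16.0, 24.0]

# Only clusters meeting the size threshold accumulate stability
stability = ak.where(sizes >= min_cluster_size, stability + added, stability)
# stability is now [0.0, 17.5, 26.0]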