コード例 #1
0
ファイル: PandasExecutor.py プロジェクト: westernguy2/lux
    def execute_sampling(ldf: LuxDataFrame):
        """
        Compute and cache a sample for the overall dataframe.

        - When sampling is enabled and # of rows exceeds lux.config.sampling_cap,
          a random sample of lux.config.sampling_cap rows is cached.
        - Otherwise, when sampling is enabled and # of rows exceeds
          lux.config.sampling_start, a random 75% sample is cached.
        - Otherwise the dataframe itself is used as its own "sample".

        The result is stored on ``ldf._sampled``. In the two sampling cases an
        already-cached (non-None) ``ldf._sampled`` is reused instead of being
        recomputed, and a notice is added to ``ldf._message``.

        Per the original notes, lux.config.sampling_start is 100k rows and
        lux.config.sampling_cap is 1M rows (confirm against lux.config).

        Parameters
        ----------
        ldf : LuxDataFrame
            Dataframe to sample; its ``_sampled`` attribute is updated in place.
        """
        SAMPLE_FLAG = lux.config.sampling
        SAMPLE_START = lux.config.sampling_start
        SAMPLE_CAP = lux.config.sampling_cap
        SAMPLE_FRAC = 0.75

        num_rows = len(ldf)

        if SAMPLE_FLAG and num_rows > SAMPLE_CAP:
            if ldf._sampled is None:  # memoize unfiltered sample df
                ldf._sampled = ldf.sample(n=SAMPLE_CAP, random_state=1)
            ldf._message.add_unique(
                f"Large dataframe detected: Lux is only visualizing a sample capped at {SAMPLE_CAP} rows.",
                priority=99,
            )
        elif SAMPLE_FLAG and num_rows > SAMPLE_START:
            if ldf._sampled is None:  # memoize unfiltered sample df
                ldf._sampled = ldf.sample(frac=SAMPLE_FRAC, random_state=1)
            # SAMPLE_FRAC is a fraction (0.75); the percent format spec renders it
            # as "75%" rather than the misleading "0.75%".
            ldf._message.add_unique(
                f"Large dataframe detected: Lux is visualizing a sample of {SAMPLE_FRAC:.0%} of the dataframe ({len(ldf._sampled)} rows).",
                priority=99,
            )
        else:
            ldf._sampled = ldf
コード例 #2
0
ファイル: PandasExecutor.py プロジェクト: ccubc/lux
 def execute_sampling(ldf: LuxDataFrame):
     """
     Cache a general-purpose sample of the whole dataframe on ``ldf._sampled``.

     - More than 30000 rows: a random sample of 30000 rows.
     - More than 10000 rows: a random sample of 75% of the rows.
     - Otherwise: the dataframe itself.

     In the two sampling cases an existing (non-None) ``ldf._sampled`` is
     reused, and a notice is added to ``ldf._message``.

     Parameters
     ----------
     ldf : LuxDataFrame
     """
     start_rows = 10000
     cap_rows = 30000
     fraction = 0.75

     total_rows = len(ldf)

     # Small frames need no sampling: the dataframe is its own sample.
     if total_rows <= start_rows:
         ldf._sampled = ldf
         return

     over_cap = total_rows > cap_rows

     if ldf._sampled is None:  # memoize unfiltered sample df
         if over_cap:
             ldf._sampled = ldf.sample(n=cap_rows, random_state=1)
         else:
             ldf._sampled = ldf.sample(frac=fraction, random_state=1)

     if over_cap:
         notice = f"Large dataframe detected: Lux is only visualizing a random sample capped at {cap_rows} rows."
     else:
         notice = f"Large dataframe detected: Lux is only visualizing a random sample of {len(ldf._sampled)} rows."
     ldf._message.add_unique(notice, priority=99)
コード例 #3
0
    def execute(vislist: VisList, ldf: LuxDataFrame):
        """
        Populate the data of every vis in ``vislist`` from ``ldf``.

        For each vis:
        1) Apply filters
        2) Down-sample when more than 10000 rows remain
        3) Keep only the attributes named in the vis intent
        4) Perform mark-specific processing (aggregation, binning)

        Parameters
        ----------
        vislist: list[lux.Vis]
            vis list that contains lux.Vis objects for visualization.
        ldf : lux.core.frame
            LuxDataFrame with specified intent.

        Returns
        -------
        None
        """
        for vis in vislist:
            # Every vis starts from the full content of the original dataframe.
            vis._vis_data = ldf
            was_filtered = PandasExecutor.execute_filter(vis)

            # Columns referenced by the intent; "Record" is excluded here.
            wanted_columns = {
                clause.attribute
                for clause in vis._inferred_intent
                if clause.attribute and clause.attribute != "Record"
            }

            # General Sampling
            if len(vis.data) > 10000:
                if not was_filtered:
                    # Unfiltered data is identical for every vis, so the sample
                    # is computed once and memoized on the dataframe.
                    if ldf._sampled is None:
                        ldf._sampled = vis.data.sample(frac=0.75, random_state=1)
                    vis._vis_data = ldf._sampled
                else:
                    vis._vis_data = vis.data.sample(frac=0.75, random_state=1)

            # TODO: Add some type of cap size on Nrows ?
            vis._vis_data = vis.data[list(wanted_columns)]

            if vis.mark in ("bar", "line"):
                PandasExecutor.execute_aggregate(vis, isFiltered=was_filtered)
            elif vis.mark == "histogram":
                PandasExecutor.execute_binning(vis)
            elif vis.mark == "scatter" and len(vis.data) > 10000:
                # Large scatter plots are rendered as a binned heatmap instead.
                vis._mark = "heatmap"
                PandasExecutor.execute_2D_binning(vis)