예제 #1
0
    def train(self, df, targets, conditions):
        """Validate the column specification and fit the noise scale.

        Parameters
        ----------
        df : pandas.DataFrame
            Training data, indexed by the condition and target column names.
        targets : sequence
            Exactly one entry; element 0 is the column name and element 1 the
            statistical type, which must be 'numerical' (case-insensitive).
        conditions : sequence
            Exactly two entries of the same form, both 'numerical'. Their
            columns are passed, in order, to `satellite_period_minutes`.

        Raises
        ------
        BLE
            Wrapping a ValueError when targets or conditions are invalid.
        """
        # --- Target column: exactly one, and it must be numerical.
        if len(targets) != 1:
            raise BLE(
                ValueError('OrbitalMechanics can only target one '
                           'column. Received {}'.format(targets)))
        target = targets[0]
        if target[1].lower() != 'numerical':
            raise BLE(
                ValueError('OrbitalMechanics can only target a NUMERICAL '
                           'column. Received {}'.format(targets)))
        self.targets = [target[0]]

        # --- Condition columns: exactly two, all numerical.
        if len(conditions) != 2:
            raise BLE(
                ValueError('OrbitalMechanics can only condition on '
                           'two columns. Received {}'.format(conditions)))
        if not all(c[1].lower() == 'numerical' for c in conditions):
            raise BLE(
                ValueError(
                    'OrbitalMechanics can only condition on '
                    'NUMERICAL columns. Received {}'.format(conditions)))
        self.conditions = [c[0] for c in conditions]

        # --- Keep only the relevant columns, dropping incomplete rows.
        relevant_columns = self.conditions + self.targets
        self.dataset = df[relevant_columns].dropna()
        X = self.dataset[self.conditions].as_matrix()

        # --- Noise model: compare observed periods with the theoretical ones.
        observed = self.dataset[self.targets].as_matrix().ravel()
        predicted = satellite_period_minutes(X[:, 0], X[:, 1])
        abs_errors = np.abs(observed - predicted)
        cutoff = np.percentile(abs_errors, 95)
        # np.select fills entries that fail the condition with its default, 0,
        # so errors at or above the 95th percentile contribute 0 to the mean.
        trimmed_mean = np.mean(np.select([abs_errors < cutoff], [abs_errors]))
        self.noise = np.sqrt(np.mean(trimmed_mean**2))
예제 #2
0
    def gen_hilight_colors(hl_labels=None, hl_colors=None):
        """Build a lookup from highlight labels to colors.

        With no `hl_labels` an empty dict is returned. With labels but no
        `hl_colors`, each label value is divided by the number of labels and
        mapped through `plt.cm.Set1` (so labels must support division in that
        mode). Otherwise `hl_colors` must be a list (paired positionally with
        `hl_labels`) or a dict (returned as-is after validation) with as many
        entries as `hl_labels`.
        """
        if hl_labels is None:
            return {}

        if hl_colors is None:
            n_labels = len(hl_labels)
            if n_labels <= 0:
                return {}
            scale = float(n_labels)
            return dict(
                (label, plt.cm.Set1(label / scale)) for label in hl_labels)

        if isinstance(hl_colors, list):
            lookup = dict(zip(hl_labels, hl_colors))
        elif isinstance(hl_colors, dict):
            # Every key of the dict has to be one of the requested labels.
            if any(key not in hl_labels for key in hl_colors.keys()):
                raise BLE(ValueError(
                    'hl_colors dict must have an entry '
                    'for each hl_label.'))
            lookup = hl_colors
        else:
            raise BLE(TypeError('hl_colors must be a list or dict.'))

        # Size check runs last, after the type-specific handling above.
        if len(hl_colors) != len(hl_labels):
            raise BLE(ValueError('hl_colors must have an entry for each '
                'entry in hl_labels.'))
        return lookup
    def train(self, df, targets, conditions):
        """Validate the column specification and fit the regressions.

        Parameters
        ----------
        df : pandas.DataFrame
            Training data.
        targets : sequence
            Exactly one entry; element 0 is the column name and element 1 the
            statistical type, which must be 'numerical' (case-insensitive).
        conditions : sequence
            One or more entries of the same form. Entries typed 'categorical'
            are treated as categorical regressors; every other type is
            treated as numerical.

        Raises
        ------
        BLE
            Wrapping a ValueError when targets or conditions are invalid.
        """
        # Obtain the targets column.
        if len(targets) != 1:
            # The check demands exactly one target, so the message says so
            # (it previously claimed "at least one").
            raise BLE(
                ValueError('MultipleRegression requires exactly one column '
                           'in targets. Received {}'.format(targets)))
        if targets[0][1].lower() != 'numerical':
            raise BLE(
                ValueError('MultipleRegression can only regress NUMERICAL '
                           'columns. Received {}'.format(targets)))
        self.targets = [targets[0][0]]

        # Obtain the condition columns.
        if len(conditions) < 1:
            raise BLE(
                ValueError(
                    'MultipleRegression requires at least one '
                    'column in conditions. Received {}'.format(conditions)))
        self.conditions_categorical = []
        self.conditions_numerical = []
        for c in conditions:
            if c[1].lower() == 'categorical':
                self.conditions_categorical.append(c[0])
            else:
                self.conditions_numerical.append(c[0])
        # Numerical columns first, then categorical ones.
        self.conditions = self.conditions_numerical + \
            self.conditions_categorical

        # The dataset.
        self.dataset = pd.DataFrame()
        # Lookup for categoricals to code.
        self.categories_to_val_map = dict()
        # Training set (regressors and labels)
        self.X_numerical = np.ndarray(0)
        self.X_categorical = np.ndarray(0)
        self.Y = np.ndarray(0)
        # Linear regressors.
        self.mr_partial = LinearRegression()
        self.mr_full = LinearRegression()

        # Preprocess the data (the placeholders above are overwritten here).
        self.dataset = utils.extract_sklearn_dataset(self.conditions,
                                                     self.targets, df)
        self.categories_to_val_map = utils.build_categorical_to_value_map(
            self.conditions_categorical, self.dataset)
        self.X_categorical = utils.extract_sklearn_features_categorical(
            self.conditions_categorical, self.categories_to_val_map,
            self.dataset)
        self.X_numerical = utils.extract_sklearn_features_numerical(
            self.conditions_numerical, self.dataset)
        self.Y = utils.extract_sklearn_univariate_target(
            self.targets, self.dataset)
        # Train the multiple regression.
        self._train_mr()
예제 #4
0
 def train(self, df, targets, conditions):
     """Validate the column specification and fit the random forests.

     `targets` must hold exactly one entry whose element 1 is 'categorical'
     (case-insensitive); `conditions` must hold at least one entry. Element 0
     of each entry is the column name. Invalid input raises BLE wrapping a
     ValueError.
     """
     # --- Target column.
     if len(targets) != 1:
         raise BLE(
             ValueError('RandomForest requires exactly one column in '
                        'targets. Received {}'.format(targets)))
     target = targets[0]
     if target[1].lower() != 'categorical':
         raise BLE(
             ValueError('RandomForest can only classify CATEGORICAL '
                        'columns. Received {}'.format(targets)))
     self.targets = [target[0]]

     # --- Condition columns, split by statistical type.
     if len(conditions) < 1:
         raise BLE(
             ValueError('RandomForest requires at least one column in '
                        'conditions. Received {}'.format(conditions)))
     self.conditions_categorical = [
         c[0] for c in conditions if c[1].lower() == 'categorical'
     ]
     self.conditions_numerical = [
         c[0] for c in conditions if c[1].lower() != 'categorical'
     ]
     # Numerical columns come first in the combined ordering.
     self.conditions = (
         self.conditions_numerical + self.conditions_categorical)

     # --- Placeholders; all of these are replaced during preprocessing.
     self.dataset = pd.DataFrame()
     self.categories_to_val_map = dict()
     self.X_numerical = np.ndarray(0)
     self.X_categorical = np.ndarray(0)
     self.Y = np.ndarray(0)

     # --- The two classifiers (both with 100 trees).
     self.rf_partial = RandomForestClassifier(n_estimators=100)
     self.rf_full = RandomForestClassifier(n_estimators=100)

     # --- Preprocess the data.
     self.dataset = utils.extract_sklearn_dataset(
         self.conditions, self.targets, df)
     self.categories_to_val_map = utils.build_categorical_to_value_map(
         self.conditions_categorical, self.dataset)
     self.X_categorical = utils.extract_sklearn_features_categorical(
         self.conditions_categorical,
         self.categories_to_val_map,
         self.dataset)
     self.X_numerical = utils.extract_sklearn_features_numerical(
         self.conditions_numerical, self.dataset)
     self.Y = utils.extract_sklearn_univariate_target(
         self.targets, self.dataset)

     # --- Fit.
     self._train_rf()
예제 #5
0
    def _compute_targets_distribution(self, conditions):
        """Return the class distribution of the target given `conditions`.

        Parameters
        ----------
        conditions : dict
            {feature_col: val}; must contain every column in
            `self.conditions`.

        Returns
        -------
        (distribution, classes)
            `distribution` is the first row returned by `predict_proba` of
            the classifier that was used, and `classes` is that same
            classifier's `classes_`, for looking up labels.

        Raises
        ------
        BLE
            Wrapping a ValueError when a condition column is missing.
        """
        if not set(self.conditions).issubset(set(conditions.keys())):
            raise BLE(
                ValueError('Must specify values for all the conditionals.\n'
                           'Received: {}\n'
                           'Expected: {}'.format(
                               conditions, self.conditions_numerical +
                               self.conditions_categorical)))

        # Are there any category values in conditions which never appeared
        # during training? If yes, we need to run the partial RF.
        unseen = any(
            conditions[cat] not in self.categories_to_val_map[cat]
            for cat in self.conditions_categorical)

        X_numerical = [conditions[col] for col in self.conditions_numerical]
        if unseen:
            forest = self.rf_partial
            # One sample as a 2-D (1, n_features) array.
            inputs = np.array([X_numerical])
        else:
            X_categorical = [
                conditions[col] for col in self.conditions_categorical
            ]
            X_categorical = utils.binarize_categorical_row(
                self.conditions_categorical, self.categories_to_val_map,
                X_categorical)
            forest = self.rf_full
            inputs = np.hstack((X_numerical, X_categorical)).reshape(1, -1)

        distribution = forest.predict_proba(inputs)
        # Report the classes of the classifier that produced the
        # distribution (the full branch used to report rf_partial's).
        return distribution[0], forest.classes_
예제 #6
0
def pairplot_vars(bdb,
                  varnames,
                  colorby=None,
                  generator_name=None,
                  population_name=None,
                  **kwargs):
    """Use pairplot to show the given variables.

  See help(pairplot) for more plot options.

  The selected columns keep the order given in varnames; colorby, when it
  is not already one of them, is appended last.

  Parameters
  ----------
  bdb: __population_to_bdb__
  varnames: list of one or more variables to plot.
  generator_name: __generator_name__
  population_name: __population_name__
  colorby: categorical variable to color all of the plots by.

  Returns
  -------
  figure: a matplotlib.figure.Figure
  """
    if len(varnames) < 1:
        raise BLE(ValueError('Pairplot at least one variable.'))
    if colorby is None:
        qvars = varnames
    else:
        # De-duplicate while preserving order. A set would scramble the
        # column order of the query (and therefore of the plot).
        qvars = []
        for name in list(varnames) + [colorby]:
            if name not in qvars:
                qvars.append(name)
    # NOTE(review): names are interpolated into the query without escaping
    # embedded double quotes, and population_name is not quoted at all.
    query_columns = '''"%s"''' % '''", "'''.join(qvars)
    bql = '''SELECT %s FROM %s''' % (query_columns, population_name)
    df = bqlu.query(bdb, bql)
    return pairplot(bdb,
                    df,
                    generator_name=generator_name,
                    colorby=colorby,
                    **kwargs)
예제 #7
0
 def initialize_session_capture(self, name):
     """Settle `self.session_capture_name`, or insist that the user does.

     Precedence: an already-set value is kept; otherwise a non-None `name`
     is used; otherwise the current working directory and its ancestors
     (excluding the filesystem root itself) are searched for a file named
     OPTFILE, whose contents become the value (the text 'False' is stored
     as the boolean False). If none of these yields a value, BLE is raised.
     """
     if self.session_capture_name is not None:
         return
     if name is not None:
         self.session_capture_name = name
         return

     # Walk upwards from the working directory looking for the choice file.
     current = os.getcwd()
     parent = os.path.dirname(current)
     while current != parent:  # Stop once the root is reached.
         candidate = os.path.join(current, OPTFILE)
         try:
             with open(candidate, 'r') as optinfile:
                 contents = optinfile.read()
         except IOError:
             # Unreadable or absent here; try one level up.
             pass
         else:
             if contents == 'False':
                 self.session_capture_name = False
             else:
                 self.session_capture_name = contents
             break
         current, parent = parent, os.path.dirname(parent)

     # Neither an argument nor a choice file: force the decision.
     if self.session_capture_name is None:
         raise BLE(
             "Please set session_capture_name option to Population.__init__\n"
             "  to either opt-in or opt-out of sending details of your usage of\n"
             "  this software to the MIT Probabilistic Computing Group.\n\n"
             "If you see this in one of our example notebooks,\n"
             "  return to the starting page, the Index.ipynb, to\n"
             "  make that choice.")
예제 #8
0
def barplot(bdb, df):
    """Make bar-plot from categories and their heights.

    First column specifies names; second column specifies heights.
    Columns are addressed by position, not by label.

    Parameters
    ----------
    bdb : __population_to_bdb__
    df : __specifier_to_df__

    Returns
    ----------
    figure: matplotlib.figure.Figure

    Raises
    ----------
    BLE wrapping a ValueError if `df` does not have exactly two columns.
    """
    if df.shape[1] != 2:
        raise BLE(
            ValueError('Need two columns of output from SELECT for barplot.'))
    n_bars = df.shape[0]
    # Half an inch of figure width per bar; the height is fixed at 5 inches.
    width_inches = n_bars / 2.0
    figure, ax = plt.subplots(figsize=(width_inches, 5))

    # .iloc is purely positional; the removed .ix indexer could fall back to
    # label lookup when the column labels are integers.
    names = df.iloc[:, 0].values
    heights = df.iloc[:, 1].values

    ax.bar([x - .5 for x in range(n_bars)],
           heights,
           color='#333333',
           edgecolor='#333333')
    ax.set_xticks(range(n_bars))
    ax.set_xticklabels(names, rotation=90)
    ax.set_xlim([-1, n_bars - .5])
    ax.set_ylabel(df.columns[1])
    ax.set_xlabel(df.columns[0])

    return figure
예제 #9
0
def draw_crosscat(bdb, generator, modelno, row_label_col=None):
    """Draw crosscat model from the specified generator.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        Active BayesDB instance.
    generator : str
        Name of generator.
    modelno: int
        Number of model to draw.
    row_label_col : optional
        Passed through unchanged to `draw_state`.

    Returns
    ----------
    figure: matplotlib.figure.Figure

    Raises
    ----------
    BLE wrapping a ValueError if no generator has that name, or if its
    metamodel is not crosscat.
    """
    bql = '''
        SELECT tabname, metamodel FROM bayesdb_generator
            WHERE name = ?
    '''
    cursor = bdb.execute(bql, (generator,))
    try:
        table_name, metamodel = cursor.next()
    except StopIteration:
        # No matching row: report it instead of leaking StopIteration.
        raise BLE(ValueError('No such generator: %s' % (generator,)))

    if metamodel.lower() != 'crosscat':
        raise BLE(ValueError(
            'Metamodel for generator %s (%s) should be crosscat' %
            (generator, metamodel)))

    figure, axes = plt.subplots(tight_layout=False)
    draw_state(bdb, table_name, generator,
        modelno, ax=axes, row_label_col=row_label_col)

    return figure
예제 #10
0
def do_hist(data_srs, **kwargs):
    """Draw a histogram of the first column of `data_srs` and return the axes.

    Recognized keyword arguments: `ax` (defaults to the current axes),
    `bdb`, `generator_name`, `dtype` (looked up via `get_bayesdb_col_type`
    when omitted), `show_contour` (forwarded as seaborn's `kde` flag) and
    `colors`, a mapping from values of the second ("dummy") column to the
    color used for the rows carrying that value.

    Raises BLE wrapping a ValueError when `data_srs` has more than one column
    but no `colors` were given.
    """
    ax = kwargs.get('ax')
    bdb = kwargs.get('bdb')
    dtype = kwargs.get('dtype')
    show_contour = kwargs.get('show_contour')
    generator_name = kwargs.get('generator_name')
    colors = kwargs.get('colors')

    if dtype is None:
        dtype = get_bayesdb_col_type(
            data_srs.columns[0], data_srs,
            bdb=bdb, generator_name=generator_name)

    if ax is None:
        ax = plt.gca()

    is_2d = len(data_srs.shape) > 1
    if is_2d and colors is None and data_srs.shape[1] != 1:
        raise BLE(
            ValueError('If a dummy column is specified,'
                       ' colors must also be specified.'))

    data_srs = data_srs.dropna()

    if dtype == 'categorical':
        vals, uvals, _ = conv_categorical_vals_to_numeric(data_srs.ix[:, 0])
        n_bins = len(uvals)
        if colors is None:
            ax.hist(vals, bins=n_bins)
        else:
            # One stack per dummy value, drawn in that value's color.
            pairs = [(color, vals[data_srs.ix[:, 1].values == val])
                     for val, color in colors.iteritems()]
            color_lst = [color for color, _ in pairs]
            stacks = [subval for _, subval in pairs]
            ax.hist(stacks,
                    bins=n_bins,
                    color=color_lst,
                    alpha=.9,
                    histtype='barstacked',
                    rwidth=1.0)
        ax.set_xticks(range(n_bins))
        ax.set_xticklabels(uvals)
        return ax

    do_kde = show_contour
    if colors is None:
        values = drop_inf_and_nan(data_srs)
        bins = seaborn_broken_bins(values)
        sns.distplot(values, kde=do_kde, ax=ax, bins=bins)
    else:
        for val, color in colors.iteritems():
            subdf = data_srs.loc[data_srs.ix[:, 1] == val]
            values = drop_inf_and_nan(subdf.ix[:, 0])
            # Seaborn would break on fewer than two points, so skip those.
            if len(values) >= 2:
                bins = seaborn_broken_bins(values)
                sns.distplot(values, kde=do_kde, ax=ax, color=color, bins=bins)

    return ax
예제 #11
0
 def logpdf(self, value, conditions):
     """Gaussian log density of `value` around the theoretical period.

     `conditions` must supply a value for every column in `self.conditions`;
     otherwise BLE wrapping a ValueError is raised. The mean is
     `satellite_period_minutes` of the two conditioned quantities and the
     scale is `self.noise`.
     """
     required = set(self.conditions)
     supplied = set(conditions.keys())
     if not required.issubset(supplied):
         raise BLE(
             ValueError('Must specify values for all the conditionals.\n'
                        'Received: {}\n'
                        'Expected: {}'.format(conditions, self.conditions)))
     apogee_km, perigee_km = self._conditions(conditions)
     mean_period = satellite_period_minutes(apogee_km, perigee_km)
     return logpdfGaussian(value, mean_period, self.noise)
예제 #12
0
def gen_collapsed_legend_from_dict(hl_colors_dict,
                                   loc=0,
                                   title=None,
                                   fontsize='medium',
                                   wrap_threshold=1000):
    """Creates a legend with entries grouped by color.

    For example, if a plot has multiple labels associated with the same color
    line, instead of generating a legend entry for each label, labels with the
    same colored line will be collapsed into longer, comma-separated labels.

    Parameters
    ----------
    hl_colors_dict : dict
        A dict of label, color pairs. Colors can be strings e.g. 'deeppink' or
        rgb or rgba tuples. Colors are used as dict keys, so they must be
        hashable.
    loc : matplotlib compatible
        any matplotlib-compatible legend location identifier
    title : str
        legend title
    fontsize : int or str
        legend entry and title fontsize (the default is 'medium')
    wrap_threshold : int
        max number of characters before wordwrap

    Returns
    -------
    legend : matplotlib.legend

    Raises
    ------
    BLE wrapping a TypeError if `hl_colors_dict` is not a dict.
    """
    if not isinstance(hl_colors_dict, dict):
        raise BLE(TypeError("hl_colors_dict must be a dict"))

    from collections import defaultdict
    # Group the (stringified) labels under the color they share.
    labels_by_color = defaultdict(list)
    for label, color in hl_colors_dict.items():
        labels_by_color[color].append(str(label))

    legend_artists = []
    legend_labels = []
    for color, labels in labels_by_color.items():
        # Sorted, comma-separated labels, wrapped at wrap_threshold chars.
        text = "\n".join(wrap(", ".join(sorted(labels)), wrap_threshold))
        legend_artists.append(plt.Line2D((0, 1), (0, 0), color=color, lw=3))
        legend_labels.append(text)

    legend = plt.legend(legend_artists,
                        legend_labels,
                        loc=loc,
                        title=title,
                        fontsize=fontsize)

    return legend
예제 #13
0
def plot_crosscat_chain_diagnostics(bdb, diagnostic, generator):
    """Plot diagnostics for all models of generator.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        Active BayesDB instance.
    diagnostic : str
        Valid (crosscat) diagnostics are:
            - logscore: log score of the model
            - num_views: the number of views in the model
            - column_crp_alpha: CRP alpha over columns
    generator : str
        Name of the generator to diagnose.

    Returns
    ----------
    figure: matplotlib.figure.Figure

    Raises
    ----------
    BLE wrapping a ValueError if `diagnostic` is not one of the valid names.
    """
    valid_diagnostics = ['logscore', 'num_views', 'column_crp_alpha']
    if diagnostic not in valid_diagnostics:
        # Both placeholders need a value; supplying only the joined list made
        # the formatting itself fail with a TypeError.
        raise BLE(ValueError(
            'Unknown diagnostic %s.\n'
            'Please choose one of the following instead: %s\n'
            % (diagnostic, ', '.join(valid_diagnostics))))

    generator_id = bayeslite.core.bayesdb_get_generator(bdb, generator)

    # Get model numbers. Do not rely on there to be a diagnostic for every
    # model.
    bql = '''
        SELECT modelno, COUNT(modelno) FROM bayesdb_crosscat_diagnostics
            WHERE generator_id = ? GROUP BY modelno
    '''
    df = bu.cursor_to_df(bdb.execute(bql, (generator_id,)))
    models = df['modelno'].astype(int).values

    figure, ax = plt.subplots(tight_layout=True, figsize=(10, 5))
    colors = sns.color_palette("GnBu_d", len(models))
    for i, modelno in enumerate(models):
        # `diagnostic` is interpolated as a column name; that is safe only
        # because it was checked against valid_diagnostics above.
        bql = '''
            SELECT {}, iterations FROM bayesdb_crosscat_diagnostics
                WHERE modelno = ? AND generator_id = ?
                ORDER BY iterations ASC
        '''.format(diagnostic)
        df = bu.cursor_to_df(bdb.execute(bql, (modelno, generator_id)))
        # The palette has one entry per model found, so index it by position
        # (i); model numbers need not be contiguous or start at zero.
        ax.plot(df['iterations'].values, df[diagnostic].values,
                 c=colors[i], alpha=.7, lw=2)

        ax.text(df['iterations'].values[-1], df[diagnostic].values[-1],
                str(modelno), color=colors[i])

    ax.set_xlabel('Iteration')
    ax.set_ylabel(diagnostic)
    ax.set_title('%s for each model in %s' % (diagnostic, generator,))

    return figure
예제 #14
0
 def simulate(self, n_samples, conditions):
     """Draw `n_samples` noisy periods given the conditioned quantities.

     `conditions` must supply a value for every column in `self.conditions`;
     otherwise BLE wrapping a ValueError is raised. Each sample is the
     theoretical period plus a draw from `self.prng.normal` with scale
     `self.noise`. Returns a list.
     """
     required = set(self.conditions)
     supplied = set(conditions.keys())
     if not required.issubset(supplied):
         raise BLE(
             ValueError('Must specify values for all the conditionals.\n'
                        'Received: {}\n'
                        'Expected: {}'.format(conditions, self.conditions)))
     apogee_km, perigee_km = self._conditions(conditions)
     period_minutes = satellite_period_minutes(apogee_km, perigee_km)
     jitter = self.prng.normal(scale=self.noise, size=n_samples)
     return list(period_minutes + jitter)
예제 #15
0
 def initialize(self):
     """Open the bdb, load the population's table if needed, and make sure
     a generator exists.

     When `self.bdb` is already set, only `check_representation` runs.
     Otherwise the bdb at `self.bdb_path` is opened; if it has no table
     named `self.name`, the table is created from `self.df` or, failing
     that, from `self.csv_path`. With neither source available, BLE
     wrapping a ValueError is raised. If no generators exist afterwards,
     one is created with crosscat.
     """
     if self.bdb:
         self.check_representation()
         return

     self.bdb = bayeslite.bayesdb_open(self.bdb_path)
     table_present = bayeslite.core.bayesdb_has_table(self.bdb, self.name)
     if not table_present:
         if self.df is not None:
             bayeslite.read_pandas.bayesdb_read_pandas_df(
                 self.bdb, self.name, self.df,
                 create=True, ifnotexists=True)
         elif self.csv_path:
             bayeslite.bayesdb_read_csv_file(
                 self.bdb, self.name, self.csv_path,
                 header=True, create=True, ifnotexists=True)
         else:
             # Nothing to load from: explain what the bdb does contain.
             tables = self.list_tables()
             metamodels = self.list_metamodels()
             if len(tables) + len(metamodels) == 0:
                 raise BLE(
                     ValueError(
                         "No data sources specified, and an empty bdb."))
             message = ("The name of the population must be the same"
                        " as a table in the bdb, one of: " +
                        ", ".join(tables) +
                        "\nNote also that the bdb has the following"
                        " metamodels defined: " + ", ".join(metamodels))
             raise BLE(ValueError(message))

     self.generators = self.query('''SELECT * FROM bayesdb_generator''')
     if len(self.generators) == 0:
         # %t and %g are left for self.query to fill in (presumably the
         # table and generator names -- confirm against self.query).
         row_count = self.query('''SELECT COUNT(*) FROM %t''').ix[0, 0]
         assert 0 < row_count
         self.query('''
     CREATE GENERATOR %g IF NOT EXISTS FOR %t USING crosscat( GUESS(*) )''')
     self.check_representation()
예제 #16
0
def get_metadata(bdb, generator_name, modelno):
    """Return the decoded `theta_json` for one model of a generator.

    Looks the row up in `bayesdb_crosscat_theta` by generator id and model
    number. Raises BLE wrapping a ValueError when no such row exists.
    """
    sql = '''
        SELECT theta_json FROM bayesdb_crosscat_theta
            WHERE generator_id = ? and modelno = ?
    '''
    generator_id = bayeslite.core.bayesdb_get_generator(bdb, generator_name)
    cursor = bdb.sql_execute(sql, (generator_id, modelno))
    try:
        row = cursor.next()
    except StopIteration:
        # An exhausted cursor means the query matched nothing.
        raise BLE(ValueError('Could not find generator with '
            'name {}, or incorrect model number.'.format(generator_name)))
    theta_json = row[0]
    return json.loads(theta_json)
예제 #17
0
def get_M_c(bdb, generator_name):
    """Return the decoded crosscat `metadata_json` for a generator.

    Looks the row up in `bayesdb_crosscat_metadata` by generator id.

    Raises
    ------
    BLE wrapping a ValueError when the generator has no crosscat metadata.
    """
    generator_id = bayeslite.core.bayesdb_get_generator(bdb, generator_name)
    sql = '''
        SELECT metadata_json FROM bayesdb_crosscat_metadata
            WHERE generator_id = ?
    '''
    cursor = bdb.sql_execute(sql, (generator_id,))
    try:
        row = cursor.next()
    except StopIteration:
        # Only the message belongs in the ValueError; the bdb handle used to
        # be passed as a first argument, turning the message into a tuple.
        raise BLE(ValueError('No crosscat metadata for generator: %s'
            % (generator_name,)))
    else:
        return json.loads(row[0])
예제 #18
0
def quick_explore_vars(self, varnames, plotfile=None, nsimilar=20):
    """Show dependence probabilities and neighborhoods based on those.

    First pair-plots `varnames` and reports their pairwise dependence
    probabilities; then, for each variable, reports the `nsimilar` columns
    with the highest dependence probability with it. Results go to
    `self.logger`; nothing is returned.

    varnames: list of strings
        At least two column names to look at dependence probabilities of,
        and to explore neighborhoods of.
    plotfile: string, optional
        When given, heatmaps are also sent to `self.logger.plot`, named
        `plotfile + '-deps'` and `plotfile + '-' + <column>`.
    nsimilar: positive integer
        The size of the neighborhood to explore.

    Raises BLE wrapping a ValueError if fewer than two names are given.
    """
    if len(varnames) < 2:
        raise BLE(ValueError('Need to explore at least two variables.'))
    self.pairplot_vars(varnames)
    # Double-quote each name and join with commas for use inside BQL.
    query_columns = '''"%s"''' % '''", "'''.join(varnames)
    with self.bdb.savepoint():
        # The pairwise estimates are materialized once into a temp table and
        # then read twice: in full (deps) and as name0 < name1 rows only
        # (triangle), ordered by value.
        temptab = self.bdb.temp_table_name()
        self.query('''
      CREATE TEMP TABLE %s AS
        ESTIMATE DEPENDENCE PROBABILITY
          FROM PAIRWISE COLUMNS OF %s FOR %s
    ''' % (temptab, self.generator_name, query_columns))
        deps = self.query('SELECT * FROM %s' % (temptab, ))
        deps.columns = ['genid', 'name0', 'name1', 'value']
        triangle = self.query('''
      SELECT * FROM %s WHERE name0 < name1 ORDER BY value DESC
    ''' % (temptab, ))
        triangle.columns = ['genid', 'name0', 'name1', 'value']
    if plotfile:
        self.logger.plot(plotfile + '-deps', self.heatmap(deps))
    self.logger.result("Pairwise dependence probability for: %s\n%s\n\n",
                       query_columns, triangle)

    for col in varnames:
        # Columns ranked by dependence probability with `col`, top nsimilar.
        neighborhood = self.query(
            '''ESTIMATE *, DEPENDENCE PROBABILITY WITH "%s"
       AS "Probability of Dependence with %s"
       FROM COLUMNS OF %s
       ORDER BY "Probability of Dependence with %s"
       DESC LIMIT %d;''' % (col, col, self.generator_name, col, nsimilar))
        neighbor_columns = ('''"%s"''' %
                            '''", "'''.join(neighborhood["name"].tolist()))
        # Pairwise dependence among the members of that neighborhood.
        deps = self.query('''ESTIMATE DEPENDENCE PROBABILITY
        FROM PAIRWISE COLUMNS OF %s
        FOR %s;''' % (self.generator_name, neighbor_columns))
        deps.columns = ['genid', 'name0', 'name1', 'value']
        if plotfile:
            self.logger.plot(plotfile + "-" + col, self.heatmap(deps))
        self.logger.result(
            "Pairwise dependence probability of %s with its " +
            "strongest dependents:\n%s\n\n", col, neighborhood)
예제 #19
0
def variable_stattypes(bdb, generator_name=None):
    """The modeled statistical types of each variable in order.

    Returns a DataFrame with columns colno, name and stattype, ordered by
    colno, for the given generator.

    Raises
    ------
    AssertionError if `generator_name` is empty or None.
    BLE wrapping a NameError if there is no such generator.
    """
    # The docstring has to be the first statement of the function; it used
    # to follow this assert and was therefore not a docstring at all.
    assert generator_name
    if not bayeslite.core.bayesdb_has_generator_default(bdb, generator_name):
        raise BLE(NameError('No such generator {}'.format(generator_name)))
    sql = '''
        SELECT c.colno AS colno, c.name AS name,
                gc.stattype AS stattype
            FROM bayesdb_generator AS g,
                (bayesdb_column AS c LEFT OUTER JOIN
                    bayesdb_generator_column AS gc
                    USING (colno))
            WHERE g.id = ? AND g.id = gc.generator_id
                AND g.tabname = c.tabname
            ORDER BY colno ASC;
    '''
    generator_id = bayeslite.core.bayesdb_get_generator_default(
        bdb, generator_name)
    curs = bdb.sql_execute(sql, bindings=(generator_id, ))
    return cursor_to_df(curs)
예제 #20
0
def describe_generator(bdb, generator_name):
    """Describe `generator_name` as a DataFrame.

    The frame holds the id, name, tabname and metamodel columns of the
    matching `bayesdb_generator` row. Raises BLE wrapping a NameError when
    the generator does not exist.

    Examples
    --------

    >>> bdbcontrib.describe_generator(bdb, 'employees_gen')
    id |          name |   tabname | metamodel
    ---+---------------+-----------+----------
    3  | employees_gen | employees |  crosscat
    """
    generator_exists = bayeslite.core.bayesdb_has_generator_default(
        bdb, generator_name)
    if not generator_exists:
        raise BLE(NameError('No such generator {}'.format(generator_name)))
    sql = '''
            SELECT id, name, tabname, metamodel
                FROM bayesdb_generator
                WHERE name = ?
        '''
    return cursor_to_df(bdb.sql_execute(sql, bindings=(generator_name, )))
    def _compute_targets_distribution(self, conditions):
        """Return (conditional mean, noise scale) of the target.

        `conditions` is a dict {feature_col: val} that must cover every
        column in `self.conditions`; otherwise BLE wrapping a ValueError is
        raised. The full regression is used unless some categorical value
        was never seen in training, in which case the partial regression
        (numerical columns only) is used instead.
        """
        required = set(self.conditions)
        if not required.issubset(set(conditions.keys())):
            raise BLE(
                ValueError('Must specify values for all the conditionals.\n'
                           'Received: {}\n'
                           'Expected: {}'.format(
                               conditions, self.conditions_numerical +
                               self.conditions_categorical)))

        # Any categorical value absent from the training-time lookup forces
        # the partial regression.
        unseen = any(
            conditions[cat] not in self.categories_to_val_map[cat]
            for cat in self.conditions_categorical)

        numeric_row = [conditions[col] for col in self.conditions_numerical]

        if unseen:
            regressor = self.mr_partial
            features = np.array([numeric_row])
            assert features.shape == (1, len(self.conditions_numerical))
        else:
            regressor = self.mr_full
            raw_categories = [
                conditions[col] for col in self.conditions_categorical
            ]
            binarized = utils.binarize_categorical_row(
                self.conditions_categorical, self.categories_to_val_map,
                raw_categories)
            # Single row: numerical values followed by the binarized ones.
            features = np.concatenate(([numeric_row], [binarized]), axis=1)
            expected_width = len(self.conditions_numerical) + len(binarized)
            assert features.shape == (1, expected_width)

        predictions = regressor.predict(features)
        noise = self.mr_partial_noise if unseen else self.mr_full_noise
        return predictions[0], noise
예제 #22
0
def describe_generator_models(bdb, generator_name):
    """Describe the models of `generator_name` as a DataFrame.

    One row per model, with its modelno and iterations. Raises BLE wrapping
    a NameError when the generator does not exist.

    Examples
    --------

    >>> bdbcontrib.describe_generator_models(bdb, 'employees_gen')
    modelno | iterations
    --------+-----------
          0 | 100
    """
    generator_exists = bayeslite.core.bayesdb_has_generator_default(
        bdb, generator_name)
    if not generator_exists:
        raise BLE(NameError('No such generator {}'.format(generator_name)))
    sql = '''
        SELECT modelno, iterations FROM bayesdb_generator_model
            WHERE generator_id = ?
        '''
    # Models are keyed by generator id, so resolve the name first.
    generator_id = bayeslite.core.bayesdb_get_generator_default(
        bdb, generator_name)
    return cursor_to_df(bdb.sql_execute(sql, bindings=(generator_id, )))
예제 #23
0
def get_bayesdb_col_type(column_name,
                         df_column,
                         bdb=None,
                         generator_name=None):
    """Return the statistical type used to plot a column.

    If both `bdb` and `generator_name` are given, the generator's stattype
    for `column_name` is returned (with 'cyclic' reported as 'numerical').
    If they are missing, or the lookup raises IndexError, the type is guessed
    from the data: object/str columns and columns with fewer than 30 distinct
    values are 'categorical', everything else is 'numerical'.

    Raises
    ------
    BLE wrapping a TypeError if `df_column` is a DataFrame, which indicates
    that several result columns share the name `column_name`.
    """
    # If column_name is a column label (not a short name!) then the modeltype
    # of the column will be returned otherwise we guess.

    if isinstance(df_column, pd.DataFrame):
        raise BLE(
            TypeError(
                'Multiple columns in the query result have the same name (%s).'
                % (column_name, )))

    def guess_column_type(df_column):
        pd_type = df_column.dtype
        # `np.object` was only an alias of the builtin `object` and has been
        # removed from NumPy; comparing against `object` is equivalent.
        if pd_type is str or pd_type == object:
            return 'categorical'
        if len(df_column.unique()) < 30:
            return 'categorical'
        return 'numerical'

    if bdb is not None and generator_name is not None:
        try:
            coltype = bqlu.get_column_stattype(bdb, generator_name,
                                               column_name)
            # XXX: Force cyclic -> numeric because there is no need to plot
            # cyclic data any differently until we implement rose plots. See
            # http://matplotlib.org/examples/pie_and_polar_charts/polar_bar_demo.html
            # for an example.
            if coltype.lower() == 'cyclic':
                coltype = 'numerical'
            return coltype
        except IndexError:
            return guess_column_type(df_column)
    else:
        return guess_column_type(df_column)
예제 #24
0
def describe_table(bdb, table_name):
    """Return a DataFrame listing the columns recorded for `table_name`.

    The rows come from the `bayesdb_column` table, ordered by table name and
    then column number.

    Raises
    ------
    BLE(NameError)
        If `bayeslite.core.bayesdb_has_table` reports no such table.

    Examples
    --------
    >>> bdbcontrib.describe_table(bdb, 'employees')
    tabname   | colno |    name
    ----------+-------+--------
    employees |     0 |    name
    employees |     1 |     age
    employees |     2 |  weight
    employees |     3 |  height
    """
    table_known = bayeslite.core.bayesdb_has_table(bdb, table_name)
    if not table_known:
        raise BLE(NameError('No such table {}'.format(table_name)))

    column_query = '''
        SELECT tabname, colno, name
            FROM bayesdb_column
            WHERE tabname=?
            ORDER BY tabname ASC, colno ASC
        '''
    return cursor_to_df(
        bdb.sql_execute(column_query, bindings=(table_name, )))
예제 #25
0
def quick_similar_rows(self, identify_row_by, nsimilar=10):
    """Explore rows similar to the identified one.

    Parameters
    ----------
    identify_row_by : dict
        Dictionary of column names to their values. These will be turned into
        a WHERE clause in BQL, and must identify one unique row.
    nsimilar : positive integer
        The number of similar rows to retrieve.

    Returns
    -------
    The result of `self.query` on the temporary similarity table, ordered by
    descending similarity.

    Raises
    ------
    BLE(NotImplementedError)
        If `identify_row_by` does not match exactly one row.
    """
    import hashlib
    digest_input = '\x00'.join([repr(identify_row_by), str(self.status)])
    if not isinstance(digest_input, bytes):
        # hashlib only accepts bytes on Python 3; on Python 2 the joined str
        # already is bytes and is used unchanged.
        digest_input = digest_input.encode('utf-8')
    table_name = 'tmptbl_' + hashlib.md5(digest_input).hexdigest()
    column_name = 'similarity_to_' + "__".join(
        re.sub(r'\W', '_', str(val)) for val in identify_row_by.values())

    # Build "col = ?" constraints and the matching list of bound values.
    query_params = []
    query_columns = []
    for k, v in identify_row_by.items():
        query_columns.append('''%s = ? ''' % bayeslite.bql_quote_name(k))
        query_params.append(v)
    query_attrs = ' and '.join(query_columns)

    with self.bdb.savepoint():
        # The WHERE clause contains one `?` per constraint, so the values
        # must be bound here just as they are for the creation query below.
        row_exists = self.query('SELECT COUNT(*) FROM %s WHERE %s;' %
                                (self.name, query_attrs), query_params)
        n_matches = row_exists.iloc[0, 0]
        if n_matches != 1:
            raise BLE(
                NotImplementedError(
                    'identify_row_by found %d rows instead of exactly 1 in %s.'
                    % (n_matches, self.csv_path)))
        # NOTE(review): LIMIT is applied while creating the table, before the
        # ORDER BY below -- confirm this really yields the most similar rows.
        creation_query = ('''CREATE TEMP TABLE IF NOT EXISTS %s AS ESTIMATE *,
                         SIMILARITY TO (%s) AS %s FROM %%g LIMIT %d;''' %
                          (table_name, query_attrs, column_name, nsimilar))
        self.query(creation_query, query_params)
        result = self.query('''SELECT * FROM %s ORDER BY %s DESC;''' %
                            (table_name, column_name))
    return result
예제 #26
0
def _pairplot(df,
              bdb=None,
              generator_name=None,
              stattypes=None,
              show_contour=False,
              colorby=None,
              show_missing=False,
              show_full=False,
              **kwargs):
    """Plots the columns in `df` in a facet grid.

    Supports the following pairs:
    - categorical-categorical pairs are displayed as a heatmap
    - continuous-continuous pairs are displayed as a kdeplot
    - categorical-continuous pairs are displayed on a violin plot

    Parameters
    ----------
    df : pandas.DataFrame
        The input data---the result of a BQL/SQL query
    bdb : bayeslite.BayesDB (optional)
        The BayesDB object associated with `df`. Having the BayesDB object and
        the generator for the data allows pairplot to choose plot types.
    generator_name : str
        The name of generator associated with `df` and `bdb`.
    stattypes : dict, optional {column_name: "categorical"|"numerical"}
        If you do not specify a generator name, have a column that the
        generator doesn't know about, or would like to override the
        statistical types the generator has for a given variable, then pass
        this dict of column names to types.
    show_contour : bool
        If True, KDE contours are plotted on top of scatter plots
        and histograms.
    show_missing : bool
        If True, rows with one missing value are plotted as lines on scatter
        plots.
    colorby : str
        Name of a column to use to color data points in histograms and scatter
        plots. Matched case-insensitively against the columns of `df`; the
        column must be categorical.
    show_full : bool
        Show full pairwise plots, rather than only lower triangular plots.
    kwargs : dict
        Options to pass through to underlying plotting function (for pairs).

    Returns
    -------
    figure : matplotlib.figure.Figure
        A num_columns by num_columns Gridspec of pairplot axes.

    Raises
    ------
    BLE(ValueError)
        If `colorby` matches no column or several columns, or if the colorby
        column is not categorical.

    Notes
    -----
    Support soon for ordered continuous combinations. It may be best
    to plot all ordered continuous pairs as heatmap.
    """
    # NOTE:Things to consider:
    # - String values are a possibility (categorical)
    # - who knows what the columns are named. What if the user selects columns
    #   as shortname?
    # - where to handle dropping NaNs? Missing values may be informative.

    data_df = df
    if stattypes is None:
        stattypes = {}

    def resolve_stattype(name, column):
        # Explicit overrides win, then the generator's opinion (if a
        # generator was named); without either, assume categorical.
        if name in stattypes:
            return stattypes[name]
        if generator_name:
            return get_bayesdb_col_type(name,
                                        column,
                                        bdb=bdb,
                                        generator_name=generator_name)
        return "categorical"

    colors = None
    if colorby is not None:
        n_colorby = 0
        for colname in data_df.columns:
            # XXX: This is not guaranteed to work on all Unicode characters.
            if colorby.lower() == colname.lower():
                n_colorby += 1
                colorby = colname
        if n_colorby == 0:
            raise BLE(
                ValueError('colorby column, {}, not found.'.format(colorby)))
        elif n_colorby > 1:
            raise BLE(ValueError(
                'Multiple columns named, {}.'.format(colorby)))

        dummy = data_df[colorby].dropna()
        dvals = np.sort(dummy.unique())
        ndvals = len(dvals)
        dval_type = resolve_stattype(colorby, dummy)
        if dval_type.lower() != 'categorical':
            raise BLE(ValueError('colorby columns must be categorical.'))
        # One palette color per distinct colorby value.
        cmap = sns.color_palette("Set1", ndvals)
        colors = dict(zip(dvals, cmap))

    all_varnames = [c for c in data_df.columns if c != colorby]
    n_vars = len(all_varnames)
    plt_grid = gridspec.GridSpec(n_vars, n_vars)
    figure = plt.figure()

    # if there is only one variable, just do a hist
    if n_vars == 1:
        ax = plt.gca()
        # Take the lone plotted variable, not simply the first column of the
        # frame: the first column may be the colorby column.
        varname = all_varnames[0]
        vartype = resolve_stattype(varname, data_df[varname])
        hist_columns = [varname]
        if colorby is not None:
            hist_columns.append(colorby)
        do_hist(data_df[hist_columns],
                dtype=vartype,
                ax=ax,
                bdb=bdb,
                colors=colors)
        rotate_tick_labels(ax)
        return figure

    # Per-cell axis limits; diagonal cells keep the +/-Inf sentinels so they
    # never influence the shared limits computed below.
    xmins = np.ones((n_vars, n_vars)) * float('Inf')
    xmaxs = np.ones((n_vars, n_vars)) * float('-Inf')
    ymins = np.ones((n_vars, n_vars)) * float('Inf')
    ymaxs = np.ones((n_vars, n_vars)) * float('-Inf')

    vartypes = [resolve_stattype(varname, data_df[varname])
                for varname in all_varnames]

    # store each axes; reaccessing ax with plt.subplot(plt_grid[a,b]) may
    # overwrite the ax
    axes = [[] for _ in all_varnames]
    for x_pos, var_name_x in enumerate(all_varnames):
        var_x_type = vartypes[x_pos]
        for y_pos, var_name_y in enumerate(all_varnames):
            var_y_type = vartypes[y_pos]

            ax = figure.add_subplot(plt_grid[y_pos, x_pos])
            axes[y_pos].append(ax)

            if x_pos == y_pos:
                varnames = [var_name_x]
                if colorby is not None:
                    varnames.append(colorby)
                ax = do_hist(data_df[varnames],
                             dtype=var_x_type,
                             ax=ax,
                             bdb=bdb,
                             colors=colors,
                             show_contour=show_contour)
            else:
                varnames = [var_name_x, var_name_y]
                vartypes_pair = (
                    var_x_type,
                    var_y_type,
                )
                if colorby is not None:
                    varnames.append(colorby)
                plot_df = prep_plot_df(data_df, varnames)
                ax = do_pair_plot(plot_df,
                                  vartypes_pair,
                                  ax=ax,
                                  bdb=bdb,
                                  show_contour=show_contour,
                                  show_missing=show_missing,
                                  colors=colors,
                                  **kwargs)

                ymins[y_pos, x_pos] = ax.get_ylim()[0]
                ymaxs[y_pos, x_pos] = ax.get_ylim()[1]
                xmins[y_pos, x_pos] = ax.get_xlim()[0]
                xmaxs[y_pos, x_pos] = ax.get_xlim()[1]

            ax.set_xlabel(var_name_x, fontweight='bold')
            ax.set_ylabel(var_name_y, fontweight='bold')

    for x_pos in range(n_vars):
        for y_pos in range(n_vars):
            ax = axes[y_pos][x_pos]

            # Self-histogram for x only, or comparative x against y:
            ax.set_xlim([np.min(xmins[:, x_pos]), np.max(xmaxs[:, x_pos])])
            if x_pos != y_pos:
                ax.set_ylim([np.min(ymins[y_pos, :]), np.max(ymaxs[y_pos, :])])

            # Y labels
            if x_pos > 0:
                if x_pos == n_vars - 1:  # All the way to the right:
                    ax.yaxis.tick_right()
                    ax.yaxis.set_label_position('right')
                else:  # No labels inside:
                    ax.set_ylabel('')
                    ax.set_yticklabels([])
            else:
                ax.yaxis.tick_left()
                ax.yaxis.set_label_position('left')

            # X labels:
            if y_pos < n_vars - 1:
                if y_pos == 0:  # At top, show x labels on top:
                    ax.xaxis.tick_top()
                    ax.xaxis.set_label_position('top')
                else:  # No labels inside:
                    ax.set_xlabel('')
                    ax.set_xticklabels([])
            else:
                ax.xaxis.tick_bottom()
                ax.xaxis.set_label_position('bottom')

            rotate_tick_labels(ax)

    def fake_axis_ticks(ax_tl, ax_tn):
        # Place ticks on `ax_tl` at the same relative heights as the y ticks
        # of `ax_tn`, and label them with `ax_tn`'s tick values.
        atl, btl = ax_tl.get_ylim()
        atn, btn = ax_tn.get_ylim()
        tnticks = ax_tn.get_yticks()
        yrange_tn = (btn - atn)
        yrange_tl = (btl - atl)
        tntick_ratios = [(t - atn) / yrange_tn for t in tnticks]
        ax_tl.set_yticks([r * yrange_tl + atl for r in tntick_ratios])
        ax_tl.set_yticklabels(tnticks)

    # Fix the corner histograms' y-axis ticks and labels.
    if show_full:
        fake_axis_ticks(axes[0][0], axes[0][1])
    fake_axis_ticks(axes[-1][-1], axes[-1][0])

    if colorby is not None:
        legend = gen_collapsed_legend_from_dict(colors, title=colorby)
        legend.draggable()

    # tril by default by deleting upper diagonal axes.
    if not show_full:
        for y_pos in range(n_vars):
            for x_pos in range(y_pos + 1, n_vars):
                figure.delaxes(axes[y_pos][x_pos])

    return figure
예제 #27
0
def draw_state(bdb, table_name, generator_name, modelno,
        ax=None, border_width=3, row_label_col=None, short_names=True,
        hilight_rows=[], hilight_rows_colors=None,
        hilight_cols=[], hilight_cols_colors=None,
        separator_color='black', separator_width=4,
        blank_state=False, nan_color=(1., 0., 0., 1.),
        view_labels=None, view_label_fontsize='large',
        legend=True, legend_fontsize='medium',
        row_legend_loc=1, row_legend_title='Row key',
        col_legend_loc=4, col_legend_title='Column key',
        descriptions_in_legend=True, legend_wrap_threshold=20,):
    """Creates a debugging (read: not pretty) rendering of a CrossCat state.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        The BayesDB object associated with the CrossCat state
    table_name : str
        The btable name containing the data associated with the state
    generator_name : str
        The CrossCat generator associated with the state
    modelno : int
        The index of the model/state to draw
    ax : matplotlib.axis
        The axis on which to draw. Defaults to the current axis.
    row_label_col : list, optional
        A list with one label per row. If None, rows are labeled by their
        index. Passing a column name (str) is not implemented yet.
    short_names : bool
        Use shortnames as column labels
    hilight_rows : list<str>
        A list of rows to hilight with colored rectangles.
    hilight_rows_colors : list
        Contains a color (str or tuple) for each entry in `hilight_rows`. If
        not specified, unique colors for each entry are generated.
    hilight_cols : list
        A list of columns to hilight with colored rectangles.
    hilight_cols_colors : list
        Contains a color (str or tuple) for each entry in `hilight_cols`. If
        not specified, unique colors for each entry are generated.
    blank_state : bool
        If True, draws an unsorted, unpartitioned state
    view_labels : list<str>
        Labels placed above each view. If `len(view_labels) < num_views` then
        only the views for which there are entries are labeled. The list
        passed in is not modified.
    view_label_fontsize : valid matplotlib `fontsize`
        Font size used for view labels
    legend : bool
        If True (default) displays legend
    legend_fontsize : valid matplotlib `fontsize`
        Font size used for legend entries and titles
    row_legend_loc : matplotlib.legend location
        location of the row legend. For use with row hilighting
    col_legend_loc : matplotlib.legend location
        location of the column legend. For use with column hilighting

    Returns
    -------
    ax : matplotlib.axis
        The state rendering

    Raises
    ------
    BLE(TypeError)
        If `view_labels` is not a list, or `row_label_col` is a list of the
        wrong length or of an unhandled type.
    NotImplementedError
        If `row_label_col` is a str.

    Other Parameters
    ----------------
    border_width : int
        The number of cells between views. Use larger values for longer row
        names.
    separator_color : str or (r, g, b) or (r, g, b, alpha) tuple
        The color of the cluster separator. Default is black.
    separator_width : int
        linewidth of the cluster separator
    nan_color : str or (r, g, b) or (r, g, b, alpha) tuple
        The color for missing/NaN values. Default is red.
    row_legend_title : str
        title of the row legend
    col_legend_title : str
        title of the column legend
    legend_wrap_threshold : int
        Max number of characters until wordwrap for collapsed legends. For use
        when multiple entries in `hilight_rows_colors` or `hilight_cols_colors`
        contain the same color.
    descriptions_in_legend : bool
        If True (default), the column descriptions (requires codebook) are
        added to the legend
    """
    theta = get_metadata(bdb, generator_name, modelno)
    M_c = get_M_c(bdb, generator_name)
    # idx_to_name doesn't use an int idx, but a string idx because
    # crosscat.  Yep.
    ordered_columns = [M_c['idx_to_name'][str(idx)] for
                       idx in sorted(M_c['name_to_idx'].values())]
    T = bu.get_data_as_list(bdb, table_name, column_list=ordered_columns)
    X_L = theta['X_L']
    X_D = theta['X_D']

    num_rows = len(T)
    num_cols = len(T[0])

    if not blank_state:
        sortedstate = DrawStateUtils.sort_state(X_L, X_D, M_c, T)
        sorted_views, sorted_clusters, sorted_cols, sorted_rows = sortedstate
        column_partition = X_L['column_partition']['assignments']
    else:
        blankstate = DrawStateUtils.gen_blank_sort(num_rows, num_cols)
        sorted_views, sorted_clusters, sorted_cols, sorted_rows = blankstate
        column_partition = [0]*num_cols

    # Labels are looked up by position in `sorted_views`, so exactly one
    # label per view is needed (the row count is irrelevant here).
    n_view_labels = len(sorted_views)
    if view_labels is not None:
        if not isinstance(view_labels, list):
            raise BLE(TypeError("view_labels must be a list"))
        if len(view_labels) != n_view_labels:
            # Pad a copy with blanks so the caller's list is left untouched.
            view_labels = view_labels + \
                ['']*(n_view_labels-len(view_labels))
    else:
        view_labels = ['V ' + str(i) for i in range(n_view_labels)]

    if hilight_cols_colors is None:
        hilight_cols_colors = []

    if hilight_rows_colors is None:
        hilight_rows_colors = []

    # Both colormap entries are opaque white (should probably give the user
    # control over this)
    cmap = matplotlib.colors.ListedColormap([(1, 1, 1, 1), (1, 1, 1, 1)])
    T = DrawStateUtils.convert_t_do_numerical(T, M_c)

    num_views = len(sorted_cols)
    # X is only used for its shape (the drawing extent incl. view borders).
    X = np.zeros((num_rows, num_cols+num_views*border_width))

    # row hilighting
    row_hl_colors = DrawStateUtils.gen_hilight_colors(hilight_rows,
        hilight_rows_colors)

    hl_row_idx_label_zip = []
    if row_label_col is None:
        row_labels = [str(i) for i in range(num_rows)]
    elif isinstance(row_label_col, list):
        if len(row_label_col) != num_rows:
            raise BLE(TypeError("If row_label_col is a list, it must have an "
                                "entry for each row"))
        row_labels = [str(label) for label in row_label_col]
    elif isinstance(row_label_col, str):
        # FIXME: This is not going to work until BayesDB stops removing key and
        # ignore columns from the data. Once it does, the labels should be
        # read from column M_c['name_to_idx'][row_label_col] of T.
        raise NotImplementedError
    else:
        raise BLE(TypeError("Unhandled row_label_col type {}.".format(
            type(row_label_col))))

    row_idx_to_label = {}
    row_label_to_idx = {}
    for row, label in enumerate(row_labels):
        # XXX: Allows missing entries to be a column label
        row_idx_to_label[row] = label
        row_label_to_idx[label] = row

    for label in hilight_rows:
        hl_row_idx_label_zip.append((row_label_to_idx[label], label,))

    # column hilighting
    col_hl_colors = DrawStateUtils.gen_hilight_colors(hilight_cols,
        hilight_cols_colors)

    hl_col_idx_label_zip = []
    for label in hilight_cols:
        hl_col_idx_label_zip.append((M_c['name_to_idx'][label], label,))

    # generate a heatmap using the data (allows clusters to have different
    # base colors)
    cell_colors = DrawStateUtils.gen_cell_colors(T, sorted_views, sorted_cols,
        sorted_clusters, sorted_rows, column_partition, cmap, border_width,
        nan_color=nan_color)

    if ax is None:
        ax = plt.gca()

    ax.imshow(cell_colors, cmap=cmap, interpolation='nearest', origin='upper',
              aspect='auto')
    col_count = 0
    for v, view in enumerate(sorted_views):
        view_x_labels = [M_c['idx_to_name'][str(col)]
                         for col in sorted_cols[view]]
        if short_names:
            view_x_tick_labels = bu.get_shortnames(bdb, table_name,
                view_x_labels)
        else:
            view_x_tick_labels = view_x_labels

        y_tick_labels = []

        # Horizontal extent of this view: columns drawn so far plus one
        # border gap per preceding view.
        num_cols_view = len(sorted_cols[view])
        sbplt_start = col_count+v*border_width
        sbplt_end = col_count+num_cols_view+v*border_width

        for i, vxtl in enumerate(view_x_labels):
            if vxtl in hilight_cols:
                edgecolor = col_hl_colors[vxtl]
                x_a = sbplt_start+i-.5
                ax.add_patch(Rectangle((x_a, -.5), 1, num_rows,
                                       facecolor="none", edgecolor=edgecolor,
                                       lw=2, zorder=10))
                fontcolor = edgecolor
                fontsize = 'x-small'
            else:
                fontcolor = '#333333'
                fontsize = 'x-small'
            font_kws = dict(color=fontcolor, fontsize=fontsize, rotation=90,
                            va='top', ha='center')
            ax.text(sbplt_start+i+.5, num_rows+.5, view_x_tick_labels[i],
                    font_kws)

        view_label_x = (sbplt_start+sbplt_end)/2. - .5
        view_label_y = -2.5
        font_kws = dict(ha='center',
                        fontsize=view_label_fontsize,
                        weight='bold')
        ax.text(view_label_x, view_label_y, view_labels[v], font_kws)

        y = 0
        for cluster in sorted_clusters[view]:
            y_tick_labels += [row_idx_to_label[row]
                              for row in sorted_rows[view][cluster]]
            # Separator line at the top edge of the cluster.
            ax.plot([sbplt_start-.5, sbplt_end-.5], [y-.5, y-.5],
                    color=separator_color, lw=separator_width)
            for row, label in hl_row_idx_label_zip:
                try:
                    pos = sorted_rows[view][cluster].index(row)
                except ValueError:
                    # The hilighted row is not in this cluster.
                    pos = None

                if pos is not None:
                    edgecolor = row_hl_colors[label]
                    ax.add_patch(Rectangle((sbplt_start - .5, y + pos - .5),
                                           num_cols_view, 1, facecolor="none",
                                           edgecolor=edgecolor, lw=2,
                                           zorder=10))

            y += len(sorted_rows[view][cluster])

        # Row labels are drawn as text to the left of each view.
        for i in range(num_rows):
            if y_tick_labels[i] in hilight_rows:
                fontcolor = row_hl_colors[y_tick_labels[i]]
                fontsize = 'x-small'
                fontweight = 'bold'
                zorder = 10
            else:
                fontsize = 'x-small'
                fontcolor = '#333333'
                fontweight = 'light'
                zorder = 5

            ax.text(sbplt_start-1, i+.5, str(y_tick_labels[i]), ha='right',
                    fontsize=fontsize, color=fontcolor, weight=fontweight,
                    zorder=zorder)
        col_count += num_cols_view

    # generate row legend
    # Use matplotlib artists to generate a list of colored lines
    # TODO: Refactor legend generator into its own function
    if legend:
        if len(hilight_rows) > 0:
            row_legend = pu.gen_collapsed_legend_from_dict(
                row_hl_colors, loc=row_legend_loc, title=row_legend_title,
                fontsize=legend_fontsize, wrap_threshold=legend_wrap_threshold)
            ax.add_artist(row_legend)

        if len(hilight_cols) > 0:
            col_legend_labels = bu.get_shortnames(bdb, table_name,
                hilight_cols)
            if descriptions_in_legend:
                for i, col_id in enumerate(hilight_cols):
                    col_legend_labels[i] += ': ' + bu.get_descriptions(
                        bdb, table_name, [col_id])[0]

            # NOTE(review): the legend pairs labels with the caller-supplied
            # `hilight_cols_colors`, not `col_hl_colors`; if no colors were
            # supplied this dict is empty -- confirm that is intended.
            col_legend = pu.gen_collapsed_legend_from_dict(
                dict(zip(col_legend_labels, hilight_cols_colors)),
                loc=col_legend_loc, title=col_legend_title,
                fontsize=legend_fontsize, wrap_threshold=legend_wrap_threshold)

            ax.add_artist(col_legend)

    ax.tick_params(**{
        'axis': 'both',
        'length': 0
    })
    ax.set_xlim([-.5, X.shape[1]])
    ax.set_ylim([X.shape[0], -.5])
    ax.spines['bottom'].set_color('white')
    ax.spines['top'].set_color('white')
    ax.spines['right'].set_color('white')
    ax.spines['left'].set_color('white')
    ax.set_yticks(range(num_rows))
    ax.set_xticks(range(num_cols+num_views*border_width))
    ax.tick_params(axis='x', colors='white')
    # ax.set_xticklabels(x_tick_labels, rotation=90, color='black', fontsize=9)
    ax.set_yticklabels(['']*num_rows)
    ax.tick_params(axis='y', colors='white')
    ax.grid(b=False)
    ax.set_axis_bgcolor('white')
    return ax
예제 #28
0
def comparative_hist(df, bdb=None, nbins=15, normed=False):
    """Plot a histogram.

    Given a one-column pandas.DataFrame, df, plots a simple histogram. Given a
    two-column df plots the data in column one colored by an optional column 2.
    If given, column 2 must be categorical. Rows with missing values are
    dropped before plotting.

    Parameters
    ----------
    df : pandas.DataFrame
        One or two columns: the data, and optionally the column to color by.
    bdb : optional
        Passed through to `get_bayesdb_col_type`.
    nbins : int
        Number of bins (bars)
    normed : bool
        If True, normalizes the area of the histogram (or each
        sub-histogram if df has two columns) to 1.

    Returns
    -------
    figure: matplotlib.figure.Figure

    Raises
    ------
    BLE(ValueError)
        If `df` has no columns.
    BLE(NotImplementedError)
        If `df` has more than two columns.
    """
    df = df.dropna()
    if len(df.columns) == 0:
        raise BLE(ValueError('Tried to plot a histogram of an empty result.'))
    value_col = df.columns[0]

    vartype = get_bayesdb_col_type(value_col, df[value_col], bdb=bdb)
    ticklabels = None
    if vartype == 'categorical':
        # Replace the category values with numeric codes; `lookup` maps each
        # original value to its code, so it is inverted here for tick labels.
        values, labels, lookup = conv_categorical_vals_to_numeric(
            df[value_col])
        df[value_col] = values
        bins = len(labels)
        ticklabels = [0] * len(labels)
        for key, val in lookup.items():
            ticklabels[val] = key
    else:
        a = min(df.iloc[:, 0].values)
        b = max(df.iloc[:, 0].values)
        # float() avoids integer floor division on integer data (Python 2).
        interval = float(b - a) / nbins
        bins = np.linspace(a, b + interval, nbins)

    colorby = None
    if len(df.columns) > 1:
        if len(df.columns) > 2:
            raise BLE(
                NotImplementedError(
                    'comparative_hist not defined on more than two variables.')
            )
        colorby = df.columns[1]
        colorby_vals = df[colorby].unique()

    figure, ax = plt.subplots(tight_layout=False, facecolor='white')
    if colorby is None:
        ax.hist(df.iloc[:, 0].values,
                bins=bins,
                color='#383838',
                edgecolor='none',
                normed=normed)
        plot_title = value_col
    else:
        # One translucent sub-histogram per colorby value, sharing `bins`.
        colors = sns.color_palette('deep', len(colorby_vals))
        for color, cbv in zip(colors, colorby_vals):
            subdf = df[df[colorby] == cbv]
            ax.hist(subdf.iloc[:, 0].values,
                    bins=bins,
                    color=color,
                    alpha=.5,
                    edgecolor='none',
                    normed=normed,
                    label=("%s (n=%d)" % (str(cbv), len(subdf))))
        ax.legend(loc=0, title=colorby)
        plot_title = value_col + " by " + colorby

    if ticklabels is not None:
        # Show the original category values instead of their numeric codes.
        ax.set_xticks(range(len(ticklabels)))
        ax.set_xticklabels(ticklabels)

    if normed:
        plot_title += " (normalized)"

    ax.set_title(plot_title)
    ax.set_xlabel(value_col)
    return figure
예제 #29
0
def estimate_kl_divergence(bdb,
                           generatorA,
                           generatorB,
                           targets=None,
                           givens=None,
                           n_samples=None):
    """Estimate the KL divergence.

    The KL divergence is a measure of the "information lost" when generatorB
    (the approximating generator) is used to approximate generatorA (the base
    generator). KL divergence is not symmetric, and KL(genA||genB) is not
    necessarily equal to KL(genB||genA).

    TODO: Monte Carlo estimation is a terrible way to compute the KL divergence.
    (Not to say there are better methods in general). One illustration of this
    is that the estimated KL divergence has empirically been shown to obtain
    negative realizations for high-dimensional data.

    Computing the KL divergence in general (of high dimensional distributions)
    is a very hard problem; most research uses the structure of the
    distributions to find good estimators. Adaptive quadrature or exact methods
    for numerical integration could outperform Monte Carlo?

    TODO: More sophisticated algorithm for detecting cases where absolute
    continuity could be a problem (currently have a heuristic).
    As it stands, Monte Carlo estimates may have infinite variance depending
    on simulated values from generatorA.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        Active BayesDB instance.
    generatorA : str
        Name of base generator.
    generatorB : str
        Name of approximating generator.
    targets : list<str>, optional
        List of columns in the table for which to compute the log-likelihood.
        Defaults to all the columns.
    givens : list<tuple>, optional
        A list of [(column, value)] pairs on which to condition on. Defaults to
        no conditionals. See example for more details.
    n_samples: int, optional
        Number of simulated samples to use in the Monte Carlo estimate.
        Defaults to 10,000.

    Returns
    -------
    kl : float
        The KL divergence. May be infinity.

    Raises
    ------
    BLE(ValueError)
        If a simulated value has zero probability under generatorA, or if the
        accumulated estimate is negative.

    Example:
    estimate_kl_divergence(bdb, 'crosscat_gen', 'baxcat_gen',
        targets=['weight', 'height'],
        givens=[('nationality', 'USA'), ('age', 17)])
    """
    # XXX Default to 10,000 samples
    if n_samples is None:
        n_samples = 10000

    # Defaults to all columns if targets is None.
    targets = extract_target_cols(bdb, generatorA, targets=targets)

    # Defaults to no givens if givens is None
    givens = extract_given_cols_vals(givens=givens)
    givens = ','.join(['{}={}'.format(c, v) for (c, v) in givens])

    # Obtain samples from the base distribution.
    if givens:
        # XXX TODO write GIVEN in this query using bindings.
        bql = '''
            SIMULATE {} FROM {} GIVEN {} LIMIT {}
        '''.format(','.join(targets), bql_quote_name(generatorA), givens,
                   n_samples)
    else:
        bql = '''
            SIMULATE {} FROM {} LIMIT {}
        '''.format(','.join(targets), bql_quote_name(generatorA), n_samples)
    samples = bdb.execute(bql)

    # The per-column probability queries do not depend on the sample, so
    # build them once instead of re-formatting them for every simulated row.
    prob_bql = '''
                ESTIMATE PROBABILITY OF {}=? FROM {} LIMIT 1
            '''
    quoted_a = bql_quote_name(generatorA)
    quoted_b = bql_quote_name(generatorB)
    column_queries = [
        (col, prob_bql.format(col, quoted_a), prob_bql.format(col, quoted_b))
        for col in targets
    ]

    kl = 0
    for s in samples:
        logp_a, logp_b = 0, 0
        # XXX Assume joint probability factors by summing univariate
        # (conditional) probability of each cell value. This is clearly wrong,
        # until we can evaluate joint densities in BQL.
        for (col, bql_a, bql_b), val in zip(column_queries, s):
            p_a = bdb.execute(bql_a, (val, )).fetchvalue()
            p_b = bdb.execute(bql_b, (val, )).fetchvalue()

            # XXX Heuristic to detect when genA is not absolutely
            # continuous wrt genB
            if p_a == 0:
                # How on earth did we simulate a value from genA with zero
                # density/prob under genA?
                raise BLE(
                    ValueError(
                        'Fatal error: simulated a (col,val)=({},{}) '
                        'from base generatorA ({}) with zero density. Check '
                        'implementation of simluate and/or logpdf of '
                        'generator.'.format(col, val, generatorA)))
            if p_b == 0:
                # Detected failure of absolute continuity
                # (under assumption that joint factors into marginals)
                return float('inf')

            logp_a += math.log(p_a)
            logp_b += math.log(p_b)

        kl += (logp_a - logp_b)

    # XXX Assertion may fail, see TODO in docstring.
    # assert kl > 0
    if kl < 0:
        raise BLE(
            ValueError(
                'Cannot compute reasonable value for KL divergence. '
                'Try increasing the number of samples (currently using {} '
                'samples).'.format(n_samples)))

    return kl / n_samples
예제 #30
0
def histogram(bdb, df, nbins=15, bins=None, normed=None):
    """Plot histogram of one- or two-column table.

    If two-column, subdivide the first column according to labels in
    the second column.

    Parameters
    ----------
    bdb : __population_to_bdb__
    df : __specifier_to_df__
    nbins : int, optional
        Number of evenly spaced bin edges to generate when `bins` is not
        given (this yields ``nbins - 1`` bins). Ignored when `bins` is
        given. If None, defaults to ``len(bins)`` when `bins` is given,
        else 15.
    bins : sequence, optional
        Explicit bin specification passed straight to ``Axes.hist``. When
        None, edges are generated from the range of the first column.
    normed : bool, optional
        If True, normalizes the area of the histogram (or each
        sub-histogram if df has two columns) to 1.

    Returns
    -------
    figure : matplotlib.figure.Figure

    Raises
    ------
    BLE
        Wrapping ``ValueError`` if `df` has no columns, has more than two
        columns, or has no rows left after dropping missing values while
        bins must be derived from the data. Wrapping ``TypeError`` if the
        first column is categorical or the second (color-by) column is
        not categorical.
    """
    df = df.dropna()
    if len(df.columns) == 0:
        raise BLE(ValueError('Tried to plot a histogram of an empty result.'))

    vartype = get_bayesdb_col_type(df.columns[0], df[df.columns[0]], bdb=bdb)
    if vartype == 'categorical':
        raise BLE(
            TypeError(
                "Cannot histogram categorical varible %s. Barplot? Colorby?" %
                (df.columns[0], )))
    if nbins is None:
        nbins = len(bins) if bins is not None else 15

    # Positional access: the data column is always the first one, whatever
    # its label is (``.ix`` would do a label lookup for integer labels).
    values = df.iloc[:, 0].values

    if bins is None:
        if df.empty:
            # No rows survived dropna(), so there is no range to derive
            # bin edges from; report it in the same way as an empty result.
            raise BLE(
                ValueError('Tried to plot a histogram of an empty result.'))
        lo = values.min()
        hi = values.max()
        # Force true division so integer-valued data does not floor the
        # interval under Python 2.
        interval = float(hi - lo) / nbins
        bins = np.linspace(lo, hi + interval, nbins)

    colorby = None
    if len(df.columns) > 1:
        if len(df.columns) > 2:
            raise BLE(ValueError('Got more columns than data and colorby.'))
        colorby = df.columns[1]
        colorby_stattype = get_bayesdb_col_type(df.columns[1],
                                                df[df.columns[1]],
                                                bdb=bdb)
        if colorby_stattype != 'categorical':
            raise BLE(
                TypeError("Cannot color by non-categorical variable " +
                          colorby))
        colorby_vals = df[colorby].unique()

    # NOTE(review): ``normed`` is the legacy matplotlib keyword (newer
    # releases call it ``density``); kept as-is to match the matplotlib
    # version this file was written against -- confirm before upgrading.
    figure, ax = plt.subplots(tight_layout=False, facecolor='white')
    if colorby is None:
        ax.hist(values,
                bins=bins,
                color='#383838',
                edgecolor='none',
                normed=normed)
        plot_title = df.columns[0]
    else:
        # One translucent sub-histogram per distinct color-by label, all
        # sharing the same bin edges so they are directly comparable.
        colors = sns.color_palette('deep', len(colorby_vals))
        for color, cbv in zip(colors, colorby_vals):
            subdf = df[df[colorby] == cbv]
            ax.hist(subdf.iloc[:, 0].values,
                    bins=bins,
                    color=color,
                    alpha=.5,
                    edgecolor='none',
                    normed=normed,
                    label=str(cbv))
        ax.legend(loc=0, title=colorby)
        plot_title = df.columns[0] + " by " + colorby

    if normed:
        plot_title += " (normalized)"

    ax.set_title(plot_title)
    ax.set_xlabel(df.columns[0])
    return figure