Example #1
def evaluate_where_on_row(row_idx, row, where_conditions, M_c, M_c_full, X_L_list, X_D_list, T, T_full, engine, tablename, numsamples, impute_confidence):
  """
  Helper function that applies the WHERE conditions to a row. Returns False if the row does not
  satisfy the WHERE clause; otherwise returns the list of function results for the row.
  """
  function_values = []
  for (func, f_args, op, val) in where_conditions:
    if func == functions._column and f_args[1] is not None and numpy.isnan(T[row_idx][f_args[0]]):
      col_idx = f_args[0]
      confidence = f_args[1]
      ## Need to do predictive sampling to evaluate the WHERE condition with confidence.
      ## TODO: an easier way would be to call impute on the backend, but that would require changing
      ## crosscat so that impute_and_confidence could also return the original samples, or could
      ## evaluate a WHERE clause.
      Y = [(row_idx, cidx, row[cidx]) for cidx in M_c['name_to_idx'].values()
           if not numpy.isnan(T[row_idx][cidx])]
      samples = engine.call_backend('simple_predictive_sample',
                   dict(M_c=M_c, X_L=X_L_list, X_D=X_D_list, Y=Y, Q=[[row_idx,col_idx]], n=numsamples))
      samples_satisfying_where = 0
      for sample in samples:
        value = du.convert_code_to_value(M_c, col_idx, sample[0])
        if op(value, val):
          samples_satisfying_where += 1
      if float(samples_satisfying_where)/len(samples) >= confidence:
        # Where clause is satisfied! Now, generate impute summary.
        imputed_code, imputation_confidence = utils.get_imputation_and_confidence_from_samples(
          M_c, X_L_list[0], col_idx, samples)
        if imputed_code is not None:
          imputed_value = du.convert_code_to_value(M_c, col_idx, imputed_code)
        else:
          imputed_value = T[row_idx][col_idx]
        function_values.append(imputed_value)
      else:
        return False
    else:
      if func != functions._column_ignore:
        where_value = func(f_args, row_idx, row, M_c, X_L_list, X_D_list, T, engine, numsamples)
      else:
        where_value = func(f_args, row_idx, row, M_c_full, T_full, engine)
      if func == functions._row_id:
        # val should be a row list name in this case: look up the row list and set val to the
        # list of row indices it contains. Raises BayesDBRowListDoesNotExistError if the row
        # list does not exist.
        val = engine.persistence_layer.get_row_list(tablename, val)
        if op(val, where_value): # for operator.contains, op(a,b) means 'b in a': so need to switch args.
          function_values.append(where_value)
        else:
          return False
      else:
        # Normal, most common condition.
        if op(where_value, val):
          function_values.append(where_value)
        else:
          return False
  return function_values
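
The confidence branch above reduces to a simple rule: draw predictive samples for the missing cell, count the fraction that satisfies op(value, val), and keep the row only if that fraction meets the confidence threshold. A minimal standalone sketch of the rule, with toy Gaussian draws standing in for simple_predictive_sample output (the helper name and data are hypothetical):

import operator
import random

def where_satisfied_with_confidence(samples, op, val, confidence):
    # Fraction of predictive samples for which the WHERE condition holds.
    satisfied = sum(1 for s in samples if op(s, val))
    return float(satisfied) / len(samples) >= confidence

random.seed(0)
samples = [random.gauss(5.0, 1.0) for _ in range(100)]  # toy stand-in for backend samples
print(where_satisfied_with_confidence(samples, operator.gt, 4.0, 0.8))  # True: most mass lies above 4
print(where_satisfied_with_confidence(samples, operator.gt, 6.0, 0.8))  # False: little mass lies above 6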
Example #2
def _create_histogram(M_c, data, columns, mc_col_indices, filename):
    data_dir = S.path.web_resources_data_dir  # renamed from 'dir' to avoid shadowing the builtin
    full_filename = os.path.join(data_dir, filename)
    num_rows = data.shape[0]
    num_cols = data.shape[1]

    p.figure()
    # col_i goes from 0 to number of predicted columns
    # mc_col_idx is the original column's index in M_c
    for col_i in range(num_cols):
        mc_col_idx = mc_col_indices[col_i]
        data_i = data[:, col_i]
        ax = p.subplot(1, num_cols, col_i + 1, title=columns[col_i])  # subplot indices are 1-based
        if M_c['column_metadata'][mc_col_idx]['modeltype'] == 'normal_inverse_gamma':
            p.hist(data_i, orientation='horizontal')
        else:
            str_data = [du.convert_code_to_value(M_c, mc_col_idx, code) for code in data_i]
            unique_labels = list(set(str_data))
            np_str_data = np.array(str_data)
            counts = []
            for label in unique_labels:
                counts.append(sum(np_str_data == label))
            # Size the bars by the observed labels: the data may not contain every
            # category in code_to_value, and barh requires len(counts) positions.
            num_vals = len(unique_labels)
            rects = p.barh(range(num_vals), counts)
            heights = np.array([rect.get_height() for rect in rects])
            ax.set_yticks(np.arange(num_vals) + heights/2)
            ax.set_yticklabels(unique_labels)

    p.tight_layout()
    p.savefig(full_filename)
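
The categorical branch's tick arithmetic assumes edge-aligned bars: ticks are shifted by half a bar height to land on bar centers. A standalone sketch of that barh pattern on toy labels and counts (align='edge' is stated explicitly because newer matplotlib centers bars by default; the output path is hypothetical):

import numpy as np
import matplotlib.pyplot as plt

def categorical_barh(ax, labels, counts):
    # One horizontal bar per observed category; edge-align the bars so that
    # shifting the ticks by half a bar height centers each label on its bar.
    rects = ax.barh(range(len(labels)), counts, align='edge')
    heights = np.array([rect.get_height() for rect in rects])
    ax.set_yticks(np.arange(len(labels)) + heights / 2)
    ax.set_yticklabels(labels)

fig, ax = plt.subplots()
categorical_barh(ax, ['yes', 'no', 'maybe'], [12, 7, 3])
fig.tight_layout()
fig.savefig('toy_histogram.png')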
Example #3
def _column_ignore(col_idx, row_id, data_values, M_c_full, T_full, engine):
    """
    This function handles selecting data from ignore columns. It's split into a different
    function because it needs to be passed M_c_full and T_full instead of M_c and T, as in _column.
    Since selecting ignore columns is probably a rare event, we can avoid passing M_c_full and T_full
    to _column as "just in case" arguments.
    """
    return du.convert_code_to_value(M_c_full, col_idx, T_full[row_id][col_idx])    
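
A sketch of the split this function implies: modeled columns are read from T via M_c, while ignored columns are read from the full table T_full via M_c_full. The convert helper and the 'code_to_value_map' key below are hypothetical stand-ins for du.convert_code_to_value and the real M_c layout:

def convert_code_to_value(metadata, col_idx, code):
    # Hypothetical stand-in for du.convert_code_to_value: map categorical
    # codes back through a lookup table; pass everything else through.
    lookup = metadata['column_metadata'][col_idx].get('code_to_value_map')
    return lookup[code] if lookup else code

M_c_full = {'column_metadata': [
    {'modeltype': 'symmetric_dirichlet_discrete', 'code_to_value_map': {0: 'red', 1: 'blue'}},
    {'modeltype': 'ignore', 'code_to_value_map': None},
]}
T_full = [[1, 'free-text note'], [0, 'another note']]

print(convert_code_to_value(M_c_full, 1, T_full[0][1]))  # 'free-text note' (ignored column, passthrough)
print(convert_code_to_value(M_c_full, 0, T_full[0][0]))  # 'blue' (modeled categorical column)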
Example #4
def _column(column_args, row_id, data_values, M_c, X_L_list, X_D_list, T, engine, numsamples):
    col_idx = column_args[0]
    confidence = column_args[1]
    if confidence is None or not numpy.isnan(T[row_id][col_idx]):
        return du.convert_code_to_value(M_c, col_idx, T[row_id][col_idx])
    else:
        ## Value is missing: impute it from predictive samples.
        Y = [(row_id, cidx, T[row_id][cidx]) for cidx in M_c['name_to_idx'].values()
             if not numpy.isnan(T[row_id][cidx])]
        code = utils.infer(M_c, X_L_list, X_D_list, Y, row_id, col_idx, numsamples,
                           confidence, engine)
        if code is not None:
            # Inferred successfully! Fill in the new value.
            value = du.convert_code_to_value(M_c, col_idx, code)
            return value
        else:
            return du.convert_code_to_value(M_c, col_idx, T[row_id][col_idx])
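
_column's control flow is: return observed values as-is, and for a missing cell impute only when the sample-based confidence clears the threshold, leaving it missing otherwise. A toy sketch of that flow, where agreement with the sample mode is a hypothetical simplification of what utils.infer computes:

import numpy

def impute_if_confident(observed, samples, confidence):
    # Keep observed values; impute missing ones only when confident enough.
    if not numpy.isnan(observed):
        return observed
    mode = max(set(samples), key=samples.count)
    if samples.count(mode) / float(len(samples)) >= confidence:
        return mode
    return observed  # stays NaN when the imputation is not confident enough

print(impute_if_confident(3.0, [1, 1, 2], 0.5))           # 3.0 (already observed)
print(impute_if_confident(float('nan'), [1, 1, 2], 0.5))  # 1 (confident impute)
print(impute_if_confident(float('nan'), [1, 2, 3], 0.9))  # nan (not confident)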
Example #5
def convert_row_from_codes_to_values(row, M_c):
  """
  Helper function to convert a row from its 'code' representation (as stored in T) to its
  'value' representation (the human-readable value).
  """
  ret = []
  for cidx, code in enumerate(row):
    if not du.flexible_isnan(code):
      ret.append(du.convert_code_to_value(M_c, cidx, code))
    else:
      ret.append(code)
  return tuple(ret)
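
A self-contained run of the same loop shape on toy data, with math.isnan standing in for du.flexible_isnan and a plain dict lookup standing in for du.convert_code_to_value (both stand-ins are assumptions, not the library's actual behavior):

import math

def flexible_isnan(x):
    # Stand-in for du.flexible_isnan: a NaN check that tolerates non-floats.
    try:
        return math.isnan(x)
    except TypeError:
        return False

codes = (0.0, float('nan'), 1.0)
code_to_value = [{0.0: 'no', 1.0: 'yes'}, None, {0.0: 'low', 1.0: 'high'}]

values = tuple(code if flexible_isnan(code) else code_to_value[cidx][code]
               for cidx, code in enumerate(codes))
print(values)  # ('no', nan, 'high')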