Example #1
0
 def _create_histogram(self, M_c, data, columns, mc_col_indices, filename):
     """Render one horizontal histogram subplot per predicted column and save
     the figure under the web resources data directory.

     M_c: metadata dict with 'column_metadata' (per-column 'modeltype' and
         'code_to_value').
     data: 2-D array of codes, one column per predicted column.
     columns: subplot titles, parallel to data's columns.
     mc_col_indices: for each data column, its original index in M_c.
     filename: output file name (joined onto the web resources data dir).
     """
     data_dir = S.path.web_resources_data_dir  # avoid shadowing builtin `dir`
     full_filename = os.path.join(data_dir, filename)
     num_cols = data.shape[1]
     #
     pylab.figure()
     # col_i goes from 0 to number of predicted columns
     # mc_col_idx is the original column's index in M_c
     for col_i in range(num_cols):
         mc_col_idx = mc_col_indices[col_i]
         data_i = data[:, col_i]
         # subplot indices are 1-based in matplotlib; 0 is invalid
         ax = pylab.subplot(1, num_cols, col_i + 1, title=columns[col_i])
         if M_c['column_metadata'][mc_col_idx][
                 'modeltype'] == 'normal_inverse_gamma':
             # Continuous column: plain histogram of the raw codes.
             pylab.hist(data_i, orientation='horizontal')
         else:
             # Categorical column: one bar per observed label.
             str_data = [
                 du.convert_code_to_value(M_c, mc_col_idx, code)
                 for code in data_i
             ]
             unique_labels = list(set(str_data))
             np_str_data = numpy.array(str_data)
             counts = []
             for label in unique_labels:
                 counts.append(sum(np_str_data == label))
             num_vals = len(
                 M_c['column_metadata'][mc_col_idx]['code_to_value'])
             # NOTE(review): counts has one entry per *observed* label while
             # range(num_vals) covers all codes — lengths can disagree if a
             # code never appears in data; confirm upstream guarantees this.
             rects = pylab.barh(range(num_vals), counts)
             heights = numpy.array([rect.get_height() for rect in rects])
             # Center the tick labels vertically on the bars.
             ax.set_yticks(numpy.arange(num_vals) + heights / 2)
             ax.set_yticklabels(unique_labels)
     pylab.tight_layout()
     pylab.savefig(full_filename)
Example #2
0
def convert_row(row, M_c):
  """
  Helper function to convert a row from its 'code' (as it's stored in T) to its 'value'
  (the human-understandable value).

  Missing entries (float NaN or the string 'nan') are passed through unchanged;
  everything else goes through du.convert_code_to_value. Returns a tuple.
  """
  ret = []
  for cidx, code in enumerate(row):
    # Check the string sentinel first: numpy.isnan raises TypeError on
    # strings, which made the original `code == 'nan'` branch unreachable.
    if code == 'nan' or (isinstance(code, float) and numpy.isnan(code)):
      ret.append(code)
    else:
      ret.append(du.convert_code_to_value(M_c, cidx, code))
  return tuple(ret)
Example #3
0
def convert_row(row, M_c):
    """
  Helper function to convert a row from its 'code' (as it's stored in T) to its 'value'
  (the human-understandable value).

  Missing entries (float NaN or the string 'nan') are passed through unchanged;
  everything else goes through du.convert_code_to_value. Returns a tuple.
  """
    ret = []
    for cidx, code in enumerate(row):
        # Check the string sentinel first: numpy.isnan raises TypeError on
        # strings, which made the original `code == 'nan'` branch unreachable.
        if code == 'nan' or (isinstance(code, float) and numpy.isnan(code)):
            ret.append(code)
        else:
            ret.append(du.convert_code_to_value(M_c, cidx, code))
    return tuple(ret)
Example #4
0
    def simulate(self, tablename, columnstring, newtablename, whereclause,
                 numpredictions, order_by):
        """Simple predictive samples. Returns one row per prediction, with all the given and predicted variables."""
        X_L_list, X_D_list, M_c = self.persistence_layer.get_latent_states(
            tablename)
        M_c, M_r, T = self.persistence_layer.get_metadata_and_table(tablename)

        numrows = len(M_r['idx_to_name'])
        name_to_idx = M_c['name_to_idx']

        # parse whereclause
        where_col_idxs_to_vals = dict()
        if whereclause == "" or '=' not in whereclause:
            Y = None
        else:
            varlist = [[c.strip() for c in b.split('=')]
                       for b in whereclause.split('AND')]
            Y = []
            for colname, colval in varlist:
                if type(colval) == str or type(colval) == unicode:
                    colval = ast.literal_eval(colval)
                where_col_idxs_to_vals[name_to_idx[colname]] = colval
                Y.append((numrows + 1, name_to_idx[colname], colval))

            # map values to codes
            Y = [(r, c, du.convert_value_to_code(M_c, c, colval))
                 for r, c, colval in Y]

        ## Parse queried columns.
        colnames = [colname.strip() for colname in columnstring.split(',')]
        col_indices = [name_to_idx[colname] for colname in colnames]
        query_col_indices = [
            idx for idx in col_indices
            if idx not in where_col_idxs_to_vals.keys()
        ]
        Q = [(numrows + 1, col_idx) for col_idx in query_col_indices]

        args_dict = dict()
        args_dict['M_c'] = M_c
        args_dict['X_L'] = X_L_list
        args_dict['X_D'] = X_D_list
        args_dict['Y'] = Y
        args_dict['Q'] = Q
        args_dict['n'] = numpredictions
        out = self.backend.simple_predictive_sample(M_c, X_L_list, X_D_list, Y,
                                                    Q, numpredictions)

        # convert to data, columns dict output format
        # map codes to original values
        ## TODO: Add histogram call back in, but on Python client locally!
        #self._create_histogram(M_c, numpy.array(out), columns, col_indices, tablename+'_histogram')
        data = []
        for vals in out:
            row = []
            i = 0
            for idx in col_indices:
                if idx in where_col_idxs_to_vals:
                    row.append(where_col_idxs_to_vals[idx])
                else:
                    row.append(du.convert_code_to_value(M_c, idx, vals[i]))
                    i += 1
            data.append(row)
        ret = {'message': 'Simulated data:', 'columns': colnames, 'data': data}
        return ret
Example #5
0
    def infer(self,
              tablename,
              columnstring,
              newtablename,
              confidence,
              whereclause,
              limit,
              numsamples,
              order_by=False):
        """Impute missing values.
    Sample INFER: INFER columnstring FROM tablename WHERE whereclause WITH confidence LIMIT limit;
    Sample INFER INTO: INFER columnstring FROM tablename WHERE whereclause WITH confidence INTO newtablename LIMIT limit;
    Argument newtablename == null/emptystring if we don't want to do INTO

    Only imputations whose confidence is >= `confidence` are kept, up to
    `limit` of them; the final rows are produced by delegating to select().
    """
        # TODO: actually impute only missing values, instead of all values.
        X_L_list, X_D_list, M_c = self.persistence_layer.get_latent_states(
            tablename)
        M_c, M_r, T = self.persistence_layer.get_metadata_and_table(tablename)
        numrows = len(T)

        t_array = numpy.array(T, dtype=float)
        name_to_idx = M_c['name_to_idx']

        # '*' selects every column; otherwise a comma-separated name list.
        if '*' in columnstring:
            col_indices = name_to_idx.values()
        else:
            colnames = [colname.strip() for colname in columnstring.split(',')]
            col_indices = [name_to_idx[colname] for colname in colnames]

        # Queries Q: every (row, col) cell that is currently missing (NaN).
        Q = []
        for row_idx in range(numrows):
            for col_idx in col_indices:
                if numpy.isnan(t_array[row_idx, col_idx]):
                    Q.append([row_idx, col_idx])

        # FIXME: the purpose of the whereclause is to specify 'given'
        #        p(missing_value | X_L, X_D, whereclause)
        ## TODO: should all observed values besides the ones being imputed be givens?
        if whereclause == "" or '=' not in whereclause:
            Y = None
        else:
            varlist = [[c.strip() for c in b.split('=')]
                       for b in whereclause.split('AND')]
            # Givens attach to a hypothetical new row (index numrows + 1) and
            # are mapped from human-readable values to internal codes.
            Y = [(numrows + 1, name_to_idx[colname], colval)
                 for colname, colval in varlist]
            Y = [(r, c, du.convert_value_to_code(M_c, c, colval))
                 for r, c, colval in Y]

        # Impute cell by cell, keeping only sufficiently confident results.
        counter = 0
        ret = []
        for q in Q:
            out = self.backend.impute_and_confidence(M_c, X_L_list, X_D_list,
                                                     Y, [q], numsamples)
            value, conf = out
            if conf >= confidence:
                row_idx = q[0]
                col_idx = q[1]
                ret.append((row_idx, col_idx, value))
                counter += 1
                if counter >= limit:
                    break
        # Map imputed codes back to human-readable values.
        imputations_list = [(r, c, du.convert_code_to_value(M_c, c, code))
                            for r, c, code in ret]
        ## Convert into dict with r,c keys
        imputations_dict = defaultdict(dict)
        for r, c, val in imputations_list:
            imputations_dict[r][c] = val
        # Delegate final row selection/formatting to select().
        ret = self.select(tablename,
                          columnstring,
                          whereclause,
                          limit,
                          order_by=order_by,
                          imputations_dict=imputations_dict)
        return ret