def _probability(probability_args, row_id, data_values, M_c, X_L_list, X_D_list, T,
                 engine, numsamples):
    """Evaluate the probability that a column takes a given value.

    Args:
        probability_args: (c_idx, value) pair -- the column index and the raw
            value whose probability is requested.
        row_id: ignored; deliberately overwritten below with a hypothetical row
            one past the end of the data (see comment).
        data_values, T, numsamples: unused here; kept for signature
            compatibility with sibling predictive-function helpers.
        M_c: metadata dict used for value<->code conversion.
        X_L_list, X_D_list: latent-state lists passed through to the backend.
        engine: object exposing call_backend(name, kwargs) -> log probability.

    Returns:
        The probability (exp of the backend's log probability), or 0 if the
        value has no code in M_c (i.e. it was never observed).
    """
    c_idx, value = probability_args
    # isinstance is the idiomatic type check (was: type(c_idx) == int).
    assert isinstance(c_idx, int)
    try:
        observed = du.convert_value_to_code(M_c, c_idx, value)
    except KeyError:
        # Value doesn't exist in the codebook -- probability mass is 0.
        return 0
    # Row is set to 1 + max row, instead of this row: query a hypothetical
    # (unobserved) row so the result is not conditioned on this row's data.
    row_id = len(X_D_list[0][0]) + 1
    Q = [(row_id, c_idx, observed)]
    Y = []  # no conditioning observations
    p = math.exp(engine.call_backend(
        'simple_predictive_probability_multistate',
        dict(M_c=M_c, X_L_list=X_L_list, X_D_list=X_D_list, Y=Y, Q=Q)))
    return p
def parse_data_for_hist(colnames, data, M_c, schema_full, remove_key=False):
    """Shape query results into a dict describing a 1D or 2D histogram/plot.

    Args:
        colnames: list of selected column names (possibly led by a key column).
        data: list of row tuples, aligned with colnames.
        M_c: metadata dict with 'name_to_idx', 'column_metadata' and optionally
            'column_codebook' (for short display names).
        schema_full: maps column name -> cctype ('numerical', 'categorical',
            'cyclic', ...).
        remove_key: if True, drop the first column (the row key) from both
            colnames and every data row.

    Returns:
        dict with 'datatype' in {'cont1D', 'mult1D', 'contcont', 'multmult',
        'multcont', None} plus datatype-specific payload and axis labels.

    Raises:
        utils.BayesDBError: if every row contains a NaN and nothing is left
            to plot.
    """
    columns = colnames[:]
    # Remove key column if present.
    if remove_key:
        columns.pop(0)
        data = [row[1:] for row in data]
    # Remove any rows with nan values.
    data = [row for row in data if not any_nan(row)]
    # Stop if there are no rows remaining after cleaning missing values.
    if len(data) == 0:
        raise utils.BayesDBError('There are no datapoints that contain values from every category '
                                 'specified. Try excluding columns with many NaN values.')
    # Pull items from M_c to simplify code throughout the rest of this function.
    name_to_idx = M_c['name_to_idx']
    column_metadata = M_c['column_metadata']
    cctypes = [schema_full[column] for column in columns]
    # Treat cyclic as numerical until we establish what we want in a cyclic plot.
    for cctype_idx, cctype in enumerate(cctypes):
        if cctype == 'cyclic':
            cctypes[cctype_idx] = 'numerical'
    output = {}
    if len(columns) == 1:
        np_data = np.array([x[0] for x in data])
        # Allow col_idx to be None, to allow for predictive functions to be plotted.
        if columns[0] in name_to_idx:
            col_idx = name_to_idx[columns[0]]
        else:
            col_idx = None
        # Treat not-column (e.g. function) the same as numerical, since no code
        # to value conversion exists for it.
        if col_idx is None or cctypes[0] == 'numerical':
            output['datatype'] = 'cont1D'
            output['data'] = np_data
        elif cctypes[0] == 'categorical':
            unique_labels = sorted(column_metadata[name_to_idx[columns[0]]]['code_to_value'].keys())
            counts = []
            for label in unique_labels:
                counts.append(sum(np_data == str(label)))
            output['datatype'] = 'mult1D'
            output['labels'] = unique_labels
            output['data'] = counts
        try:
            # Try to get short names from M_c_full.
            # NOTE(review): if col_idx is None and 'column_codebook' is a list,
            # this raises TypeError (not KeyError) -- confirm codebook type.
            short_name = M_c['column_codebook'][col_idx]['short_name']
            output['axis_label'] = short_name
            output['title'] = short_name
        except KeyError:
            output['axis_label'] = columns[0]
            output['title'] = columns[0]
    elif len(columns) == 2:
        # Treat not-column (e.g. function) the same as numerical, since no code
        # to value conversion exists for it.
        if columns[0] in name_to_idx:
            col_idx_1 = name_to_idx[columns[0]]
        else:
            col_idx_1 = None
        if columns[1] in name_to_idx:
            col_idx_2 = name_to_idx[columns[1]]
        else:
            col_idx_2 = None
        if cctypes[0] == 'numerical' and cctypes[1] == 'numerical':
            output['datatype'] = 'contcont'
            output['data_x'] = [x[0] for x in data]
            output['data_y'] = [x[1] for x in data]
        elif cctypes[0] == 'categorical' and cctypes[1] == 'categorical':
            counts = {}  # keys are (var 1 value, var 2 value)
            # data contains a tuple for each datapoint: (value of var 1, value of var 2)
            for row in data:
                row = tuple(row)
                if row in counts:
                    counts[row] += 1
                else:
                    counts[row] = 1
            # These are the values.
            unique_xs = sorted(column_metadata[col_idx_2]['code_to_value'].keys())
            unique_ys = sorted(column_metadata[col_idx_1]['code_to_value'].keys())
            unique_ys.reverse()  # Hack to reverse the y's
            x_ordered_codes = [du.convert_value_to_code(M_c, col_idx_2, xval)
                               for xval in unique_xs]
            y_ordered_codes = [du.convert_value_to_code(M_c, col_idx_1, yval)
                               for yval in unique_ys]
            # Make count array: indexed by y index, x index.
            # Use the np alias consistently (was: numpy.zeros in an np.array function).
            counts_array = np.zeros(shape=(len(unique_ys), len(unique_xs)))
            for i in counts:
                # This converts from value to code.
                y_index = y_ordered_codes.index(column_metadata[col_idx_1]['code_to_value'][i[0]])
                x_index = x_ordered_codes.index(column_metadata[col_idx_2]['code_to_value'][i[1]])
                counts_array[y_index][x_index] = float(counts[i])
            output['datatype'] = 'multmult'
            output['data'] = counts_array
            output['labels_x'] = unique_xs
            output['labels_y'] = unique_ys
        elif 'numerical' in cctypes and 'categorical' in cctypes:
            output['datatype'] = 'multcont'
            categories = {}
            categorical_column = cctypes.index('categorical')
            groups = sorted(
                column_metadata[name_to_idx[columns[categorical_column]]]['code_to_value'].keys())
            for i in groups:
                categories[i] = []
            for i in data:
                categories[i[categorical_column]].append(i[1 - categorical_column])
            output['groups'] = groups
            output['values'] = [categories[x] for x in groups]
            output['transpose'] = (categorical_column == 0)
        try:
            # Try to get short names from M_c_full.
            columns[0] = M_c['column_codebook'][col_idx_1]['short_name']
            columns[1] = M_c['column_codebook'][col_idx_2]['short_name']
        except KeyError:
            pass
        output['axis_label_x'] = columns[1]
        output['axis_label_y'] = columns[0]
        output['title'] = columns[0] + ' -versus- ' + columns[1]
    else:
        output['datatype'] = None
    return output
def parse_data_for_hist(colnames, data, M_c, schema_full, remove_key=False):
    """Shape query results into a dict describing a 1D or 2D histogram/plot.

    NOTE(review): this file contains multiple definitions of
    parse_data_for_hist; at import time only the last definition survives --
    confirm which one is intended.

    Args:
        colnames: list of selected column names (possibly led by a key column).
        data: list of row tuples, aligned with colnames.
        M_c: metadata dict with 'name_to_idx', 'column_metadata' and optionally
            'column_codebook' (for short display names).
        schema_full: maps column name -> cctype ('numerical', 'categorical',
            'cyclic', ...).
        remove_key: if True, drop the first column (the row key) from both
            colnames and every data row.

    Returns:
        dict with 'datatype' in {'cont1D', 'mult1D', 'contcont', 'multmult',
        'multcont', None} plus datatype-specific payload and axis labels.

    Raises:
        utils.BayesDBError: if every row contains a NaN and nothing is left
            to plot.
    """
    columns = colnames[:]
    # Remove key column if present
    if remove_key:
        columns.pop(0)
        data = [row[1:] for row in data]
    # Remove any rows with nan values.
    data = [row for row in data if not any_nan(row)]
    # Stop if there are no rows remaining after cleaning missing values.
    if len(data) == 0:
        raise utils.BayesDBError(
            'There are no datapoints that contain values from every category '
            'specified. Try excluding columns with many NaN values.')
    # Pull items from M_c to simplify code throughout the rest of this function
    name_to_idx = M_c['name_to_idx']
    column_metadata = M_c['column_metadata']
    cctypes = [schema_full[column] for column in columns]
    # Treat cyclic as numerical until we establish what we want in a cyclic plot.
    for cctype_idx, cctype in enumerate(cctypes):
        if cctype == 'cyclic':
            cctypes[cctype_idx] = 'numerical'
    output = {}
    if len(columns) == 1:
        np_data = np.array([x[0] for x in data])
        # Allow col_idx to be None, to allow for predictive functions to be plotted.
        if columns[0] in name_to_idx:
            col_idx = name_to_idx[columns[0]]
        else:
            col_idx = None
        # Treat not-column (e.g. function) the same as numerical, since no code to value conversion.
        if col_idx is None or cctypes[0] == 'numerical':
            output['datatype'] = 'cont1D'
            output['data'] = np_data
        elif cctypes[0] == 'categorical':
            # Counts per label: labels are compared as strings against np_data.
            unique_labels = sorted(column_metadata[name_to_idx[columns[0]]]
                                   ['code_to_value'].keys())
            counts = []
            for label in unique_labels:
                counts.append(sum(np_data == str(label)))
            output['datatype'] = 'mult1D'
            output['labels'] = unique_labels
            output['data'] = counts
        try:
            # try to get short names from M_c_full
            short_name = M_c['column_codebook'][col_idx]['short_name']
            output['axis_label'] = short_name
            output['title'] = short_name
        except KeyError:
            # No codebook entry -- fall back to the raw column name.
            output['axis_label'] = columns[0]
            output['title'] = columns[0]
    elif len(columns) == 2:
        # Treat not-column (e.g. function) the same as numerical, since no code to value conversion.
        if columns[0] in name_to_idx:
            col_idx_1 = name_to_idx[columns[0]]
        else:
            col_idx_1 = None
        if columns[1] in name_to_idx:
            col_idx_2 = name_to_idx[columns[1]]
        else:
            col_idx_2 = None
        if cctypes[0] == 'numerical' and cctypes[1] == 'numerical':
            # Numerical x numerical -> scatter-style data.
            output['datatype'] = 'contcont'
            output['data_x'] = [x[0] for x in data]
            output['data_y'] = [x[1] for x in data]
        elif cctypes[0] == 'categorical' and cctypes[1] == 'categorical':
            counts = {}  # keys are (var 1 value, var 2 value)
            # data contains a tuple for each datapoint: (value of var 1, value of var 2)
            for row in data:
                row = tuple(row)
                if row in counts:
                    counts[row] += 1
                else:
                    counts[row] = 1
            # these are the values.
            unique_xs = sorted(
                column_metadata[col_idx_2]['code_to_value'].keys())
            unique_ys = sorted(
                column_metadata[col_idx_1]['code_to_value'].keys())
            unique_ys.reverse()  # Hack to reverse the y's
            x_ordered_codes = [
                du.convert_value_to_code(M_c, col_idx_2, xval)
                for xval in unique_xs
            ]
            y_ordered_codes = [
                du.convert_value_to_code(M_c, col_idx_1, yval)
                for yval in unique_ys
            ]
            # Make count array: indexed by y index, x index
            counts_array = numpy.zeros(shape=(len(unique_ys), len(unique_xs)))
            for i in counts:
                # this converts from value to code
                y_index = y_ordered_codes.index(
                    column_metadata[col_idx_1]['code_to_value'][i[0]])
                x_index = x_ordered_codes.index(
                    column_metadata[col_idx_2]['code_to_value'][i[1]])
                counts_array[y_index][x_index] = float(counts[i])
            output['datatype'] = 'multmult'
            output['data'] = counts_array
            output['labels_x'] = unique_xs
            output['labels_y'] = unique_ys
        elif 'numerical' in cctypes and 'categorical' in cctypes:
            # Mixed pair -> values grouped by the categorical column.
            output['datatype'] = 'multcont'
            categories = {}
            categorical_column = cctypes.index('categorical')
            groups = sorted(column_metadata[name_to_idx[
                columns[categorical_column]]]['code_to_value'].keys())
            for i in groups:
                categories[i] = []
            for i in data:
                categories[i[categorical_column]].append(i[1 - categorical_column])
            output['groups'] = groups
            output['values'] = [categories[x] for x in groups]
            # transpose flags that the categorical variable came first.
            output['transpose'] = (categorical_column == 0)
        try:
            # try to get short names from M_c_full
            columns[0] = M_c['column_codebook'][col_idx_1]['short_name']
            columns[1] = M_c['column_codebook'][col_idx_2]['short_name']
        except KeyError:
            pass
        output['axis_label_x'] = columns[1]
        output['axis_label_y'] = columns[0]
        output['title'] = columns[0] + ' -versus- ' + columns[1]
    else:
        # More than two columns (or zero) is not plottable here.
        output['datatype'] = None
    return output
def parse_data_for_hist(colnames, data, M_c, remove_key=False):
    """Shape query results into a dict describing a 1D or 2D histogram/plot.

    This variant infers column kinds from M_c['column_metadata'][i]['modeltype']
    ('normal_inverse_gamma' = continuous, 'symmetric_dirichlet_discrete' =
    categorical) rather than from a schema dict.

    Args:
        colnames: list of selected column names (possibly led by a key column).
        data: list of row tuples, aligned with colnames.
        M_c: metadata dict with 'name_to_idx' and 'column_metadata'.
        remove_key: if True, the first column is a row key; it is dropped from
            colnames and skipped when extracting values.

    Returns:
        dict with 'datatype' in {'cont1D', 'mult1D', 'contcont', 'multmult',
        'multcont', None} plus datatype-specific payload and axis labels.

    Raises:
        utils.BayesDBError: if every row contains a NaN and nothing is left
            to plot.
    """
    # Drop any row containing a float NaN (equivalent to the original
    # nested-loop filter).
    data_c = [row for row in data
              if not any(isinstance(v, float) and math.isnan(v) for v in row)]
    output = {}
    columns = colnames[:]
    data_no_id = []  # This will be the data with the row_ids removed, if present.
    if remove_key:
        columns.pop(0)
    if len(data_c) == 0:
        raise utils.BayesDBError('There are no datapoints that contain values from every category specified. Try excluding columns with many NaN values.')
    if len(columns) == 1:
        # Skip the key column in each row when present.
        if remove_key:
            data_no_id = [x[1] for x in data_c]
        else:
            data_no_id = [x[0] for x in data_c]
        output['axis_label'] = columns[0]
        output['title'] = columns[0]
        # Allow col_idx to be None, to allow for predictive functions to be plotted.
        if columns[0] in M_c['name_to_idx']:
            col_idx = M_c['name_to_idx'][columns[0]]
        else:
            col_idx = None
        # Treat not-column (e.g. function) the same as continuous, since no
        # code to value conversion exists for it.
        if col_idx is None or M_c['column_metadata'][col_idx]['modeltype'] == 'normal_inverse_gamma':
            output['datatype'] = 'cont1D'
            output['data'] = np.array(data_no_id)
        elif M_c['column_metadata'][col_idx]['modeltype'] == 'symmetric_dirichlet_discrete':
            # Count occurrences of each label; labels are compared as strings.
            unique_labels = sorted(
                M_c['column_metadata'][M_c['name_to_idx'][columns[0]]]['code_to_value'].keys())
            np_data = np.array(data_no_id)
            counts = []
            for label in unique_labels:
                counts.append(sum(np_data == str(label)))
            output['datatype'] = 'mult1D'
            output['labels'] = unique_labels
            output['data'] = counts
    elif len(columns) == 2:
        if remove_key:
            data_no_id = [(x[1], x[2]) for x in data_c]
        else:
            data_no_id = [(x[0], x[1]) for x in data_c]
        types = []
        # Treat not-column (e.g. function) the same as continuous, since no
        # code to value conversion exists for it.
        if columns[0] in M_c['name_to_idx']:
            col_idx_1 = M_c['name_to_idx'][columns[0]]
            types.append(M_c['column_metadata'][col_idx_1]['modeltype'])
        else:
            col_idx_1 = None
            types.append('normal_inverse_gamma')
        if columns[1] in M_c['name_to_idx']:
            col_idx_2 = M_c['name_to_idx'][columns[1]]
            types.append(M_c['column_metadata'][col_idx_2]['modeltype'])
        else:
            col_idx_2 = None
            types.append('normal_inverse_gamma')
        types = tuple(types)
        output['axis_label_x'] = columns[1]
        output['axis_label_y'] = columns[0]
        output['title'] = columns[0] + ' -versus- ' + columns[1]
        if types[0] == 'normal_inverse_gamma' and types[1] == 'normal_inverse_gamma':
            output['datatype'] = 'contcont'
            output['data_x'] = [x[0] for x in data_no_id]
            output['data_y'] = [x[1] for x in data_no_id]
        elif types[0] == 'symmetric_dirichlet_discrete' and types[1] == 'symmetric_dirichlet_discrete':
            counts = {}  # keys are (var 1 value, var 2 value)
            # data_no_id is a tuple for each datapoint: (value of var 1, value of var 2)
            for pair in data_no_id:
                if pair in counts:
                    counts[pair] += 1
                else:
                    counts[pair] = 1
            # These are the values.
            unique_xs = sorted(M_c['column_metadata'][col_idx_2]['code_to_value'].keys())
            unique_ys = sorted(M_c['column_metadata'][col_idx_1]['code_to_value'].keys())
            unique_ys.reverse()  # Hack to reverse the y's
            x_ordered_codes = [du.convert_value_to_code(M_c, col_idx_2, xval)
                               for xval in unique_xs]
            y_ordered_codes = [du.convert_value_to_code(M_c, col_idx_1, yval)
                               for yval in unique_ys]
            # Make count array: indexed by y index, x index.
            # Use the np alias consistently (was: numpy.zeros in an np.array function).
            counts_array = np.zeros(shape=(len(unique_ys), len(unique_xs)))
            for pair in counts:
                # This converts from value to code.
                y_index = y_ordered_codes.index(
                    M_c['column_metadata'][col_idx_1]['code_to_value'][pair[0]])
                x_index = x_ordered_codes.index(
                    M_c['column_metadata'][col_idx_2]['code_to_value'][pair[1]])
                counts_array[y_index][x_index] = float(counts[pair])
            output['datatype'] = 'multmult'
            output['data'] = counts_array
            output['labels_x'] = unique_xs
            output['labels_y'] = unique_ys
        elif 'normal_inverse_gamma' in types and 'symmetric_dirichlet_discrete' in types:
            output['datatype'] = 'multcont'
            categories = {}
            # Renamed from 'col'/'type' (the latter shadowed the builtin):
            # cat_idx is the categorical column, num_idx the numerical one.
            cat_idx = 0
            num_idx = 1
            if types[0] == 'normal_inverse_gamma':
                num_idx = 0
                cat_idx = 1
            groups = sorted(
                M_c['column_metadata'][M_c['name_to_idx'][columns[cat_idx]]]['code_to_value'].keys())
            for g in groups:
                categories[g] = []
            for point in data_no_id:
                categories[point[cat_idx]].append(point[num_idx])
            output['groups'] = groups
            output['values'] = [categories[x] for x in groups]
            # True when the numerical variable is the second column, i.e. the
            # categorical variable came first.
            output['transpose'] = (num_idx == 1)
    else:
        output['datatype'] = None
    return output