Ejemplo n.º 1
0
def _probability(probability_args, row_id, data_values, M_c, X_L_list, X_D_list, T, engine, numsamples):
    """Return the predictive probability of observing `value` in column `c_idx`.

    Parameters:
      probability_args: (c_idx, value) pair -- column index and the raw value
        whose probability is requested.
      row_id: ignored; overwritten below so the query targets a hypothetical
        (unobserved) row rather than this row.
      data_values, T, numsamples: unused here; presumably kept so this helper
        shares a uniform signature with sibling row functions -- TODO confirm.
      M_c, X_L_list, X_D_list: metadata / latent-state structures forwarded
        to the backend.
      engine: object exposing call_backend(name, kwargs_dict).

    Returns:
      float probability, or 0 when `value` has no code in M_c for this column.
    """
    c_idx, value = probability_args
    # isinstance is the idiomatic type check; `type(x) == int` needlessly
    # rejects int subclasses and reads poorly. Plain-int behavior unchanged.
    assert isinstance(c_idx, int)
    try:
        observed = du.convert_value_to_code(M_c, c_idx, value)
    except KeyError:
        # value doesn't exist in the column's code table -> probability 0
        return 0
    # Query a brand-new row (1 + max existing row), so the backend treats the
    # cell as unobserved, instead of the row that was passed in.
    row_id = len(X_D_list[0][0]) + 1
    Q = [(row_id, c_idx, observed)]
    Y = []  # no conditioning constraints
    # Backend returns a log probability; exponentiate to a plain probability.
    p = math.exp(engine.call_backend(
        'simple_predictive_probability_multistate',
        dict(M_c=M_c, X_L_list=X_L_list, X_D_list=X_D_list, Y=Y, Q=Q)))
    return p
Ejemplo n.º 2
0
def _probability(probability_args, row_id, data_values, M_c, X_L_list, X_D_list, T, engine, numsamples):
    """Compute P(column == value) for a hypothetical row via the engine backend.

    Returns 0 when the value has no code in M_c; otherwise exponentiates the
    backend's log-probability for a fresh (unobserved) row.
    """
    col, raw_value = probability_args
    assert type(col) == int
    try:
        code = du.convert_value_to_code(M_c, col, raw_value)
    except KeyError:
        # value doesn't exist
        return 0
    # Target a fresh row (1 + max existing row) instead of the row passed in.
    fresh_row = len(X_D_list[0][0]) + 1
    query = [(fresh_row, col, code)]
    constraints = []
    log_p = engine.call_backend(
        'simple_predictive_probability_multistate',
        dict(M_c=M_c, X_L_list=X_L_list, X_D_list=X_D_list,
             Y=constraints, Q=query))
    return math.exp(log_p)
Ejemplo n.º 3
0
def parse_data_for_hist(colnames, data, M_c, schema_full, remove_key=False):
    """Shape query rows into the dict the histogram/plot renderers consume.

    Parameters:
      colnames: column names for `data` (may start with a key column).
      data: list of rows aligned with colnames.
      M_c: metadata dict providing 'name_to_idx', 'column_metadata', and
        optionally 'column_codebook' (short display names).
      schema_full: maps column name -> cctype ('numerical', 'categorical',
        'cyclic', ...).
      remove_key: when True, drop the leading key column and its values.

    Returns:
      dict whose 'datatype' selects the plot kind ('cont1D', 'mult1D',
      'contcont', 'multmult', 'multcont', or None for >2 columns), plus the
      data/label/title fields that kind needs.

    Raises:
      utils.BayesDBError when no row survives NaN filtering.
    """
    columns = colnames[:]
    # Remove key column if present
    if remove_key:
        columns.pop(0)
        data = [row[1:] for row in data]
    # Remove any rows with nan values.
    data = [row for row in data if not any_nan(row)]
    # Stop if there are no rows remaining after cleaning missing values.
    if len(data) == 0:
        raise utils.BayesDBError('There are no datapoints that contain values from every category '
                                 'specified. Try excluding columns with many NaN values.')

    # Pull items from M_c to simplify code throughout the rest of this function
    name_to_idx = M_c['name_to_idx']
    column_metadata = M_c['column_metadata']
    cctypes = [schema_full[column] for column in columns]

    # Treat cyclic as numerical until we establish what we want in a cyclic plot.
    for cctype_idx, cctype in enumerate(cctypes):
        if cctype == 'cyclic':
            cctypes[cctype_idx] = 'numerical'

    output = {}
    if len(columns) == 1:
        np_data = np.array([x[0] for x in data])

        # col_idx is None for non-columns (e.g. predictive functions), which
        # is allowed so that function results can still be plotted.
        col_idx = name_to_idx.get(columns[0])

        # Treat not-column (e.g. function) the same as numerical, since no code to value conversion.
        if col_idx is None or cctypes[0] == 'numerical':
            output['datatype'] = 'cont1D'
            output['data'] = np_data
        elif cctypes[0] == 'categorical':
            # col_idx is guaranteed non-None on this branch; reuse it instead
            # of performing a second name_to_idx lookup.
            unique_labels = sorted(column_metadata[col_idx]['code_to_value'].keys())
            output['datatype'] = 'mult1D'
            output['labels'] = unique_labels
            output['data'] = [sum(np_data == str(label)) for label in unique_labels]

        try:
            # try to get short names from M_c_full
            short_name = M_c['column_codebook'][col_idx]['short_name']
            output['axis_label'] = short_name
            output['title'] = short_name
        except KeyError:
            output['axis_label'] = columns[0]
            output['title'] = columns[0]

    elif len(columns) == 2:
        # Treat not-column (e.g. function) the same as numerical, since no code to value conversion.
        col_idx_1 = name_to_idx.get(columns[0])
        col_idx_2 = name_to_idx.get(columns[1])

        if cctypes[0] == 'numerical' and cctypes[1] == 'numerical':
            output['datatype'] = 'contcont'
            output['data_x'] = [x[0] for x in data]
            output['data_y'] = [x[1] for x in data]

        elif cctypes[0] == 'categorical' and cctypes[1] == 'categorical':
            counts = {}  # keys are (var 1 value, var 2 value)
            # data contains a tuple for each datapoint: (value of var 1, value of var 2)
            for row in data:
                key = tuple(row)
                counts[key] = counts.get(key, 0) + 1

            # these are the values.
            unique_xs = sorted(column_metadata[col_idx_2]['code_to_value'].keys())
            unique_ys = sorted(column_metadata[col_idx_1]['code_to_value'].keys())
            unique_ys.reverse()  # Hack to reverse the y's
            x_ordered_codes = [du.convert_value_to_code(M_c, col_idx_2, xval) for xval in unique_xs]
            y_ordered_codes = [du.convert_value_to_code(M_c, col_idx_1, yval) for yval in unique_ys]

            # Make count array: indexed by y index, x index.
            # (np.zeros: use the same `np` alias as the rest of this function,
            # instead of the bare `numpy` name.)
            counts_array = np.zeros(shape=(len(unique_ys), len(unique_xs)))
            for (y_val, x_val), n in counts.items():
                # convert each value to its code, then find that code's
                # position in the ordered axis lists
                y_index = y_ordered_codes.index(column_metadata[col_idx_1]['code_to_value'][y_val])
                x_index = x_ordered_codes.index(column_metadata[col_idx_2]['code_to_value'][x_val])
                counts_array[y_index][x_index] = float(n)
            output['datatype'] = 'multmult'
            output['data'] = counts_array
            output['labels_x'] = unique_xs
            output['labels_y'] = unique_ys

        elif 'numerical' in cctypes and 'categorical' in cctypes:
            output['datatype'] = 'multcont'
            categories = {}

            categorical_column = cctypes.index('categorical')

            groups = sorted(column_metadata[name_to_idx[columns[categorical_column]]]['code_to_value'].keys())
            for group in groups:
                categories[group] = []
            # Bucket the numerical values by their categorical group.
            for row in data:
                categories[row[categorical_column]].append(row[1 - categorical_column])

            output['groups'] = groups
            output['values'] = [categories[x] for x in groups]
            output['transpose'] = (categorical_column == 0)

        try:
            # try to get short names from M_c_full
            columns[0] = M_c['column_codebook'][col_idx_1]['short_name']
            columns[1] = M_c['column_codebook'][col_idx_2]['short_name']
        except KeyError:
            pass

        output['axis_label_x'] = columns[1]
        output['axis_label_y'] = columns[0]

        output['title'] = columns[0] + ' -versus- ' + columns[1]

    else:
        output['datatype'] = None

    return output
Ejemplo n.º 4
0
def parse_data_for_hist(colnames, data, M_c, schema_full, remove_key=False):
    """Shape query rows into the structure the histogram plotter expects.

    The returned dict's 'datatype' selects the plot kind ('cont1D', 'mult1D',
    'contcont', 'multmult', 'multcont', or None for more than two columns);
    the remaining keys carry the data, labels and titles that kind needs.
    Raises utils.BayesDBError when every row contains a NaN.
    """
    cols = colnames[:]
    if remove_key:
        # Drop the leading key column and its values.
        cols.pop(0)
        data = [r[1:] for r in data]
    # Discard any row containing a NaN.
    data = [r for r in data if not any_nan(r)]
    if not data:
        raise utils.BayesDBError('There are no datapoints that contain values from every category '
                                 'specified. Try excluding columns with many NaN values.')

    idx_of = M_c['name_to_idx']
    meta = M_c['column_metadata']
    # Cyclic columns are plotted as numerical for now.
    cctypes = ['numerical' if schema_full[c] == 'cyclic' else schema_full[c] for c in cols]

    result = {}
    if len(cols) == 1:
        values = np.array([r[0] for r in data])

        # idx is None for non-columns (e.g. predictive functions).
        idx = idx_of.get(cols[0])

        # Non-columns behave as numerical: there is no code/value conversion.
        if idx is None or cctypes[0] == 'numerical':
            result['datatype'] = 'cont1D'
            result['data'] = values
        elif cctypes[0] == 'categorical':
            labels = sorted(meta[idx_of[cols[0]]]['code_to_value'].keys())
            tallies = []
            for lab in labels:
                tallies.append(sum(values == str(lab)))
            result['datatype'] = 'mult1D'
            result['labels'] = labels
            result['data'] = tallies

        try:
            # Prefer the codebook's short display name when available.
            short = M_c['column_codebook'][idx]['short_name']
            result['axis_label'] = short
            result['title'] = short
        except KeyError:
            result['axis_label'] = cols[0]
            result['title'] = cols[0]

    elif len(cols) == 2:
        # Non-columns behave as numerical: there is no code/value conversion.
        idx1 = idx_of.get(cols[0])
        idx2 = idx_of.get(cols[1])

        if cctypes[0] == 'numerical' and cctypes[1] == 'numerical':
            result['datatype'] = 'contcont'
            result['data_x'] = [r[0] for r in data]
            result['data_y'] = [r[1] for r in data]

        elif cctypes[0] == 'categorical' and cctypes[1] == 'categorical':
            # Tally each (var 1 value, var 2 value) pair.
            pair_counts = {}
            for r in data:
                key = tuple(r)
                pair_counts[key] = pair_counts.get(key, 0) + 1

            # Ordered axis values; y runs reversed (drawn top-to-bottom).
            xs = sorted(meta[idx2]['code_to_value'].keys())
            ys = sorted(meta[idx1]['code_to_value'].keys(), reverse=True)
            x_codes = [du.convert_value_to_code(M_c, idx2, v) for v in xs]
            y_codes = [du.convert_value_to_code(M_c, idx1, v) for v in ys]

            # Count grid indexed by (y position, x position).
            grid = numpy.zeros(shape=(len(ys), len(xs)))
            for (y_val, x_val), n_obs in pair_counts.items():
                # value -> code, then the code's position along the axis
                r_pos = y_codes.index(meta[idx1]['code_to_value'][y_val])
                c_pos = x_codes.index(meta[idx2]['code_to_value'][x_val])
                grid[r_pos][c_pos] = float(n_obs)
            result['datatype'] = 'multmult'
            result['data'] = grid
            result['labels_x'] = xs
            result['labels_y'] = ys

        elif 'numerical' in cctypes and 'categorical' in cctypes:
            result['datatype'] = 'multcont'
            cat_pos = cctypes.index('categorical')

            groups = sorted(meta[idx_of[cols[cat_pos]]]['code_to_value'].keys())
            buckets = {g: [] for g in groups}
            # Bucket the numerical values by their categorical group.
            for r in data:
                buckets[r[cat_pos]].append(r[1 - cat_pos])

            result['groups'] = groups
            result['values'] = [buckets[g] for g in groups]
            result['transpose'] = (cat_pos == 0)

        try:
            # Prefer the codebook's short display names when available.
            cols[0] = M_c['column_codebook'][idx1]['short_name']
            cols[1] = M_c['column_codebook'][idx2]['short_name']
        except KeyError:
            pass

        result['axis_label_x'] = cols[1]
        result['axis_label_y'] = cols[0]
        result['title'] = cols[0] + ' -versus- ' + cols[1]

    else:
        result['datatype'] = None

    return result
Ejemplo n.º 5
0
def parse_data_for_hist(colnames, data, M_c, remove_key=False):
    """Shape query rows into the structure the histogram plotter expects.

    Parameters:
      colnames: column names for `data` (may start with a key column).
      data: list of rows (sequences) aligned with colnames.
      M_c: metadata dict providing 'name_to_idx' and 'column_metadata'
        (each column's 'modeltype' and 'code_to_value').
      remove_key: when True, the first column is a row key and is skipped.

    Returns:
      dict whose 'datatype' selects the plot kind ('cont1D', 'mult1D',
      'contcont', 'multmult', 'multcont', or None for >2 columns), plus the
      data/label/title fields that kind needs.

    Raises:
      utils.BayesDBError when no row survives NaN filtering.
    """
    # Drop rows containing any NaN value (comprehension instead of the
    # original hand-rolled nested loop with a no_nan flag).
    data_c = [row for row in data
              if not any(isinstance(v, float) and math.isnan(v) for v in row)]
    output = {}
    columns = colnames[:]
    if remove_key:
        columns.pop(0)
    if len(data_c) == 0:
        raise utils.BayesDBError('There are no datapoints that contain values from every category specified. Try excluding columns with many NaN values.')
    if len(columns) == 1:
        # Values with the row key (if any) stripped.
        data_no_id = [x[1] for x in data_c] if remove_key else [x[0] for x in data_c]
        output['axis_label'] = columns[0]
        output['title'] = columns[0]

        # col_idx is None for non-columns (e.g. predictive functions), which
        # is allowed so that function results can still be plotted.
        col_idx = M_c['name_to_idx'].get(columns[0])

        # Treat not-column the same as continuous, since no code to value conversion.
        if col_idx is None or M_c['column_metadata'][col_idx]['modeltype'] == 'normal_inverse_gamma':
            output['datatype'] = 'cont1D'
            output['data'] = np.array(data_no_id)

        elif M_c['column_metadata'][col_idx]['modeltype'] == 'symmetric_dirichlet_discrete':
            # col_idx is guaranteed non-None here; reuse it instead of a
            # second name_to_idx lookup.
            unique_labels = sorted(M_c['column_metadata'][col_idx]['code_to_value'].keys())
            np_data = np.array(data_no_id)
            output['datatype'] = 'mult1D'
            output['labels'] = unique_labels
            output['data'] = [sum(np_data == str(label)) for label in unique_labels]

    elif len(columns) == 2:
        if remove_key:
            data_no_id = [(x[1], x[2]) for x in data_c]
        else:
            data_no_id = [(x[0], x[1]) for x in data_c]

        # Model type per column; non-columns default to continuous
        # (normal_inverse_gamma), since there is no code/value conversion.
        col_idx_1 = M_c['name_to_idx'].get(columns[0])
        col_idx_2 = M_c['name_to_idx'].get(columns[1])
        types = (
            M_c['column_metadata'][col_idx_1]['modeltype'] if col_idx_1 is not None else 'normal_inverse_gamma',
            M_c['column_metadata'][col_idx_2]['modeltype'] if col_idx_2 is not None else 'normal_inverse_gamma',
        )

        output['axis_label_x'] = columns[1]
        output['axis_label_y'] = columns[0]
        output['title'] = columns[0] + ' -versus- ' + columns[1]

        if types[0] == 'normal_inverse_gamma' and types[1] == 'normal_inverse_gamma':
            output['datatype'] = 'contcont'
            output['data_x'] = [x[0] for x in data_no_id]
            output['data_y'] = [x[1] for x in data_no_id]

        elif types[0] == 'symmetric_dirichlet_discrete' and types[1] == 'symmetric_dirichlet_discrete':
            # Tally each (var 1 value, var 2 value) pair.
            counts = {}
            for point in data_no_id:
                counts[point] = counts.get(point, 0) + 1

            # Ordered axis values; y runs reversed (drawn top-to-bottom).
            unique_xs = sorted(M_c['column_metadata'][col_idx_2]['code_to_value'].keys())
            unique_ys = sorted(M_c['column_metadata'][col_idx_1]['code_to_value'].keys())
            unique_ys.reverse()  # Hack to reverse the y's
            x_ordered_codes = [du.convert_value_to_code(M_c, col_idx_2, xval) for xval in unique_xs]
            y_ordered_codes = [du.convert_value_to_code(M_c, col_idx_1, yval) for yval in unique_ys]

            # Make count array: indexed by y index, x index
            counts_array = numpy.zeros(shape=(len(unique_ys), len(unique_xs)))
            for (y_val, x_val), n in counts.items():
                # value -> code, then the code's position along the axis
                y_index = y_ordered_codes.index(M_c['column_metadata'][col_idx_1]['code_to_value'][y_val])
                x_index = x_ordered_codes.index(M_c['column_metadata'][col_idx_2]['code_to_value'][x_val])
                counts_array[y_index][x_index] = float(n)
            output['datatype'] = 'multmult'
            output['data'] = counts_array
            output['labels_x'] = unique_xs
            output['labels_y'] = unique_ys

        elif 'normal_inverse_gamma' in types and 'symmetric_dirichlet_discrete' in types:
            output['datatype'] = 'multcont'
            # Renamed from the original's `col`/`type` locals (`type` shadowed
            # the builtin): cat_col indexes the categorical column, num_col
            # the numerical one.
            if types[0] == 'normal_inverse_gamma':
                num_col, cat_col = 0, 1
            else:
                num_col, cat_col = 1, 0

            groups = sorted(M_c['column_metadata'][M_c['name_to_idx'][columns[cat_col]]]['code_to_value'].keys())
            categories = {g: [] for g in groups}
            # Bucket the numerical values by their categorical group.
            for point in data_no_id:
                categories[point[cat_col]].append(point[num_col])

            output['groups'] = groups
            output['values'] = [categories[x] for x in groups]
            output['transpose'] = (num_col == 1)

    else:
        output['datatype'] = None
    return output