Example #1
def __init__(self, *args):
    if len(args) != 4:
        raise ExaremeError('Illegal number of arguments.')
    self.args_X = args[0]
    self.args_Y = args[1]
    self.CategoricalVariablesWithDistinctValues = args[2]
    self.Hist = args[3]
Example #2
def main():
    # Parse arguments
    parser = ArgumentParser()
    parser.add_argument('-x', required=True, help='Variable names in x, comma separated.')
    parser.add_argument('-y', required=True, help='Variable names in y, comma separated.')
    parser.add_argument('-input_local_DB', required=True, help='Path to local db.')
    parser.add_argument('-db_query', required=True, help='Query to be executed on local db.')
    args, unknown = parser.parse_known_args()
    query = args.db_query
    fname_loc_db = path.abspath(args.input_local_DB)
    if args.x == '':
        raise ExaremeError('Field x must be non empty.')
    args_X = args.x.replace(' ', '').split(',')
    args_Y = args.y.replace(' ', '').split(',')
    # Populate schemata, treating the cases Y empty and Y non-empty accordingly, mirroring R's `cor` (see the sketch after this example)
    schema_X, schema_Y = [], []
    if args_Y == ['']:
        for i in xrange(len(args_X)):
            for j in xrange(i + 1, len(args_X)):
                schema_X.append(args_X[i])
                schema_Y.append(args_X[j])
        correlmatr_row_names = args_X
        correlmatr_col_names = args_X
    else:
        for i in xrange(len(args_X)):
            for j in xrange(len(args_Y)):
                schema_X.append(args_X[i])
                schema_Y.append(args_Y[j])
        correlmatr_col_names = args_X
        correlmatr_row_names = args_Y

    # Read data and split between X and Y matrices according to schemata
    schema, data = query_with_privacy(fname_db=fname_loc_db, query=query)
    data = np.array(data, dtype=np.float64)
    # Map schema names to column indices (names missing from the schema are silently skipped)
    idx_X = [schema.index(v) for v in schema_X if v in schema]
    idx_Y = [schema.index(v) for v in schema_Y if v in schema]
    X = data[:, idx_X]
    Y = data[:, idx_Y]
    local_in = X, Y, schema_X, schema_Y, correlmatr_row_names, correlmatr_col_names

    # Run algorithm local step
    local_out = pearsonr_local(local_in=local_in)

    # Return the output data (should be the last command)
    local_out.transfer()
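
The schema-pairing step above mirrors R's `cor`: with no y-variables it enumerates all unordered pairs of x-variables (the upper triangle of the correlation matrix); otherwise it forms the full Cartesian product of x- and y-variables. A minimal sketch of just that pairing logic; `build_schemata` and the variable names are illustrative, not from the original code:

def build_schemata(args_X, args_Y):
    # Hypothetical helper, for illustration only.
    schema_X, schema_Y = [], []
    if not args_Y:
        # No y-variables: pair each x-variable with every later one (upper triangle).
        for i in range(len(args_X)):
            for j in range(i + 1, len(args_X)):
                schema_X.append(args_X[i])
                schema_Y.append(args_X[j])
    else:
        # Full Cartesian product of x- and y-variables.
        for x in args_X:
            for y in args_Y:
                schema_X.append(x)
                schema_Y.append(y)
    return schema_X, schema_Y

print(build_schemata(['a', 'b', 'c'], []))     # (['a', 'a', 'b'], ['b', 'c', 'c'])
print(build_schemata(['a', 'b'], ['u', 'v']))  # (['a', 'a', 'b', 'b'], ['u', 'v', 'u', 'v'])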
Example #3
def __init__(self, args):
    if len(args) != 10:
        raise ExaremeError('Illegal number of arguments.')
    self.nn = args[0]
    self.sx = args[1]
    self.sy = args[2]
    self.sxx = args[3]
    self.sxy = args[4]
    self.syy = args[5]
    self.schema_X = args[6]
    self.schema_Y = args[7]
    self.correlmatr_row_names = args[8]
    self.correlmatr_col_names = args[9]
Example #4
def logregr_global_init(global_in):
    n_obs, n_cols, y_val_dict, schema_X, schema_Y = global_in.get_data()

    if n_obs == 0:
        raise ExaremeError('The selected variables contain 0 datapoints.')

    # Init vars
    ll = -2 * n_obs * np.log(2)  # initial value for all-zero coefficients (every fitted probability is 1/2)
    coeff = np.zeros(n_cols)
    iter = 0

    # Pack state and results
    global_state = StateData(n_obs=n_obs, n_cols=n_cols, ll=ll, coeff=coeff, iter=iter,
                             y_val_dict=y_val_dict, schema_X=schema_X, schema_Y=schema_Y)
    global_out = LogRegrIter_Glob2Loc_TD(coeff)

    return global_state, global_out
Example #5
def pearsonr_global(global_in):
    """Global step in Pearson correlation coefficient. Local statistics, computed in local step, are aggregated and
    then Pearson correlation coefficient `r`, p-value `prob` and lower and upper confidence intervals at 95% `ci_lo`
    and `ci_hi` are computed. Pearson correlation is computed according to standard formula
    (see https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#For_a_sample). The p-value is computed using
    the incomplete beta integral method. The lower and upper confidence intervals are computed usign the Fisher
    information (see https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#Using_the_Fisher_transformation).

    Parameters
    ----------
    global_in : PearsonCorrelationLocalDT
        Object holding aggregated values of statistics computed in local step.

    Returns
    -------
    global_out : str
        JSON string containing a list of results, one per variable pair; each result holds the variable pair
        names, the Pearson coefficient, the p-value, and the lower and upper confidence intervals.
    """
    nn, sx, sy, sxx, sxy, syy, schema_X, schema_Y, correlmatr_row_names, correlmatr_col_names = global_in.get_data()
    n_cols = len(nn)
    schema_out = [None] * n_cols
    r = [0] * n_cols
    prob = [0] * n_cols
    ci_lo = [0] * n_cols
    ci_hi = [0] * n_cols
    for i in xrange(n_cols):
        schema_out[i] = schema_X[i] + ' ~ ' + schema_Y[i]
        # Compute pearson correlation coefficient and p-value
        if nn[i] == 0:
            raise ExaremeError(
                'The variables chosen do not contain any datapoints.')
        else:
            d = (math.sqrt(nn[i] * sxx[i] - sx[i] * sx[i]) *
                 math.sqrt(nn[i] * syy[i] - sy[i] * sy[i]))
            if d == 0:
                r[i] = 0
            else:
                r[i] = float((nn[i] * sxy[i] - sx[i] * sy[i]) / d)
            r[i] = max(
                min(r[i], 1.0), -1.0
            )  # If abs(r) > 1 correct it: artifact of floating point arithmetic.
            df = nn[i] - 2
            if abs(r[i]) == 1.0:
                prob[i] = 0.0
            else:
                t_squared = r[i]**2 * (df / ((1.0 - r[i]) * (1.0 + r[i])))
                prob[i] = special.betainc(
                    0.5 * df, 0.5,
                    np.fmin(np.asarray(df / (df + t_squared)), 1.0))
        # Compute 95% confidence intervals
        alpha = 0.05  # Two-tail test with confidence intervals 95%
        if r[i] is not None:
            r_z = np.arctanh(r[i])
            se = 1 / np.sqrt(nn[i] - 3)
            z = st.norm.ppf(1 - alpha / 2)
            lo_z, hi_z = r_z - z * se, r_z + z * se
            ci_lo[i], ci_hi[i] = np.tanh((lo_z, hi_z))
        else:
            raise ValueError('Pearson coefficient is NaN.')

    # Format output data
    # JSON raw
    result_list = []
    for i in xrange(n_cols):
        result_list.append({
            'Variables':
            schema_out[i],
            'Pearson correlation coefficient':
            r[i],
            'p-value':
            prob[i] if prob[i] >= P_VALUE_CUTOFF else P_VALUE_CUTOFF_STR,
            'C.I. Lower':
            ci_lo[i],
            'C.I. Upper':
            ci_hi[i]
        })
    # Tabular summary
    tabular_data_summary = [[
        "variables", "Pearson correlation coefficient", "p-value",
        "lower c.i.", "upper c.i."
    ]]
    for i in xrange(n_cols):
        tabular_data_summary.append([
            schema_out[i], r[i],
            prob[i] if prob[i] >= P_VALUE_CUTOFF else P_VALUE_CUTOFF_STR,
            ci_lo[i], ci_hi[i]
        ])
    tabular_data_summary_schema_fields = [
        {
            "name": "variables",
            "type": "string"
        },
        {
            "name": "Pearson correlation coefficient",
            "type": "number"
        },
        {
            "name": "p-value",
            "type": "string"
        },
        {
            "name": "lower c.i.",
            "type": "number"
        },
        {
            "name": "upper c.i.",
            "type": "number"
        },
    ]
    # Highchart Correlation Matrix
    correlmatr_data = []
    for i, varx in enumerate(correlmatr_col_names):
        for j, vary in enumerate(correlmatr_row_names):
            if varx == vary:
                corr = 1.0
            else:
                if varx + ' ~ ' + vary in schema_out:
                    idx = schema_out.index(varx + ' ~ ' + vary)
                elif vary + ' ~ ' + varx in schema_out:
                    idx = schema_out.index(vary + ' ~ ' + varx)
                else:
                    raise ValueError('Variable names do not agree.')
                corr = r[idx]
            correlmatr_data.append({
                'x': i,
                'y': j,
                'value': round(corr, 4),
                'name': varx + ' ~ ' + vary
            })
    hichart_correl_matr = {
        'chart': {
            'type': 'heatmap',
            'plotBorderWidth': 1
        },
        'title': {
            'text': 'Pearson Correlation Matrix'
        },
        'xAxis': {
            'categories': correlmatr_col_names
        },
        'yAxis': {
            'categories': correlmatr_row_names,
            'title': 'null'
        },
        'colorAxis': {
            'stops': [[0, '#c4463a'], [0.5, '#ffffff'], [0.9, '#3060cf']],
            'min': -1,
            'max': 1,
            'minColor': '#FFFFFF',
            'maxColor': "#6699ff"
        },
        'legend': {
            'align': 'right',
            'layout': 'vertical',
            'margin': 0,
            'verticalAlign': 'top',
            'y': 25,
            'symbolHeight': 280
        },
        'tooltip': {
            'headerFormat': '',
            'pointFormat': '<b>{point.name}: {point.value}</b>',
            'enabled': True
        },
        'series': [{
            'name': 'coefficients',
            'borderWidth': 1,
            'data': correlmatr_data,
            'dataLabels': {
                'enabled': True,
                'color': '#000000'
            }
        }]
    }
    # Write output to JSON
    result = {
        'result': [
            # Raw results
            {
                "type": "application/json",
                "data": result_list
            },
            # Tabular data resource summary
            {
                "type":
                "application/vnd.dataresource+json",
                "data": [{
                    "name": "Pearson correlation summary",
                    "profile": "tabular-data-resource",
                    "data": tabular_data_summary,
                    "schema": {
                        "fields": tabular_data_summary_schema_fields
                    }
                }]
            },
            # Highchart correlation matrix
            {
                "type": "application/vnd.highcharts+json",
                "data": hichart_correl_matr
            }
        ]
    }
    try:
        global_out = json.dumps(result, allow_nan=False)
    except ValueError:
        print('Result contains NaNs.')
    else:
        return global_out
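
As a sanity check on the arithmetic above, the following self-contained sketch recomputes `r`, the incomplete-beta p-value and the Fisher-transformation confidence bounds from the same aggregated sums (`nn`, `sx`, `sy`, `sxx`, `sxy`, `syy`) and compares them with `scipy.stats.pearsonr` on toy data. The helper name `pearson_from_sums` and the toy data are hypothetical; only the formulas come from the example.

import numpy as np
from scipy import special, stats

def pearson_from_sums(n, sx, sy, sxx, sxy, syy, alpha=0.05):
    # Correlation coefficient from aggregated sums (standard sample formula).
    d = np.sqrt(n * sxx - sx * sx) * np.sqrt(n * syy - sy * sy)
    r = float((n * sxy - sx * sy) / d) if d != 0 else 0.0
    r = max(min(r, 1.0), -1.0)  # clamp floating-point artifacts
    # Two-sided p-value via the incomplete beta integral.
    df = n - 2
    if abs(r) == 1.0:
        prob = 0.0
    else:
        t_squared = r ** 2 * (df / ((1.0 - r) * (1.0 + r)))
        prob = special.betainc(0.5 * df, 0.5, min(df / (df + t_squared), 1.0))
    # 95% confidence bounds via the Fisher transformation.
    se = 1.0 / np.sqrt(n - 3)
    z = stats.norm.ppf(1 - alpha / 2)
    r_z = np.arctanh(r)
    ci_lo, ci_hi = np.tanh((r_z - z * se, r_z + z * se))
    return r, prob, ci_lo, ci_hi

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([1.2, 1.9, 3.3, 3.8, 5.4])
r, p, lo, hi = pearson_from_sums(len(x), x.sum(), y.sum(),
                                 (x * x).sum(), (x * y).sum(), (y * y).sum())
r_ref, p_ref = stats.pearsonr(x, y)  # r and p should match r_ref and p_ref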
Example #6
def __init__(self, *args):
    if len(args) != 1:
        raise ExaremeError('Illegal number of arguments.')
    self.localstatistics = args[0]
Example #7
def main():

    # Parse arguments
    parser = ArgumentParser()
    parser.add_argument('-x',
                        required=True,
                        help='Variable names, comma separated.')
    parser.add_argument('-y',
                        required=True,
                        help='Categorical variable names, comma separated.')
    parser.add_argument(
        '-bins',
        required=True,
        help='Dictionary of variables names (key) and number of bins (value)')
    parser.add_argument('-input_local_DB',
                        required=True,
                        help='Path to local db.')
    parser.add_argument('-db_query',
                        required=True,
                        help='Query to be executed on local db.')
    parser.add_argument(
        '-cur_state_pkl',
        required=True,
        help='Path to the pickle file holding the current state.')
    args, unknown = parser.parse_known_args()
    query = args.db_query
    fname_cur_state = path.abspath(args.cur_state_pkl)
    fname_loc_db = path.abspath(args.input_local_DB)

    if args.x == '':
        raise ExaremeError('Field x must be non empty.')

    # Get data
    args_X = args.x.replace(' ', '').split(',')
    if args.y == '':
        args_Y = []
    else:
        args_Y = args.y.replace(' ', '').split(',')
    varNames = "'" + "','".join(args_X + args_Y) + "'"
    if args.bins == '':
        args_bins = {}
    else:
        args_bins = json.loads(args.bins)
        #args_bins = dict( (str(key), val) for key, val in args_bins.items())

    # Metadata query, assembled by string concatenation (see the parameterized sketch after this example)
    queryMetadata = "select * from metadata where code in (" + varNames + ");"
    dataSchema, metadataSchema, metadata, dataFrame = query_database(
        fname_db=fname_loc_db, queryData=query, queryMetadata=queryMetadata)
    CategoricalVariablesWithDistinctValues = variable_categorical_getDistinctValues(
        metadata)

    # Check the bins input: every non-categorical variable needs a bin count
    for varx in args_X:
        if varx not in CategoricalVariablesWithDistinctValues:
            if varx not in args_bins:
                raise ExaremeError(
                    'Bin value is not defined for at least one non-categorical variable, i.e. '
                    + varx)

    # Run algorithm local step
    localStatistics = run_local_step(args_X, args_Y, args_bins, dataSchema,
                                     CategoricalVariablesWithDistinctValues,
                                     dataFrame)

    # Save local state
    local_state = StateData(args_X=args_X,
                            args_Y=args_Y,
                            args_bins=args_bins,
                            dataSchema=dataSchema,
                            CategoricalVariablesWithDistinctValues=CategoricalVariablesWithDistinctValues,
                            dataFrame=dataFrame)
    local_state.save(fname=fname_cur_state)

    # Transfer local output
    local_out = multipleHist1_Loc2Glob_TD(localStatistics)
    #raise ValueError( local_out.get_data())
    local_out.transfer()
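
The metadata query in this example is built by string concatenation around hand-quoted variable names. Assuming the local database is SQLite (an assumption; the original helpers hide the engine), a parameterized query with the standard sqlite3 module avoids the hand quoting entirely. `fetch_metadata` is a hypothetical helper, not part of the original code:

import sqlite3

def fetch_metadata(db_path, var_names):
    # One placeholder per variable name; sqlite3 binds the values safely,
    # so no hand-built "'a','b','c'" string is needed.
    placeholders = ','.join('?' for _ in var_names)
    query = 'select * from metadata where code in (%s);' % placeholders
    conn = sqlite3.connect(db_path)
    try:
        return conn.execute(query, list(var_names)).fetchall()
    finally:
        conn.close()

# e.g. fetch_metadata('/path/to/local.db', args_X + args_Y)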