def __init__(self, *args):
    if len(args) != 4:
        raise ExaremeError('Illegal number of arguments.')
    self.args_X = args[0]
    self.args_Y = args[1]
    self.CategoricalVariablesWithDistinctValues = args[2]
    self.Hist = args[3]
def main():
    # Parse arguments
    parser = ArgumentParser()
    parser.add_argument('-x', required=True,
                        help='Variable names in x, comma separated.')
    parser.add_argument('-y', required=True,
                        help='Variable names in y, comma separated.')
    parser.add_argument('-input_local_DB', required=True,
                        help='Path to local db.')
    parser.add_argument('-db_query', required=True,
                        help='Query to be executed on local db.')
    args, unknown = parser.parse_known_args()
    query = args.db_query
    fname_loc_db = path.abspath(args.input_local_DB)
    if args.x == '':
        raise ExaremeError('Field x must be non empty.')
    args_X = list(args.x.replace(' ', '').split(','))
    args_Y = list(args.y.replace(' ', '').split(','))

    # Populate schemata, treating cases Y=empty and Y=not empty accordingly
    # (behaviour of R function `cor`); see the sketch after this function.
    schema_X, schema_Y = [], []
    if args_Y == ['']:
        for i in xrange(len(args_X)):
            for j in xrange(i + 1, len(args_X)):
                schema_X.append(args_X[i])
                schema_Y.append(args_X[j])
        correlmatr_row_names = args_X
        correlmatr_col_names = args_X
    else:
        for i in xrange(len(args_X)):
            for j in xrange(len(args_Y)):
                schema_X.append(args_X[i])
                schema_Y.append(args_Y[j])
        correlmatr_col_names = args_X
        correlmatr_row_names = args_Y

    # Read data and split between X and Y matrices according to schemata
    schema, data = query_with_privacy(fname_db=fname_loc_db, query=query)
    data = np.array(data, dtype=np.float64)
    idx_X = [schema.index(v) for v in schema_X if v in schema]
    idx_Y = [schema.index(v) for v in schema_Y if v in schema]
    X = data[:, idx_X]
    Y = data[:, idx_Y]
    local_in = X, Y, schema_X, schema_Y, correlmatr_row_names, correlmatr_col_names

    # Run algorithm local step
    local_out = pearsonr_local(local_in=local_in)

    # Return the output data (should be the last command)
    local_out.transfer()
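# Hedged sketch (not part of the original module): the pairing logic above
# mirrors R's `cor`. With no Y, every unordered pair of X columns is
# correlated once (the upper triangle of the matrix); with a Y, the full
# X-by-Y cross product is used. The helper name `build_schemata` is
# hypothetical, and the empty-Y case is represented here by an empty list
# rather than [''] as in main().
def build_schemata(args_X, args_Y):
    schema_X, schema_Y = [], []
    if not args_Y:
        # All unordered pairs of X variables.
        for i in xrange(len(args_X)):
            for j in xrange(i + 1, len(args_X)):
                schema_X.append(args_X[i])
                schema_Y.append(args_X[j])
    else:
        # Full cross product: every X against every Y.
        for x in args_X:
            for y in args_Y:
                schema_X.append(x)
                schema_Y.append(y)
    return schema_X, schema_Y

# build_schemata(['a', 'b', 'c'], [])  ->  (['a', 'a', 'b'], ['b', 'c', 'c'])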
def __init__(self, args):
    if len(args) != 10:
        raise ExaremeError('Illegal number of arguments.')
    self.nn = args[0]
    self.sx = args[1]
    self.sy = args[2]
    self.sxx = args[3]
    self.sxy = args[4]
    self.syy = args[5]
    self.schema_X = args[6]
    self.schema_Y = args[7]
    self.correlmatr_row_names = args[8]
    self.correlmatr_col_names = args[9]
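# Hedged sketch (pearsonr_local itself is not shown here): the six statistics
# stored above (nn, sx, sy, sxx, sxy, syy) are the per-pair sufficient
# statistics a node would need to ship for a privacy-preserving Pearson r;
# being plain sums, they aggregate across nodes by addition. The helper name
# `local_sufficient_stats` is hypothetical, and NaN-free input is assumed.
import numpy as np

def local_sufficient_stats(X, Y):
    # X and Y are (n_rows, n_pairs) arrays whose columns line up per pair.
    nn = [X.shape[0]] * X.shape[1]   # observation count per pair
    sx = np.sum(X, axis=0)           # sum of x
    sy = np.sum(Y, axis=0)           # sum of y
    sxx = np.sum(X * X, axis=0)      # sum of x^2
    sxy = np.sum(X * Y, axis=0)      # sum of x*y
    syy = np.sum(Y * Y, axis=0)      # sum of y^2
    return nn, sx, sy, sxx, sxy, syy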
def logregr_global_init(global_in):
    n_obs, n_cols, y_val_dict, schema_X, schema_Y = global_in.get_data()
    if n_obs == 0:
        raise ExaremeError('The selected variables contain 0 datapoints.')

    # Init vars
    ll = -2 * n_obs * np.log(2)
    coeff = np.zeros(n_cols)
    iter = 0

    # Pack state and results
    global_state = StateData(n_obs=n_obs,
                             n_cols=n_cols,
                             ll=ll,
                             coeff=coeff,
                             iter=iter,
                             y_val_dict=y_val_dict,
                             schema_X=schema_X,
                             schema_Y=schema_Y)
    global_out = LogRegrIter_Glob2Loc_TD(coeff)
    return global_state, global_out
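# Hedged note (reasoning about the starting value above, not stated in the
# source): with all coefficients zero, a logistic model predicts p = 0.5 for
# every row, so the log-likelihood of n_obs observations is n_obs * log(0.5).
# The initial `ll` is exactly twice that, i.e. -2 * n_obs * log(2),
# presumably kept on the -2*log-likelihood (deviance) scale up to sign.
import numpy as np

n_obs = 100
p0 = 1.0 / (1.0 + np.exp(-0.0))   # sigmoid(X @ 0) == 0.5 everywhere
null_ll = n_obs * np.log(p0)      # log-likelihood at coeff == 0
assert np.isclose(2 * null_ll, -2 * n_obs * np.log(2))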
def pearsonr_global(global_in):
    """Global step in Pearson correlation coefficient.

    Local statistics, computed in the local step, are aggregated, and then the
    Pearson correlation coefficient `r`, the p-value `prob` and the lower and
    upper confidence intervals at 95% `ci_lo` and `ci_hi` are computed. The
    Pearson correlation is computed according to the standard formula (see
    https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#For_a_sample).
    The p-value is computed using the incomplete beta integral method. The
    lower and upper confidence intervals are computed using the Fisher
    transformation (see
    https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#Using_the_Fisher_transformation).

    Parameters
    ----------
    global_in : PearsonCorrelationLocalDT
        Object holding aggregated values of statistics computed in the local
        step.

    Returns
    -------
    global_out : str
        JSON string containing a list of results, one for each variable pair,
        where each result holds the variable pair names, the Pearson
        coefficient, the p-value and the lower and upper confidence intervals.
    """
    (nn, sx, sy, sxx, sxy, syy, schema_X, schema_Y, correlmatr_row_names,
     correlmatr_col_names) = global_in.get_data()
    n_cols = len(nn)
    schema_out = [None] * n_cols
    r = [0] * n_cols
    prob = [0] * n_cols
    ci_lo = [0] * n_cols
    ci_hi = [0] * n_cols
    for i in xrange(n_cols):
        schema_out[i] = schema_X[i] + ' ~ ' + schema_Y[i]
        # Compute Pearson correlation coefficient and p-value
        if nn[i] == 0:
            raise ExaremeError('The variables chosen do not contain any datapoints.')
        else:
            d = (math.sqrt(nn[i] * sxx[i] - sx[i] * sx[i]) *
                 math.sqrt(nn[i] * syy[i] - sy[i] * sy[i]))
            if d == 0:
                r[i] = 0
            else:
                r[i] = float((nn[i] * sxy[i] - sx[i] * sy[i]) / d)
            # If abs(r) > 1 correct it: artifact of floating point arithmetic.
            r[i] = max(min(r[i], 1.0), -1.0)
            df = nn[i] - 2
            if abs(r[i]) == 1.0:
                prob[i] = 0.0
            else:
                t_squared = r[i] ** 2 * (df / ((1.0 - r[i]) * (1.0 + r[i])))
                prob[i] = special.betainc(
                    0.5 * df, 0.5,
                    np.fmin(np.asarray(df / (df + t_squared)), 1.0))
        # Compute 95% confidence intervals
        alpha = 0.05  # Two-tail test with confidence intervals 95%
        if not math.isnan(r[i]):
            r_z = np.arctanh(r[i])
            se = 1 / np.sqrt(nn[i] - 3)
            z = st.norm.ppf(1 - alpha / 2)
            lo_z, hi_z = r_z - z * se, r_z + z * se
            ci_lo[i], ci_hi[i] = np.tanh((lo_z, hi_z))
        else:
            raise ValueError('Pearson coefficient is NaN.')

    # Format output data
    # JSON raw
    result_list = []
    for i in xrange(n_cols):
        result_list.append({
            'Variables': schema_out[i],
            'Pearson correlation coefficient': r[i],
            'p-value': prob[i] if prob[i] >= P_VALUE_CUTOFF else P_VALUE_CUTOFF_STR,
            'C.I. Lower': ci_lo[i],
            'C.I. Upper': ci_hi[i]
        })

    # Tabular summary
    tabular_data_summary = [[
        "variables", "Pearson correlation coefficient", "p-value",
        "lower c.i.", "upper c.i."
    ]]
    for i in xrange(n_cols):
        tabular_data_summary.append([
            schema_out[i], r[i],
            prob[i] if prob[i] >= P_VALUE_CUTOFF else P_VALUE_CUTOFF_STR,
            ci_lo[i], ci_hi[i]
        ])
    tabular_data_summary_schema_fields = [
        {"name": "variables", "type": "string"},
        {"name": "Pearson correlation coefficient", "type": "number"},
        {"name": "p-value", "type": "string"},
        {"name": "lower c.i.", "type": "number"},
        {"name": "upper c.i.", "type": "number"},
    ]

    # Highchart correlation matrix
    correlmatr_data = []
    for i, varx in enumerate(correlmatr_col_names):
        for j, vary in enumerate(correlmatr_row_names):
            if varx == vary:
                corr = 1.0
            else:
                if varx + ' ~ ' + vary in schema_out:
                    idx = schema_out.index(varx + ' ~ ' + vary)
                elif vary + ' ~ ' + varx in schema_out:
                    idx = schema_out.index(vary + ' ~ ' + varx)
                else:
                    raise ValueError('Variable names do not agree.')
                corr = r[idx]
            correlmatr_data.append({
                'x': i,
                'y': j,
                'value': round(corr, 4),
                'name': varx + ' ~ ' + vary
            })
    hichart_correl_matr = {
        'chart': {'type': 'heatmap', 'plotBorderWidth': 1},
        'title': {'text': 'Pearson Correlation Matrix'},
        'xAxis': {'categories': correlmatr_col_names},
        'yAxis': {'categories': correlmatr_row_names, 'title': 'null'},
        'colorAxis': {
            'stops': [[0, '#c4463a'], [0.5, '#ffffff'], [0.9, '#3060cf']],
            'min': -1,
            'max': 1,
            'minColor': '#FFFFFF',
            'maxColor': '#6699ff'
        },
        'legend': {
            'align': 'right',
            'layout': 'vertical',
            'margin': 0,
            'verticalAlign': 'top',
            'y': 25,
            'symbolHeight': 280
        },
        'tooltip': {
            'headerFormat': '',
            'pointFormat': '<b>{point.name}: {point.value}</b>',
            'enabled': True
        },
        'series': [{
            'name': 'coefficients',
            'borderWidth': 1,
            'data': correlmatr_data,
            'dataLabels': {'enabled': True, 'color': '#000000'}
        }]
    }

    # Write output to JSON
    result = {
        'result': [
            # Raw results
            {
                "type": "application/json",
                "data": result_list
            },
            # Tabular data resource summary
            {
                "type": "application/vnd.dataresource+json",
                "data": [{
                    "name": "Pearson correlation summary",
                    "profile": "tabular-data-resource",
                    "data": tabular_data_summary,
                    "schema": {
                        "fields": tabular_data_summary_schema_fields
                    }
                }]
            },
            # Highchart correlation matrix
            {
                "type": "application/vnd.highcharts+json",
                "data": hichart_correl_matr
            }
        ]
    }
    try:
        global_out = json.dumps(result, allow_nan=False)
    except ValueError:
        print('Result contains NaNs.')
    else:
        return global_out
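# Hedged sketch (standalone, sample data made up): the aggregate-only formulas
# used in pearsonr_global can be cross-checked against scipy.stats.pearsonr on
# a single machine, since both implement the same sample-r and incomplete-beta
# p-value; the Fisher 95% CI is recomputed from the same aggregates.
import numpy as np
from scipy import special
from scipy import stats as st

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
y = np.array([2.0, 1.0, 4.0, 3.0, 7.0, 5.0])
n = len(x)
sx, sy = x.sum(), y.sum()
sxx, sxy, syy = (x * x).sum(), (x * y).sum(), (y * y).sum()

# Same formulas as in pearsonr_global, from sums alone
d = np.sqrt(n * sxx - sx * sx) * np.sqrt(n * syy - sy * sy)
r = (n * sxy - sx * sy) / d
df = n - 2
t_squared = r ** 2 * (df / ((1.0 - r) * (1.0 + r)))
prob = special.betainc(0.5 * df, 0.5, df / (df + t_squared))

r_ref, p_ref = st.pearsonr(x, y)
assert np.isclose(r, r_ref) and np.isclose(prob, p_ref)

# Fisher-transformation 95% CI from the same aggregates
z = st.norm.ppf(0.975)
ci_lo, ci_hi = np.tanh(np.arctanh(r) + np.array([-z, z]) / np.sqrt(n - 3))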
def __init__(self, *args):
    if len(args) != 1:
        raise ExaremeError('Illegal number of arguments.')
    self.localstatistics = args[0]
def main():
    # Parse arguments
    parser = ArgumentParser()
    parser.add_argument('-x', required=True,
                        help='Variable names, comma separated.')
    parser.add_argument('-y', required=True,
                        help='Categorical variable names, comma separated.')
    parser.add_argument(
        '-bins', required=True,
        help='Dictionary of variable names (key) and number of bins (value)')
    parser.add_argument('-input_local_DB', required=True,
                        help='Path to local db.')
    parser.add_argument('-db_query', required=True,
                        help='Query to be executed on local db.')
    parser.add_argument(
        '-cur_state_pkl', required=True,
        help='Path to the pickle file holding the current state.')
    args, unknown = parser.parse_known_args()
    query = args.db_query
    fname_cur_state = path.abspath(args.cur_state_pkl)
    fname_loc_db = path.abspath(args.input_local_DB)
    if args.x == '':
        raise ExaremeError('Field x must be non empty.')

    # Get data
    if args.y == '':
        args_X = list(args.x.replace(' ', '').split(','))
        args_Y = []
        varNames = "'" + "','".join(args_X) + "'"
    else:
        args_X = list(args.x.replace(' ', '').split(','))
        args_Y = list(args.y.replace(' ', '').split(','))
        varNames = "'" + "','".join(args_X) + "','" + "','".join(args_Y) + "'"
    if args.bins == '':
        args_bins = {}
    else:
        args_bins = json.loads(args.bins)
        # args_bins = dict((str(key), val) for key, val in args_bins.items())
    queryMetadata = "select * from metadata where code in (" + varNames + ");"
    dataSchema, metadataSchema, metadata, dataFrame = query_database(
        fname_db=fname_loc_db, queryData=query, queryMetadata=queryMetadata)
    CategoricalVariablesWithDistinctValues = variable_categorical_getDistinctValues(
        metadata)

    # Check the bins input: every non-categorical variable needs a bin count
    for varx in args_X:
        if varx not in CategoricalVariablesWithDistinctValues:
            if varx not in args_bins:
                raise ExaremeError(
                    'Bin value is not defined for at least one '
                    'non-categorical variable, i.e. ' + varx)

    # Run algorithm local step
    localStatistics = run_local_step(args_X, args_Y, args_bins, dataSchema,
                                     CategoricalVariablesWithDistinctValues,
                                     dataFrame)

    # Save local state
    local_state = StateData(
        args_X=args_X,
        args_Y=args_Y,
        args_bins=args_bins,
        dataSchema=dataSchema,
        CategoricalVariablesWithDistinctValues=CategoricalVariablesWithDistinctValues,
        dataFrame=dataFrame)
    local_state.save(fname=fname_cur_state)

    # Transfer local output
    local_out = multipleHist1_Loc2Glob_TD(localStatistics)
    local_out.transfer()
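# Hedged usage sketch: the -bins flag is parsed with json.loads above, so it
# expects a JSON object mapping each non-categorical variable to a bin count.
# Everything below (script name, paths, variable names, query) is illustrative,
# not taken from the source.
#
#   python multiple_histograms_local.py \
#       -x 'lefthippocampus' \
#       -y 'gender' \
#       -bins '{"lefthippocampus": 20}' \
#       -input_local_DB /path/to/local.db \
#       -db_query 'select lefthippocampus, gender from data;' \
#       -cur_state_pkl /path/to/state.pkl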