def parcorr_z(field: xr.DataArray, ts: np.ndarray, z: pd.DataFrame, lag_z: int = 0):
    '''
    Regress out the influence of the 1-d timeseries z. If lag_z == 0, the
    dates of z will match the dates of field. Note that lag_z >= 1 probably
    only makes sense when using subseasonal data (more than 1 value per year).

    Parameters
    ----------
    field : xr.DataArray
        (time, lat, lon) field.
    ts : np.ndarray
        Target timeseries.
    z : pd.DataFrame
        1-d timeseries.

    Returns
    -------
    corr_vals : np.ndarray
    pvals : np.ndarray
    '''
    # if more than one year is filled with NaNs -> no corr value calculated.
    dates = pd.to_datetime(field.time.values)
    field, ts = check_NaNs(field, ts)
    x = np.ma.zeros(field.shape[1])
    corr_vals = np.array(x)
    pvals = np.array(x)

    fieldnans = np.array([np.isnan(field[:, i]).any() for i in range(x.size)])
    nonans_gc = np.arange(0, fieldnans.size)[fieldnans == False]

    # ts = np.expand_dims(ts[:], axis=1)
    # adjust to shape (samples, dimension) and remove first datapoints if
    # lag_z != 0.
    y = np.expand_dims(ts[lag_z:], axis=1)
    if len(z.values.squeeze().shape) == 1:
        z = np.expand_dims(z.loc[dates].values.squeeze(), axis=1)
    else:
        z = z.loc[dates].values.squeeze()
    if lag_z >= 1:
        z = z[:-lag_z]  # last values are 'removed'

    for i in nonans_gc:
        cond_ind_test = ParCorr()
        field_i = np.expand_dims(field[lag_z:, i], axis=1)
        a, b = cond_ind_test.run_test_raw(field_i, y, z)
        corr_vals[i] = a
        pvals[i] = b
    # restore original nans
    corr_vals[fieldnans] = np.nan
    return corr_vals, pvals
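# A minimal, self-contained sketch of the ParCorr().run_test_raw call that
# parcorr_z wraps, on synthetic data. All names below (rng, x, y, z) are
# illustrative and not from the original source; the import path assumes
# tigramite 4.x.
import numpy as np
from tigramite.independence_tests import ParCorr

rng = np.random.default_rng(0)
z = rng.standard_normal((500, 1))                # confounder
x = 0.7 * z + rng.standard_normal((500, 1))
y = 0.7 * z + rng.standard_normal((500, 1))
val, pval = ParCorr().run_test_raw(x, y, z=z)
print(val, pval)  # partial correlation of x and y given z should be ~0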
def test_mci(self):
    # Setting up strict test level
    pc_alpha = 0.05  # [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
    tau_max = 2
    alpha_level = 0.01

    dataframe = pp.DataFrame(self.data)
    cond_ind_test = ParCorr(verbosity=verbosity)

    pcmci = PCMCI(selected_variables=None,
                  dataframe=dataframe,
                  cond_ind_test=cond_ind_test,
                  verbosity=verbosity)

    results = pcmci.run_mci(
        selected_links=None,
        tau_min=1,
        tau_max=tau_max,
        parents=self.true_parents,
        max_conds_py=None,
        max_conds_px=None,
    )

    parents = pcmci._return_significant_parents(
        pq_matrix=results['p_matrix'],
        val_matrix=results['val_matrix'],
        alpha_level=alpha_level,
    )['parents']

    # print parents
    # print _get_parent_graph(true_parents)
    assert_graphs_equal(parents, self.true_parents)
def test_pc_stable_max_conds_dim(self):
    # Setting up strict test level
    pc_alpha = 0.05  # [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
    tau_max = 2
    alpha_level = 0.01

    # true_parents_here = {0: [],
    #                      1: [(1, -1), (0, -1)],
    #                      2: []
    #                      }

    dataframe = pp.DataFrame(self.data)
    cond_ind_test = ParCorr(verbosity=verbosity)

    pcmci = PCMCI(selected_variables=None,
                  dataframe=dataframe,
                  cond_ind_test=cond_ind_test,
                  verbosity=verbosity)

    pcmci.run_pc_stable(
        selected_links=None,
        tau_min=1,
        tau_max=tau_max,
        save_iterations=False,
        pc_alpha=pc_alpha,
        max_conds_dim=2,
        max_combinations=1,
    )

    parents = pcmci.all_parents
    # print parents
    # print _get_parent_graph(true_parents)
    assert_graphs_equal(parents, self.true_parents)
def time_lagged_correlation():
    """Runs the time-lagged correlation analysis experiment.

    This function calculates the time-lagged correlation between the
    variables for lags of up to 48 hours and plots the result as a
    scatterplot matrix.
    """
    var_names = [
        "dayOfYear", "minuteOfYear", "minuteOfDay", "dayOfWeek", "isWeekend",
        "humidity_sensor", "temperature", "precip_intensity", "cloud_cover",
        "p1", "p2", "dew_point", "wind_speed"
    ]
    tau_min = 0
    tau_max = 48

    dataframe, var_list = generate_dataframe(var_names)
    print(f"Variable names: {var_names}")
    ci_test = ParCorr(significance='analytic')

    pcmci = PCMCI(dataframe=dataframe, cond_ind_test=ci_test, verbosity=1)
    correlations = pcmci.get_lagged_dependencies(tau_min=tau_min,
                                                 tau_max=tau_max)
    lag_func_matrix = tp.plot_lagfuncs(
        name="experiments/causal_discovery/results/time_lagged_correlation.png",
        val_matrix=correlations,
        setup_args={
            'var_names': var_names,
            'figsize': (50, 25),
            'label_fontsize': 12,
            'label_space_top': 0.025,
            'label_space_left': 0.05,
            'lag_units': 'hours',
            'x_base': 6,
            'y_base': .5
        })
    print(lag_func_matrix)
def test_predictions(data_frame_a):
    # TODO NOTE: This doesn't actually test if the predictions make sense,
    # only that they work!
    # Get the data
    (dataframe, true_parents), links_coeffs = data_frame_a
    T, _ = dataframe.values.shape
    # Build the prediction
    a_cond_ind_test = ParCorr(significance='analytic', fixed_thres=0.01)
    pred = Prediction(dataframe=dataframe,
                      cond_ind_test=a_cond_ind_test,
                      prediction_model=sklearn.linear_model.LinearRegression(),
                      train_indices=range(int(0.8 * T)),
                      test_indices=range(int(0.8 * T), T),
                      verbosity=0)
    # Load some parameters
    tau_max = 3
    steps_ahead = 1
    target = 2
    # Get the predictors from pc_stable
    all_predictors = pred.get_predictors(selected_targets=[target],
                                         selected_links=None,
                                         steps_ahead=steps_ahead,
                                         tau_max=tau_max,
                                         pc_alpha=None,
                                         max_conds_dim=None,
                                         max_combinations=1)
    # Fit the predictors using the ML method
    pred.fit(target_predictors=all_predictors,
             selected_targets=[target],
             tau_max=tau_max,
             return_data=True)
    # Predict the values
    _ = pred.predict(target)
def test_pcmci(self):
    # Setting up strict test level
    pc_alpha = 0.05  # [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
    tau_max = 2
    alpha_level = 0.01

    dataframe = pp.DataFrame(self.data)
    cond_ind_test = ParCorr(verbosity=verbosity)

    pcmci = PCMCI(dataframe=dataframe,
                  cond_ind_test=cond_ind_test,
                  verbosity=verbosity)

    results = pcmci.run_pcmci(
        tau_max=tau_max,
        pc_alpha=pc_alpha,
    )

    parents = pcmci._return_significant_parents(
        pq_matrix=results['p_matrix'],
        val_matrix=results['val_matrix'],
        alpha_level=alpha_level)['parents']

    # print parents
    # print self.true_parents
    assert_graphs_equal(parents, self.true_parents)
def pcmci_setup(data):
    dataframe = pp.DataFrame(data.values, var_names=list(data.columns))
    parcorr = ParCorr(significance='analytic')
    pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr, verbosity=1)
    return pcmci
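# Hypothetical usage of pcmci_setup — a sketch; the column names, sample
# size, and alpha below are illustrative and not from the original source.
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
toy = pd.DataFrame(rng.standard_normal((500, 3)), columns=['x', 'y', 'z'])
pcmci = pcmci_setup(toy)
results = pcmci.run_pcmci(tau_max=2, pc_alpha=0.05)
print(results['p_matrix'].shape)  # (N, N, tau_max + 1)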
def a_pcmci(a_sample, request):
    # Unpack the test data and true parent graph
    dataframe, true_parents = a_sample
    # Build the PCMCI instance
    pcmci = PCMCI(dataframe=dataframe,
                  cond_ind_test=ParCorr(verbosity=VERBOSITY),
                  verbosity=VERBOSITY)
    # Return the constructed PCMCI, expected results, and common parameters
    return pcmci, true_parents
def setUp(self):
    auto = 0.6
    coeff = 0.6
    T = 1000
    numpy.random.seed(42)
    # True graph
    links_coeffs = {
        0: [((0, -1), auto)],
        1: [((1, -1), auto), ((0, -1), coeff)],
        2: [((2, -1), auto), ((1, -1), coeff)]
    }

    self.data, self.true_parents_coeffs = pp.var_process(links_coeffs, T=T)
    T, N = self.data.shape

    self.ci_par_corr = ParCorr(use_mask=False,
                               mask_type=None,
                               significance='analytic',
                               fixed_thres=None,
                               sig_samples=10000,
                               sig_blocklength=3,
                               confidence='analytic',
                               conf_lev=0.9,
                               conf_samples=10000,
                               conf_blocklength=1,
                               recycle_residuals=False,
                               verbosity=0)

    self.ci_gpdc = GPDC(significance='analytic',
                        sig_samples=1000,
                        sig_blocklength=1,
                        confidence='bootstrap',
                        conf_lev=0.9,
                        conf_samples=100,
                        conf_blocklength=None,
                        use_mask=False,
                        mask_type='y',
                        recycle_residuals=False,
                        verbosity=0)
def a_run_pcmciplus(a_pcmciplus, a_pcmciplus_params):
    # Unpack the pcmci and the true parents, and common parameters
    dataframe, true_graph, links_coeffs, tau_min, tau_max = a_pcmciplus
    # Unpack the parameters
    (pc_alpha,
     contemp_collider_rule,
     conflict_resolution,
     reset_lagged_links,
     cond_ind_test_class,
     ) = a_pcmciplus_params

    if cond_ind_test_class == 'oracle_ci':
        cond_ind_test = OracleCI(links_coeffs)
    elif cond_ind_test_class == 'par_corr':
        cond_ind_test = ParCorr()

    # Run the PCMCI algorithm with the given parameters
    pcmci = PCMCI(dataframe=dataframe,
                  cond_ind_test=cond_ind_test,
                  verbosity=2)
    results = pcmci.run_pcmciplus(
        selected_links=None,
        tau_min=tau_min,
        tau_max=tau_max,
        pc_alpha=pc_alpha,
        contemp_collider_rule=contemp_collider_rule,
        conflict_resolution=conflict_resolution,
        reset_lagged_links=reset_lagged_links,
        max_conds_dim=None,
        max_conds_py=None,
        max_conds_px=None,
    )
    # Print true links
    print("************************")
    print("\nTrue Graph")
    for lag in range(tau_max):
        print("Lag %d =" % lag)
        print(true_graph[:, :, lag])
    # pcmci.print_significant_links(
    #     p_matrix=(true_graph != ""),
    #     val_matrix=true_graph,
    #     conf_matrix=None,
    #     q_matrix=None,
    #     graph=true_graph,
    #     ambiguous_triples=None,
    #     alpha_level=0.05)
    # Return the results and the expected result
    return results['graph'], true_graph
def par_corr(request):
    # Unpack the parameters
    sig, recycle, conf = request.param
    # Generate the par_corr independence test
    return ParCorr(mask_type=None,
                   significance=sig,
                   fixed_thres=0.1,
                   sig_samples=10000,
                   sig_blocklength=3,
                   confidence=conf,
                   conf_lev=0.9,
                   conf_samples=10000,
                   conf_blocklength=1,
                   recycle_residuals=recycle,
                   verbosity=0)
def linear_causal_model():
    """Runs the linear causal model experiment mentioned in the report.

    This function creates causal linear models for a variety of different
    alphas and plots the results as network and timeseries graphs. The
    maximum lag is 24.
    """
    var_names = [
        "dayOfYear", "minuteOfYear", "minuteOfDay", "dayOfWeek", "isWeekend",
        "humidity_sensor", "temperature", "precip_intensity", "cloud_cover",
        "p1", "p2", "dew_point", "wind_speed"
    ]
    tau_min = 0
    tau_max = 24

    dataframe, var_list = generate_dataframe(var_names)
    print(f"Variable names: {var_names}")
    ci_test = ParCorr(significance='analytic')
    alphas = [3**-n for n in range(1, 9)]
    test_alphas(dataframe, ci_test, alphas, var_names,
                tau_min=tau_min, tau_max=tau_max)
def build_link_pcmci_noself(p_data_values, p_agent_names, p_var_sou, p_var_tar):
    """ build links by n column data """
    [times_num, agent_num] = p_data_values.shape
    # set the data for PCMCI
    data_frame = pp.DataFrame(p_data_values,
                              var_names=p_agent_names,
                              missing_flag=BaseConfig.BACKGROUND_VALUE)
    # new PCMCI
    pcmci = PCMCI(dataframe=data_frame, cond_ind_test=ParCorr())
    # run PCMCI
    alpha_level = 0.01
    results_pcmci = pcmci.run_pcmciplus(tau_min=0, tau_max=2,
                                        pc_alpha=alpha_level)
    # get the result
    graph_pcmci = results_pcmci['graph']
    q_matrix = results_pcmci['q_matrix']
    p_matrix = results_pcmci['p_matrix']
    val_matrix = results_pcmci['val_matrix']
    conf_matrix = results_pcmci['conf_matrix']
    ambiguous_triples = results_pcmci['ambiguous_triples']

    # filter these links
    links_df = pd.DataFrame(columns=('VarSou', 'VarTar', 'Source', 'Target',
                                     'TimeLag', 'Strength', 'Unoriented'))
    if graph_pcmci is not None:
        sig_links = (graph_pcmci != "") * (graph_pcmci != "<--")
    elif q_matrix is not None:
        sig_links = (q_matrix <= alpha_level)
    else:
        sig_links = (p_matrix <= alpha_level)
    for j in range(agent_num):
        links = {(p[0], -p[1]): np.abs(val_matrix[p[0], j, abs(p[1])])
                 for p in zip(*np.where(sig_links[:, j, :]))}
        # Sort by value
        sorted_links = sorted(links, key=links.get, reverse=True)
        for p in sorted_links:
            VarSou = p_var_sou
            VarTar = p_var_tar
            Source = p_agent_names[j]
            Target = p_agent_names[p[0]]
            TimeLag = p[1]
            Strength = val_matrix[p[0], j, abs(p[1])]
            Unoriented = None
            if graph_pcmci is not None:
                if p[1] == 0 and graph_pcmci[j, p[0], 0] == "o-o":
                    Unoriented = 1  # "unoriented link"
                elif graph_pcmci[p[0], j, abs(p[1])] == "x-x":
                    Unoriented = 1  # "unclear orientation due to conflict"
                else:
                    Unoriented = 0
            # pd.DataFrame.append was removed in pandas 2.0; pd.concat is
            # the equivalent way to add the row.
            links_df = pd.concat(
                [links_df,
                 pd.DataFrame({'VarSou': [VarSou],
                               'VarTar': [VarTar],
                               'Source': [Source],
                               'Target': [Target],
                               'TimeLag': [TimeLag],
                               'Strength': [Strength],
                               'Unoriented': [Unoriented]})],
                ignore_index=True)
    # remove the self correlation edges
    links_df = links_df.loc[links_df['Source'] != links_df['Target']]
    return links_df
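# Sketch of the graph-string filter used above, on a toy graph array. Per
# tigramite's convention an empty string means "no link" and "<--" means
# the link points the other way, which is why both are filtered out. The
# toy matrix below is illustrative, not from the original source.
import numpy as np

toy_graph = np.array([[['', '-->'], ['o-o', '']],
                      [['', ''], ['', '-->']]])  # shape (N, N, tau_max + 1)
sig_links = (toy_graph != "") * (toy_graph != "<--")
print(np.argwhere(sig_links))  # (i, j, tau) indices of retained links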
def run(self):
    data, _ = pp.var_process(self.links_coeffs, T=1000)
    dataframe = pp.DataFrame(data)
    cond_ind_test = ParCorr()
    self.pcmciobj = PCMCI(dataframe=dataframe, cond_ind_test=cond_ind_test)
    self.results = self.pcmciobj.run_pcmci(tau_max=2, pc_alpha=None)
def parcorr_map_time(field: xr.DataArray, ts: np.ndarray, lag_y=0, lag_x=0):
    '''
    Only works for subseasonal data (more than 1 datapoint per year).
    At least one of lag_y and lag_x must be >= 1.

    Parameters
    ----------
    field : xr.DataArray
        (time, lat, lon) field.
    ts : np.ndarray
        Target timeseries.
    lag_y : int or list of int, optional
        Lag(s) of the target timeseries to condition on. The default is 0.
    lag_x : int or list of int, optional
        Lag(s) of the field gridcell timeseries to condition on.
        The default is 0.

    Returns
    -------
    corr_vals : np.ndarray
    pvals : np.ndarray
    '''
    # field = precur_train.sel(time=dates_lag) ; ts = RV_ts.values.squeeze()
    if type(lag_y) is int:
        lag_y = [lag_y]
    if type(lag_x) is int:
        lag_x = [lag_x]
    max_lag = max(max(lag_y), max(lag_x))
    assert max_lag > 0, 'lag_x or lag_y must be >= 1'
    # if more than one year is filled with NaNs -> no corr value calculated.
    field, ts = check_NaNs(field, ts)
    x = np.ma.zeros(field.shape[1])
    corr_vals = np.array(x)
    pvals = np.array(x)

    fieldnans = np.array([np.isnan(field[:, i]).any() for i in range(x.size)])
    nonans_gc = np.arange(0, fieldnans.size)[fieldnans == False]

    if max(lag_y) > 0:
        zy = [np.expand_dims(ts[max_lag - l:-l], axis=1)
              for l in lag_y if l != 0]
        zy = np.concatenate(zy, axis=1)
    y = np.expand_dims(ts[max_lag:], axis=1)
    for i in nonans_gc:
        cond_ind_test = ParCorr()
        if max(lag_x) > 0:
            zx = [np.expand_dims(field[max_lag - l:-l, i], axis=1)
                  for l in lag_x if l != 0]
            zx = np.concatenate(zx, axis=1)
        if max(lag_x) > 0 and max(lag_y) > 0:
            # both zy and zx defined
            z = np.concatenate((zy, zx), axis=1)
        elif max(lag_x) > 0 and max(lag_y) == 0:
            # only zx defined
            z = zx
        elif max(lag_x) == 0 and max(lag_y) > 0:
            z = zy
        field_i = np.expand_dims(field[max_lag:, i], axis=1)
        a, b = cond_ind_test.run_test_raw(field_i, y, z)
        corr_vals[i] = a
        pvals[i] = b
    # restore original nans
    corr_vals[fieldnans] = np.nan
    return corr_vals, pvals
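# Toy illustration of the lagged-conditioning construction used above: for
# lag_y = [1, 2] the conditioning set zy stacks ts shifted by each lag,
# aligned with y = ts[max_lag:]. Values below are illustrative only.
import numpy as np

ts_toy = np.arange(10, dtype=float)
lag_y_toy, max_lag_toy = [1, 2], 2
zy_toy = np.concatenate([np.expand_dims(ts_toy[max_lag_toy - l:-l], axis=1)
                         for l in lag_y_toy if l != 0], axis=1)
y_toy = np.expand_dims(ts_toy[max_lag_toy:], axis=1)
print(zy_toy.shape, y_toy.shape)  # (8, 2) (8, 1): samples stay aligned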
data[:, 2] = nc_file.variables["WNPI"][:]
data[:, 3] = nc_file.variables["ENSOI"][:]
a = nc_file.variables["NAOIN"][:]
b = nc_file.variables["NAOIS"][:]
data[:, 4] = a - b

data_mask = np.zeros(data.shape)
for t in range(1, T + 1):
    if (t % 73) >= 12 and (t % 73) <= 30:
        data_mask[t - 1, :] = True

# Initialize dataframe object, specify time axis and variable names
var_names = ['WPSH', 'IO', 'WNP', 'ENSO', 'NAO']
dataframe = pp.DataFrame(data, mask=data_mask)

parcorr = ParCorr(significance='analytic', mask_type='xyz')
pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr)
results = pcmci.run_pcmci(tau_max=12, pc_alpha=0.03)

# Correct p-values
q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                       fdr_method='fdr_bh')

# Plotting
link_matrix = pcmci.return_significant_parents(
    pq_matrix=q_matrix,
    val_matrix=results['val_matrix'],
    alpha_level=0.03)['link_matrix']

tp.plot_graph(val_matrix=results['val_matrix'],
              link_matrix=link_matrix,
              var_names=var_names)
T, N = data.shape

# ======================================================================
# Initialize dataframe object (needed for tigramite functions)
# ======================================================================
dataframe = pp.DataFrame(data=data, mask=data_mask)
# Specify time axis and variable names
datatime = np.arange(len(data))

# ======================================================================
# pc algorithm: only parents for selected_variables are calculated
# (here entry[0] = PoV)
# ======================================================================
parcorr = ParCorr(significance='analytic',
                  use_mask=True,
                  mask_type='y',
                  verbosity=2)
pcmci = PCMCI(
    dataframe=dataframe,
    cond_ind_test=parcorr,
    var_names=var_names,
    selected_variables=None,  # [0], # only parents for the monsoon trough rainfall
    verbosity=2)
# ======================================================================
# results = pcmci.run_pcmci(tau_max=tau_max, pc_alpha=pc_alpha,
#                           tau_min=tau_min, max_combinations=1)
# ======================================================================
if p == 0:
    pc_alpha = pcA_set1a
def df_data_remove_z(df_data, z=[str, list], keys=None,
                     standardize: bool = True, plot: bool = True):
    '''
    Parameters
    ----------
    df_data : pd.DataFrame
        DataFrame containing timeseries.
    z : str or list, optional
        Variable(s) z whose influence will be removed from the columns
        in keys. The default is str.

    Returns
    -------
    df_new : pd.DataFrame (and the figure, if plot=True).
    '''
    method = ParCorr()
    if type(z) is str:
        z = [z]
    if keys is None:
        discard = ['TrainIsTrue', 'RV_mask'] + z
        keys = [k for k in df_data.columns if k not in discard]

    npstore = np.zeros(shape=(len(keys),
                              df_data.index.levels[0].size,
                              df_data.index.levels[1].size))
    for i, orig in enumerate(keys):
        orig = keys[i]
        # create fake X, Y format, needed for function _get_single_residuals
        dfxy = df_data[[orig]].merge(
            df_data[[orig]].copy().rename({orig: 'copy'}, axis=1),
            left_index=True, right_index=True).copy()
        # Append Z timeseries
        dfxyz = dfxy.merge(df_data[z], left_index=True, right_index=True)
        for s in df_data.index.levels[0]:
            dfxyz_s = dfxyz.loc[s].copy()
            if all(dfxyz_s[orig].isna().values):
                npstore[i, s, :] = dfxyz_s[orig].values  # fill in all nans
            else:
                npstore[i, s, :] = method._get_single_residuals(
                    np.moveaxis(dfxyz_s.values, 0, 1), 0,
                    standardize=standardize)

    df_new = pd.DataFrame(np.moveaxis(npstore, 0, 2).reshape(-1, len(keys)),
                          index=df_data.index, columns=keys)
    if plot:
        fig, axes = plt.subplots(len(keys), 1,
                                 figsize=(10, 2.5 * len(keys)),
                                 sharex=True)
        if len(keys) == 1:
            axes = [axes]
        for i, k in enumerate(keys):
            df_data[k].loc[0].plot(ax=axes[i], label=f'{k} original',
                                   legend=False, color='green',
                                   lw=1, alpha=.8)
            df_new[k].loc[0].plot(ax=axes[i], label=f'{z} regressed out',
                                  legend=False, color='blue', lw=1)
            axes[i].legend()
        out = (df_new, fig)
    else:
        out = (df_new)
    return out
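# Minimal sketch of the ParCorr()._get_single_residuals call used above,
# on synthetic data. Note this is a private tigramite method whose
# signature may differ across versions; all variable names below are
# illustrative, not from the original source.
import numpy as np
from tigramite.independence_tests import ParCorr

rng = np.random.default_rng(0)
z_toy = rng.standard_normal(200)
x_toy = 0.8 * z_toy + rng.standard_normal(200)
# rows follow the fake X, Y, Z layout built above: X, a copy of X, then Z
arr = np.stack([x_toy, x_toy.copy(), z_toy])
resid = ParCorr()._get_single_residuals(arr, 0, standardize=True)
print(resid.shape)  # (200,): x_toy with the influence of z_toy removed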
def calculate(para_setup):
    para_setup_string, sam = para_setup

    paras = para_setup_string.split('-')
    paras = [w.replace("'", "") for w in paras]

    model = str(paras[0])
    N = int(paras[1])
    n_links = int(paras[2])
    min_coeff = float(paras[3])
    coeff = float(paras[4])
    auto = float(paras[5])
    contemp_fraction = float(paras[6])
    frac_unobserved = float(paras[7])
    max_true_lag = int(paras[8])
    T = int(paras[9])

    ci_test = str(paras[10])
    method = str(paras[11])
    pc_alpha = float(paras[12])
    tau_max = int(paras[13])

    #############################################
    ##  Data
    #############################################
    def lin_f(x):
        return x

    def f2(x):
        return (x + 5. * x**2 * np.exp(-x**2 / 20.))

    if model == 'autobidirected':
        if verbosity > 999:
            model_seed = verbosity - 1000
        else:
            model_seed = sam
        random_state = np.random.RandomState(model_seed)

        links = {
            0: [((0, -1), auto, lin_f), ((1, -1), coeff, lin_f)],
            1: [],
            2: [((2, -1), auto, lin_f), ((1, -1), coeff, lin_f)],
            3: [((3, -1), auto, lin_f), ((2, -1), min_coeff, lin_f)],
        }
        observed_vars = [0, 2, 3]

        noises = [random_state.randn for j in range(len(links))]

        data_all, nonstationary = mod.generate_nonlinear_contemp_timeseries(
            links=links, T=T, noises=noises, random_state=random_state)
        data = data_all[:, observed_vars]

    elif 'random' in model:
        if 'lineargaussian' in model:
            coupling_funcs = [lin_f]
            noise_types = ['gaussian']  # , 'weibull', 'uniform']
            noise_sigma = (0.5, 2)
        elif 'nonlinearmixed' in model:
            coupling_funcs = [lin_f, f2]
            noise_types = ['gaussian', 'gaussian', 'weibull']
            noise_sigma = (0.5, 2)

        if coeff < min_coeff:
            min_coeff = coeff
        couplings = list(np.arange(min_coeff, coeff + 0.1, 0.1))
        couplings += [-c for c in couplings]

        auto_deps = list(np.arange(max(0., auto - 0.3), auto + 0.01, 0.05))

        # Models may be non-stationary. Hence, we iterate over a number of
        # seeds to find a stationary one regarding network topology,
        # noises, etc.
        if verbosity > 999:
            model_seed = verbosity - 1000
        else:
            model_seed = sam

        for ir in range(1000):
            # np.random.seed(model_seed)
            random_state = np.random.RandomState(model_seed)

            N_all = math.floor((N / (1. - frac_unobserved)))
            n_links_all = math.ceil(n_links / N * N_all)
            observed_vars = np.sort(random_state.choice(
                range(N_all),
                size=math.ceil((1. - frac_unobserved) * N_all),
                replace=False)).tolist()

            links = mod.generate_random_contemp_model(
                N=N_all,
                L=n_links_all,
                coupling_coeffs=couplings,
                coupling_funcs=coupling_funcs,
                auto_coeffs=auto_deps,
                tau_max=max_true_lag,
                contemp_fraction=contemp_fraction,
                # num_trials=1000,
                random_state=random_state)

            class noise_model:
                def __init__(self, sigma=1):
                    self.sigma = sigma

                def gaussian(self, T):
                    # Get zero-mean unit variance gaussian distribution
                    return self.sigma * random_state.randn(T)

                def weibull(self, T):
                    # Get zero-mean sigma variance weibull distribution
                    a = 2
                    mean = scipy.special.gamma(1. / a + 1)
                    variance = (scipy.special.gamma(2. / a + 1)
                                - scipy.special.gamma(1. / a + 1)**2)
                    return (self.sigma
                            * (random_state.weibull(a=a, size=T) - mean)
                            / np.sqrt(variance))

                def uniform(self, T):
                    # Get zero-mean sigma variance uniform distribution
                    mean = 0.5
                    variance = 1. / 12.
                    return (self.sigma
                            * (random_state.uniform(size=T) - mean)
                            / np.sqrt(variance))

            noises = []
            for j in links:
                noise_type = random_state.choice(noise_types)
                sigma = noise_sigma[0] + (
                    (noise_sigma[1] - noise_sigma[0]) * random_state.rand())
                noises.append(getattr(noise_model(sigma), noise_type))

            if 'discretebinom' in model:
                if 'binom2' in model:
                    n_binom = 2
                elif 'binom4' in model:
                    n_binom = 4
                data_all_check, nonstationary = discretized_scp(
                    links=links, T=T + 10000, n_binom=n_binom,
                    random_state=random_state)
            else:
                data_all_check, nonstationary = \
                    mod.generate_nonlinear_contemp_timeseries(
                        links=links, T=T + 10000, noises=noises,
                        random_state=random_state)

            # If the model is stationary, break the loop
            if not nonstationary:
                data_all = data_all_check[:T]
                data = data_all[:, observed_vars]
                break
            else:
                print("Trial %d: Not a stationary model" % ir)
                model_seed += 10000
    else:
        raise ValueError("model %s not known" % model)

    if nonstationary:
        raise ValueError("No stationary model found: %s" % model)

    true_graph = utilities._get_pag_from_dag(links,
                                             observed_vars=observed_vars,
                                             tau_max=tau_max,
                                             verbosity=verbosity)[1]

    if verbosity > 0:
        print("True Links")
        for j in links:
            print(j, links[j])
        print("observed_vars = ", observed_vars)
        print("True PAG")
        if tau_max > 0:
            for lag in range(tau_max + 1):
                print(true_graph[:, :, lag])
        else:
            print(true_graph.squeeze())

    if plot_data:
        print("PLOTTING")
        for j in range(N):
            # ax = fig.add_subplot(N,1,j+1)
            pyplot.plot(data[:, j])
        pyplot.show()

    computation_time_start = time.time()

    dataframe = pp.DataFrame(data)

    #############################################
    ##  Methods
    #############################################
    # Specify conditional independence test object
    if ci_test == 'par_corr':
        cond_ind_test = ParCorr(significance='analytic',
                                recycle_residuals=True)
    elif ci_test == 'cmi_knn':
        cond_ind_test = CMIknn(knn=0.1, sig_samples=500, sig_blocklength=1)
    elif ci_test == 'gp_dc':
        cond_ind_test = GPDC(recycle_residuals=True)
    elif ci_test == 'discg2':
        cond_ind_test = DiscG2()
    else:
        raise ValueError("CI test not recognized.")

    if 'lpcmci' in method:
        method_paras = method.split('_')
        n_preliminary_iterations = int(method_paras[1][7:])

        if 'prelimonly' in method:
            prelim_only = True
        else:
            prelim_only = False

        lpcmci = LPCMCI(dataframe=dataframe, cond_ind_test=cond_ind_test)
        lpcmcires = lpcmci.run_lpcmci(
            tau_max=tau_max,
            pc_alpha=pc_alpha,
            max_p_non_ancestral=3,
            n_preliminary_iterations=n_preliminary_iterations,
            prelim_only=prelim_only,
            verbosity=verbosity)
        graph = lpcmci.graph
        val_min = lpcmci.val_min_matrix
        max_cardinality = lpcmci.cardinality_matrix

    elif method == 'svarfci':
        svarfci = SVARFCI(dataframe=dataframe, cond_ind_test=cond_ind_test)
        svarfcires = svarfci.run_svarfci(
            tau_max=tau_max,
            pc_alpha=pc_alpha,
            max_cond_px=0,
            max_p_dsep=3,
            fix_all_edges_before_final_orientation=True,
            verbosity=verbosity)
        graph = svarfci.graph
        val_min = svarfci.val_min_matrix
        max_cardinality = svarfci.cardinality_matrix

    elif method == 'svarrfci':
        svarrfci = SVARRFCI(dataframe=dataframe, cond_ind_test=cond_ind_test)
        svarrfcires = svarrfci.run_svarrfci(
            tau_max=tau_max,
            pc_alpha=pc_alpha,
            fix_all_edges_before_final_orientation=True,
            verbosity=verbosity)
        graph = svarrfci.graph
        val_min = svarrfci.val_min_matrix
        max_cardinality = svarrfci.cardinality_matrix
    else:
        raise ValueError("%s not implemented." % method)

    computation_time_end = time.time()
    computation_time = computation_time_end - computation_time_start

    return {
        'true_graph': true_graph,
        'val_min': val_min,
        'max_cardinality': max_cardinality,

        # Method results
        'computation_time': computation_time,
        'graph': graph,
    }
def init_pcmci(df_data, significance='analytic', mask_type='y',
               selected_variables=None, verbosity=5):
    '''
    First initializing a pcmci object for each training set. This allows
    plotting lagged cross-correlations, which helps to identify a
    reasonable tau_max.

    Parameters
    ----------
    df_data : pandas DataFrame
        df_data is retrieved by running rg.get_ts_prec().
    significance : str, optional
        DESCRIPTION. The default is 'analytic'.
    mask_type : str, optional
        DESCRIPTION. The default is 'y'.
    verbosity : int, optional
        DESCRIPTION. The default is 5.
    selected_variables : list of integers, optional (default: None)
        Specify to estimate parents only for selected variables. If None is
        passed, parents are estimated for all variables.

    Returns
    -------
    dictionary of format {split:pcmci}.
    '''
    splits = df_data.index.levels[0]
    pcmci_dict = {}
    RV_mask = df_data['RV_mask']
    for s in range(splits.size):
        TrainIsTrue = df_data['TrainIsTrue'].loc[s]
        df_data_s = df_data.loc[s][TrainIsTrue == True]
        df_data_s = df_data_s.dropna(axis=1, how='all')
        if any(df_data_s.isna().values.flatten()):
            if verbosity > 0:
                print('Warning: nans detected')
                # print(np.unique(df_data_s.isna().values))
        var_names = [k for k in df_data_s.columns
                     if k not in ['TrainIsTrue', 'RV_mask']]
        df_data_s = df_data_s.loc[:, var_names]
        data = df_data_s.values
        data_mask = ~RV_mask.loc[s][TrainIsTrue == True].values
        # indices with mask == False are used (with mask_type 'y')
        data_mask = np.repeat(data_mask, data.shape[1]).reshape(data.shape)

        # create dataframe in Tigramite format
        dataframe = pp.DataFrame(data=data, mask=data_mask,
                                 var_names=var_names)

        parcorr = ParCorr(significance=significance,
                          mask_type=mask_type,
                          verbosity=0)
        parcorr.verbosity = verbosity  # to avoid print init text each time

        # ==================================================================
        # pc algorithm: only parents for selected_variables are calculated
        # ==================================================================
        pcmci = PCMCI(dataframe=dataframe,
                      cond_ind_test=parcorr,
                      verbosity=verbosity)
        pcmci_dict[s] = pcmci
    return pcmci_dict
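# Hypothetical follow-up on the returned {split: pcmci} dict — a sketch;
# it assumes df_data as described above and that, in this tigramite
# version, get_lagged_dependencies returns the lagged correlation
# val_matrix, as the other snippets here use it.
pcmci_dict = init_pcmci(df_data, significance='analytic', mask_type='y')
for s, pcmci_s in pcmci_dict.items():
    correlations = pcmci_s.get_lagged_dependencies(tau_max=5)
    print(s, correlations.shape)  # (N, N, tau_max + 1) per training split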
def run_PCMCI(ex, outdic_actors, s, df_splits, map_proj):
    # =================================================================
    #
    # 4) PCMCI-algorithm
    #
    # =================================================================

    # save output
    if ex['SaveTF'] == True:
        # from contextlib import redirect_stdout
        orig_stdout = sys.stdout
        # buffer print statement output to f
        if sys.version[:1] == '3':
            sys.stdout = f = io.StringIO()
        elif sys.version[:1] == '2':
            sys.stdout = f = open(os.path.join(ex['fig_subpath'], 'old.txt'),
                                  'w+')

    #%%
    # amount of text printed:
    verbosity = 3

    # alpha level for independence test within the pc procedure (finding
    # parents)
    pc_alpha = ex['pcA_sets'][ex['pcA_set']]
    # alpha level for multiple linear regression model while conditioning
    # on parents of parents
    alpha_level = ex['alpha_level_tig']
    print('run tigramite 4, run.pcmci')
    print(('alpha level(s) for independence tests within the pc procedure '
           '(finding parents): {}'.format(pc_alpha)))
    print(('alpha level for multiple linear regression model while '
           'conditioning on parents of parents: {}'.format(
               ex['alpha_level_tig'])))

    # Retrieve traintest info
    traintest = df_splits

    # load Response Variable class
    RV = ex[ex['RV_name']]
    # create list with all actors, these will be merged into the fulldata
    # array
    allvar = ex['vars'][0]
    var_names_corr = []
    actorlist = []
    cols = [[RV.name]]

    for var in allvar[:]:
        print(var)
        actor = outdic_actors[var]
        if actor.ts_corr[s].size != 0:
            ts_train = actor.ts_corr[s].values
            actorlist.append(ts_train)
            # create array which numbers the regions
            var_idx = allvar.index(var)
            n_regions = actor.ts_corr[s].shape[1]
            actor.var_info = [[i + 1, actor.ts_corr[s].columns[i], var_idx]
                              for i in range(n_regions)]
            # Array of corresponding regions with var_names_corr (first
            # entry is RV)
            var_names_corr = var_names_corr + actor.var_info
            cols.append(list(actor.ts_corr[s].columns))
            index_dates = actor.ts_corr[s].index
    var_names_corr.insert(0, RV.name)

    # stack actor time-series together:
    fulldata = np.concatenate(tuple(actorlist), axis=1)
    print(('There are {} regions in total'.format(fulldata.shape[1])))
    # add the full 1D time series of interest as first entry:
    fulldata = np.column_stack((RV.RVfullts, fulldata))
    df_data = pd.DataFrame(fulldata, columns=flatten(cols),
                           index=index_dates)

    if ex['import_prec_ts'] == True:
        var_names_full = var_names_corr.copy()
        for d in ex['precursor_ts']:
            path_data = d[1]
            if len(path_data) > 1:
                path_data = ''.join(list(path_data))
            # skip first col because it is the RV ts
            df_data_ext = func_fc.load_hdf5(
                path_data)['df_data'].iloc[:, 1:].loc[s]
            cols_ts = np.logical_or(df_data_ext.dtypes == 'float64',
                                    df_data_ext.dtypes == 'float32')
            cols_ext = list(df_data_ext.columns[cols_ts])
            # cols_ext must be of format '{}_{int}_{}'
            lab_int = 100
            for i, c in enumerate(cols_ext):
                char = c.split('_')[1]
                if char.isdigit():
                    pass
                else:
                    cols_ext[i] = c.replace(char, str(lab_int)) + char
                    lab_int += 1

            df_data_ext = df_data_ext[cols_ext]
            to_freq = ex['tfreq']
            if to_freq != 1:
                start_end_date = (ex['sstartdate'], ex['senddate'])
                start_end_year = (ex['startyear'], ex['endyear'])
                df_data_ext = functions_pp.time_mean_bins(df_data_ext,
                                                          to_freq,
                                                          start_end_date,
                                                          start_end_year,
                                                          seldays='part')[0]
                # df_data_ext = functions_pp.time_mean_bins(df_data_ext,
                #                                           ex, ex['tfreq'],
                #                                           seldays='part')[0]
            # Expand var_names_corr
            n = var_names_full[-1][0] + 1
            add_n = n + len(cols_ext)
            n_var_idx = var_names_full[-1][-1] + 1
            for i in range(n, add_n):
                var_names_full.append([i, cols_ext[i - n], n_var_idx])
            df_data = df_data.merge(df_data_ext,
                                    left_index=True, right_index=True)
    else:
        var_names_full = var_names_corr

    bool_train = traintest.loc[s]['TrainIsTrue']
    bool_RV_train = np.logical_and(bool_train, traintest.loc[s]['RV_mask'])
    dates_train = traintest.loc[s]['TrainIsTrue'][bool_train].index
    dates_RV_train = traintest.loc[s]['TrainIsTrue'][bool_RV_train].index

    RVfull_train = RV.RVfullts.sel(time=dates_train)
    datesfull_train = pd.to_datetime(RVfull_train.time.values)
    data = df_data.loc[datesfull_train].values
    print((data.shape))

    # get RV datamask (same shape as data)
    data_mask = [True if d in dates_RV_train else False
                 for d in datesfull_train]
    data_mask = np.repeat(data_mask, data.shape[1]).reshape(data.shape)

    # add traintest mask to fulldata
    # dates_all = pd.to_datetime(RV.RVfullts.index)
    # dates_RV = pd.to_datetime(RV.RV_ts.index)
    dates_all = pd.to_datetime(RV.RVfullts.time.values)
    dates_RV = pd.to_datetime(RV.RV_ts.time.values)
    df_data['TrainIsTrue'] = [True if d in datesfull_train else False
                              for d in dates_all]
    df_data['RV_mask'] = [True if d in dates_RV else False
                          for d in dates_all]

    # =================================================================
    # tigramite 3
    # =================================================================
    T, N = data.shape  # Time, Regions

    # =================================================================
    # Initialize dataframe object (needed for tigramite functions)
    # =================================================================
    dataframe = pp.DataFrame(data=data, mask=data_mask,
                             var_names=var_names_full)

    # =================================================================
    # pc algorithm: only parents for selected_variables are calculated
    # =================================================================
    parcorr = ParCorr(significance='analytic',
                      mask_type='y',
                      verbosity=verbosity)
    # =================================================================
    # multiple testing problem:
    # =================================================================
    pcmci = PCMCI(dataframe=dataframe,
                  cond_ind_test=parcorr,
                  selected_variables=None,
                  verbosity=4)
    # selected_variables : list of integers, optional (default: range(N))
    #     Specify to estimate parents only for selected variables. If None
    #     is passed, parents are estimated for all variables.

    # selected_links = dictionary/None
    results = pcmci.run_pcmci(tau_max=ex['tigr_tau_max'],
                              pc_alpha=pc_alpha,
                              tau_min=0,
                              max_combinations=ex['max_comb_actors'])

    q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                           fdr_method='fdr_bh')

    pcmci.print_significant_links(p_matrix=results['p_matrix'],
                                  q_matrix=q_matrix,
                                  val_matrix=results['val_matrix'],
                                  alpha_level=alpha_level)

    # returns all parents, not just causal precursors (of lag>0)
    sig = rgcpd.return_sign_parents(pcmci,
                                    pq_matrix=q_matrix,
                                    val_matrix=results['val_matrix'],
                                    alpha_level=alpha_level)

    all_parents = sig['parents']
    # link_matrix = sig['link_matrix']

    links_RV = all_parents[0]

    df = rgcpd.bookkeeping_precursors(links_RV, var_names_full)
    #%%
    rgcpd.print_particular_region_new(links_RV, var_names_corr, s,
                                      outdic_actors, map_proj, ex)

    #%%
    if ex['SaveTF'] == True:
        if sys.version[:1] == '3':
            fname = f's{s}_' + ex['params'] + '.txt'
            file = io.open(os.path.join(ex['fig_subpath'], fname), mode='w+')
            file.write(f.getvalue())
            file.close()
            f.close()
        elif sys.version[:1] == '2':
            f.close()
        sys.stdout = orig_stdout

    return df, df_data
def a_test(request):
    return ParCorr(verbosity=VERBOSITY)
def test_order_independence_pcmciplus(a_pcmciplus_order_independence,
                                      a_pcmciplus_params_order_independence):
    # Unpack the pcmci and the true parents, and common parameters
    dataframe, true_graph, links_coeffs, tau_min, tau_max = \
        a_pcmciplus_order_independence
    data = dataframe.values
    T, N = data.shape

    # Unpack the parameters
    (pc_alpha,
     contemp_collider_rule,
     conflict_resolution,
     reset_lagged_links,
     cond_ind_test_class,
     ) = a_pcmciplus_params_order_independence

    if cond_ind_test_class == 'oracle_ci':
        cond_ind_test = OracleCI(links_coeffs)
    elif cond_ind_test_class == 'par_corr':
        cond_ind_test = ParCorr()

    # Run the PCMCI algorithm with the given parameters
    pcmci = PCMCI(dataframe=dataframe,
                  cond_ind_test=cond_ind_test,
                  verbosity=1)
    print("************************")
    print("\nTrue Graph")
    pcmci.print_significant_links(p_matrix=(true_graph == 0),
                                  val_matrix=true_graph,
                                  conf_matrix=None,
                                  q_matrix=None,
                                  graph=true_graph,
                                  ambiguous_triples=None,
                                  alpha_level=0.05)

    results = pcmci.run_pcmciplus(
        selected_links=None,
        tau_min=tau_min,
        tau_max=tau_max,
        pc_alpha=pc_alpha,
        contemp_collider_rule=contemp_collider_rule,
        conflict_resolution=conflict_resolution,
        reset_lagged_links=reset_lagged_links,
        max_conds_dim=None,
        max_conds_py=None,
        max_conds_px=None,
    )
    correct_results = results['graph']

    for perm in itertools.permutations(range(N)):
        print(perm)
        data_new = np.copy(data[:, perm])
        dataframe = pp.DataFrame(data_new, var_names=list(perm))
        pcmci = PCMCI(dataframe=dataframe,
                      cond_ind_test=cond_ind_test,
                      verbosity=1)
        results = pcmci.run_pcmciplus(
            selected_links=None,
            tau_min=tau_min,
            tau_max=tau_max,
            pc_alpha=pc_alpha,
            contemp_collider_rule=contemp_collider_rule,
            conflict_resolution=conflict_resolution,
            reset_lagged_links=reset_lagged_links,
            max_conds_dim=None,
            max_conds_py=None,
            max_conds_px=None,
        )

        tmp = np.take(correct_results, perm, axis=0)
        back_converted_result = np.take(tmp, perm, axis=1)

        for tau in range(tau_max + 1):
            if not np.allclose(results['graph'][:, :, tau],
                               back_converted_result[:, :, tau]):
                print(tau)
                print(results['graph'][:, :, tau])
                print(back_converted_result[:, :, tau])
                print(back_converted_result[:, :, tau]
                      - results['graph'][:, :, tau])
                print(perm)

        # np.allclose(results['graph'], back_converted_result)
        np.testing.assert_equal(results['graph'], back_converted_result)
        'tau_min': 0,
        # Maximum time lag
        'tau_max': 10,
        # Maximum number of parents of X to condition on in MCI step, leave
        # this to None to condition on all estimated parents.
        'max_conds_px': None,
        # Selected links may be used to restrict estimation to given links.
        'selected_links': None,
        # Alpha level for MCI tests (just used for printing since all
        # p-values are stored anyway)
        'alpha_level': 0.05,
    }
}

# Chosen conditional independence test
cond_ind_test = ParCorr(verbosity=verbosity, **resdict['CI_params'])
# significance='analytic',
# use_mask=True,
# mask_type=['y'],
# recycle_residuals=True,
# verbosity=verbosity)

# Store results in file
if os.path.expanduser('~') == '/home/rung_ja':
    file_name = '/home/rung_ja/Zwischenergebnisse/causal_robustness_cmip/results_%s_comps-%d_months-%s_%s_%s_%s.bin' % (
        model, n_comps, months, method_arg, period_length, ip)
elif os.path.expanduser('~') == '/home/peer':
    file_name = '/home/peer/Documents/analysis_many_members/pcmci/results/results_%s_comps-%d_months-%s_%s_%s_%s.bin' % (
        model, n_comps, months, method_arg, period_length, ip)
elif os.path.expanduser('~') == '/home/pjn':
    file_name = '/home/pjn/Documents/Imperial/new_coll_Jakob/analysis_many_members/pcmci/results/results_%s_comps-%d_months-%s_%s_%s_%s.bin' % (
class TestCondInd():  # unittest.TestCase):
    # def __init__(self):
    #     pass

    def setUp(self):
        auto = 0.6
        coeff = 0.6
        T = 1000
        numpy.random.seed(42)
        # True graph
        links_coeffs = {
            0: [((0, -1), auto)],
            1: [((1, -1), auto), ((0, -1), coeff)],
            2: [((2, -1), auto), ((1, -1), coeff)]
        }

        self.data, self.true_parents_coeffs = pp.var_process(links_coeffs,
                                                             T=T)
        T, N = self.data.shape

        self.ci_par_corr = ParCorr(use_mask=False,
                                   mask_type=None,
                                   significance='analytic',
                                   fixed_thres=None,
                                   sig_samples=10000,
                                   sig_blocklength=3,
                                   confidence='analytic',
                                   conf_lev=0.9,
                                   conf_samples=10000,
                                   conf_blocklength=1,
                                   recycle_residuals=False,
                                   verbosity=0)

        self.ci_gpdc = GPDC(significance='analytic',
                            sig_samples=1000,
                            sig_blocklength=1,
                            confidence='bootstrap',
                            conf_lev=0.9,
                            conf_samples=100,
                            conf_blocklength=None,
                            use_mask=False,
                            mask_type='y',
                            recycle_residuals=False,
                            verbosity=0)

    def test_construct_array(self):
        data = numpy.array([[0, 10, 20, 30],
                            [1, 11, 21, 31],
                            [2, 12, 22, 32],
                            [3, 13, 23, 33],
                            [4, 14, 24, 34],
                            [5, 15, 25, 35],
                            [6, 16, 26, 36]])
        data_mask = numpy.array([[0, 1, 1, 0],
                                 [0, 0, 0, 0],
                                 [1, 0, 0, 0],
                                 [0, 0, 1, 1],
                                 [0, 0, 0, 0],
                                 [0, 0, 0, 0],
                                 [0, 0, 0, 0]], dtype='bool')

        X = [(1, -1)]
        Y = [(0, 0)]
        Z = [(0, -1), (1, -2), (2, 0)]

        tau_max = 2

        # No masking
        res = _construct_array(X=X, Y=Y, Z=Z,
                               tau_max=tau_max,
                               use_mask=False,
                               data=data,
                               mask=data_mask,
                               missing_flag=None,
                               mask_type=None,
                               verbosity=verbosity)
        print(res[0])
        numpy.testing.assert_almost_equal(
            res[0],
            numpy.array([[13, 14, 15],
                         [4, 5, 6],
                         [3, 4, 5],
                         [12, 13, 14],
                         [24, 25, 26]]))
        numpy.testing.assert_almost_equal(res[1],
                                          numpy.array([0, 1, 2, 2, 2]))

        # masking y
        res = _construct_array(X=X, Y=Y, Z=Z,
                               tau_max=tau_max,
                               use_mask=True,
                               data=data,
                               mask=data_mask,
                               mask_type=['y'],
                               verbosity=verbosity)
        print(res[0])
        numpy.testing.assert_almost_equal(
            res[0],
            numpy.array([[13, 14, 15],
                         [4, 5, 6],
                         [3, 4, 5],
                         [12, 13, 14],
                         [24, 25, 26]]))
        numpy.testing.assert_almost_equal(res[1],
                                          numpy.array([0, 1, 2, 2, 2]))

        # masking all
        res = _construct_array(X=X, Y=Y, Z=Z,
                               tau_max=tau_max,
                               use_mask=True,
                               data=data,
                               mask=data_mask,
                               mask_type=['x', 'y', 'z'],
                               verbosity=verbosity)
        print(res[0])
        numpy.testing.assert_almost_equal(
            res[0],
            numpy.array([[13, 14, 15],
                         [4, 5, 6],
                         [3, 4, 5],
                         [12, 13, 14],
                         [24, 25, 26]]))
        numpy.testing.assert_almost_equal(res[1],
                                          numpy.array([0, 1, 2, 2, 2]))

    def test_missing_values(self):
        data = numpy.array([[0, 10, 20, 30],
                            [1, 11, 21, 31],
                            [2, 12, 22, 32],
                            [3, 13, 999, 33],
                            [4, 14, 24, 34],
                            [5, 15, 25, 35],
                            [6, 16, 26, 36], ])
        data_mask = numpy.array([[0, 0, 0, 0],
                                 [0, 0, 0, 0],
                                 [0, 0, 0, 0],
                                 [0, 0, 0, 0],
                                 [0, 0, 0, 0],
                                 [0, 0, 0, 0],
                                 [0, 0, 0, 0]], dtype='bool')

        X = [(1, -2)]
        Y = [(0, 0)]
        Z = [(2, -1)]

        tau_max = 1

        # Missing values
        res = _construct_array(X=X, Y=Y, Z=Z,
                               tau_max=tau_max,
                               use_mask=False,
                               data=data,
                               mask=data_mask,
                               missing_flag=999,
                               mask_type=['y'],
                               verbosity=verbosity)
        # print(res[0])
        numpy.testing.assert_almost_equal(
            res[0],
            numpy.array([[10, 14],
                         [2, 6],
                         [21, 25]]))

    def test_bootstrap_vs_analytic_confidence_parcorr(self):
        cov = numpy.array([[1., 0.3],
                           [0.3, 1.]])
        array = numpy.random.multivariate_normal(mean=numpy.zeros(2),
                                                 cov=cov, size=150).T

        val = numpy.corrcoef(array)[0, 1]
        # print(val)
        dim, T = array.shape
        xyz = numpy.array([0, 1])

        conf_ana = self.ci_par_corr.get_analytic_confidence(
            df=T - dim, value=val, conf_lev=self.ci_par_corr.conf_lev)

        conf_boots = self.ci_par_corr.get_bootstrap_confidence(
            array, xyz,
            dependence_measure=self.ci_par_corr.get_dependence_measure,
            conf_samples=self.ci_par_corr.conf_samples,
            conf_blocklength=self.ci_par_corr.conf_blocklength,
            conf_lev=self.ci_par_corr.conf_lev,
        )
        print(conf_ana)
        print(conf_boots)
        numpy.testing.assert_allclose(numpy.array(conf_ana),
                                      numpy.array(conf_boots),
                                      atol=0.01)

    def test_shuffle_vs_analytic_significance_parcorr(self):
        cov = numpy.array([[1., 0.04],
                           [0.04, 1.]])
        array = numpy.random.multivariate_normal(mean=numpy.zeros(2),
                                                 cov=cov, size=250).T
        # array = numpy.random.randn(3, 10)
        val = numpy.corrcoef(array)[0, 1]
        # print(val)
        dim, T = array.shape
        xyz = numpy.array([0, 1])

        pval_ana = self.ci_par_corr.get_analytic_significance(value=val,
                                                              T=T, dim=dim)
        pval_shuffle = self.ci_par_corr.get_shuffle_significance(
            array, xyz, val)
        # Adjust p-value for two-sided measures
        print(pval_ana)
        print(pval_shuffle)
        numpy.testing.assert_allclose(numpy.array(pval_ana),
                                      numpy.array(pval_shuffle),
                                      atol=0.01)

    def test__parcorr_get_single_residuals(self):
        target_var = 0  # numpy.array([True, False, False, False])

        true_residual = numpy.random.randn(4, 1000)
        array = numpy.copy(true_residual)
        array[0] += 0.5 * array[2:].sum(axis=0)

        est_residual = self.ci_par_corr._get_single_residuals(
            array, target_var, standardize=False, return_means=False)
        # print(est_residual[:10])
        # print(true_residual[0, :10])
        numpy.testing.assert_allclose(est_residual, true_residual[0],
                                      atol=0.01)

    def test_par_corr(self):
        val_ana = 0.6
        T = 1000
        array = numpy.random.randn(5, T)
        cov = numpy.array([[1., val_ana],
                           [val_ana, 1.]])
        array[:2, :] = numpy.random.multivariate_normal(
            mean=numpy.zeros(2), cov=cov, size=T).T

        # Generate some confounding
        array[0] += 0.5 * array[2:].sum(axis=0)
        array[1] += 0.7 * array[2:].sum(axis=0)
        # print(numpy.corrcoef(array)[0, 1])
        # print(val)
        dim, T = array.shape
        xyz = numpy.array([0, 1, 2, 2, 2])

        val_est = self.ci_par_corr.get_dependence_measure(array, xyz)

        print(val_est)
        print(val_ana)
        numpy.testing.assert_allclose(numpy.array(val_ana),
                                      numpy.array(val_est),
                                      atol=0.02)

    def test__gpdc_get_single_residuals(self):
        ci_test = self.ci_gpdc
        # ci_test = self.ci_par_corr

        c = .3
        T = 1000
        numpy.random.seed(42)

        def func(x):
            return x * (1. - 4. * x**0 * numpy.exp(-x**2 / 2.))

        array = numpy.random.randn(3, T)
        array[1] += c * func(array[2])  # .sum(axis=0)
        xyz = numpy.array([0, 1] + [2 for i in range(array.shape[0] - 2)])

        target_var = 1

        dim, T = array.shape
        # array -= array.mean(axis=1).reshape(dim, 1)
        c_std = c  # /array[1].std()
        # array /= array.std(axis=1).reshape(dim, 1)
        array_orig = numpy.copy(array)

        (est_residual, pred) = ci_test._get_single_residuals(
            array, target_var, standardize=False, return_means=True)

        # Testing that in the center the fit is good
        center = numpy.where(numpy.abs(array_orig[2]) < .7)[0]
        print(pred[center][:10].round(2))
        print((c_std * func(array_orig[2][center])[:10]).round(2))
        numpy.testing.assert_allclose(pred[center],
                                      c_std * func(array_orig[2][center]),
                                      atol=0.2)

    def plot__gpdc_get_single_residuals(self):
        #######
        ci_test = self.ci_gpdc
        # ci_test = self.ci_par_corr

        a = 0.
        c = .3
        T = 500
        # Each key refers to a variable and the incoming links are supplied
        # as a list of format [((driver, lag), coeff), ...]
        links_coeffs = {
            0: [((0, -1), a)],
            1: [((1, -1), a), ((0, -1), c)],
        }

        numpy.random.seed(42)
        data, true_parents_neighbors = pp.var_process(links_coeffs,
                                                      use='inv_inno_cov',
                                                      T=T)
        dataframe = pp.DataFrame(data)
        ci_test.set_dataframe(dataframe)
        # ci_test.set_tau_max(1)

        # X=[(1, -1)]
        # Y=[(1, 0)]
        # Z=[(0, -1)] + [(1, -tau) for tau in range(1, 2)]
        # array, xyz, XYZ = ci_test.get_array(X, Y, Z,
        #                                     verbosity=0)]
        # ci_test.run_test(X, Y, Z,)

        def func(x):
            return x * (1. - 4. * x**0 * numpy.exp(-x**2 / 2.))

        true_residual = numpy.random.randn(3, T)
        array = numpy.copy(true_residual)
        array[1] += c * func(array[2])  # .sum(axis=0)
        xyz = numpy.array([0, 1] + [2 for i in range(array.shape[0] - 2)])
        print('xyz ', xyz, numpy.where(xyz == 1))
        target_var = 1

        dim, T = array.shape
        # array -= array.mean(axis=1).reshape(dim, 1)
        c_std = c  # /array[1].std()
        # array /= array.std(axis=1).reshape(dim, 1)
        array_orig = numpy.copy(array)

        import matplotlib
        from matplotlib import pyplot

        (est_residual, pred) = ci_test._get_single_residuals(
            array, target_var, standardize=False, return_means=True)
        (resid_, pred_parcorr) = self.ci_par_corr._get_single_residuals(
            array, target_var, standardize=False, return_means=True)

        fig = pyplot.figure()
        ax = fig.add_subplot(111)
        ax.scatter(array_orig[2], array_orig[1])
        ax.scatter(array_orig[2], pred, color='red')
        ax.scatter(array_orig[2], pred_parcorr, color='green')
        ax.plot(numpy.sort(array_orig[2]),
                c_std * func(numpy.sort(array_orig[2])), color='black')

        pyplot.savefig('/home/jakobrunge/test/gpdctest.pdf')

    def test_shuffle_vs_analytic_significance_gpdc(self):
        cov = numpy.array([[1., 0.2],
                           [0.2, 1.]])
        array = numpy.random.multivariate_normal(mean=numpy.zeros(2),
                                                 cov=cov, size=245).T

        dim, T = array.shape
        xyz = numpy.array([0, 1])

        val = self.ci_gpdc.get_dependence_measure(array, xyz)
        pval_ana = self.ci_gpdc.get_analytic_significance(value=val,
                                                          T=T, dim=dim)
        pval_shuffle = self.ci_gpdc.get_shuffle_significance(array, xyz, val)
        print(pval_ana)
        print(pval_shuffle)
        numpy.testing.assert_allclose(numpy.array(pval_ana),
                                      numpy.array(pval_shuffle),
                                      atol=0.05)

    # Variant with near-zero dependence; renamed so it does not shadow the
    # identically named test above.
    def test_shuffle_vs_analytic_significance_gpdc_weak(self):
        cov = numpy.array([[1., 0.01],
                           [0.01, 1.]])
        array = numpy.random.multivariate_normal(mean=numpy.zeros(2),
                                                 cov=cov, size=300).T

        dim, T = array.shape
        xyz = numpy.array([0, 1])

        val = self.ci_gpdc.get_dependence_measure(array, xyz)
        pval_ana = self.ci_gpdc.get_analytic_significance(value=val,
                                                          T=T, dim=dim)
        pval_shuffle = self.ci_gpdc.get_shuffle_significance(array, xyz, val)
        print(pval_ana)
        print(pval_shuffle)
        numpy.testing.assert_allclose(numpy.array(pval_ana),
                                      numpy.array(pval_shuffle),
                                      atol=0.05)

    def test_cmi_knn(self):
        ci_cmi_knn = CMIknn(use_mask=False,
                            mask_type=None,
                            significance='shuffle_test',
                            fixed_thres=None,
                            sig_samples=10000,
                            sig_blocklength=3,
                            knn=10,
                            confidence='bootstrap',
                            conf_lev=0.9,
                            conf_samples=10000,
                            conf_blocklength=1,
                            verbosity=0)

        # ci_cmi_knn._trafo2uniform(self, x)

        val_ana = 0.6
        T = 10000
        numpy.random.seed(42)
        array = numpy.random.randn(5, T)
        cov = numpy.array([[1., val_ana],
                           [val_ana, 1.]])
        array[:2, :] = numpy.random.multivariate_normal(
            mean=numpy.zeros(2), cov=cov, size=T).T

        # Generate some confounding
        if len(array) > 2:
            array[0] += 0.5 * array[2:].sum(axis=0)
            array[1] += 0.7 * array[2:].sum(axis=0)
        # print(numpy.corrcoef(array)[0, 1])
        # print(val)
        dim, T = array.shape
        xyz = numpy.array([0, 1, 2, 2, 2])

        val_est = ci_cmi_knn.get_dependence_measure(array, xyz)

        print(val_est)
        print(_par_corr_to_cmi(val_ana))
        numpy.testing.assert_allclose(numpy.array(_par_corr_to_cmi(val_ana)),
                                      numpy.array(val_est),
                                      atol=0.02)

    def test_trafo2uniform(self):
        T = 1000
        # numpy.random.seed(None)
        array = numpy.random.randn(2, T)

        bins = 10
        uniform = self.ci_gpdc._trafo2uniform(array)
        # print(uniform)

        # import matplotlib
        # from matplotlib import pylab
        for i in range(array.shape[0]):
            print(uniform[i].shape)
            hist, edges = numpy.histogram(uniform[i], bins=bins,
                                          density=True)
            # pylab.figure()
            # pylab.hist(uniform[i], color='grey', alpha=0.3)
            # pylab.hist(array[i], alpha=0.3)
            # pylab.show()
            print(hist / float(bins))  # , edges
            numpy.testing.assert_allclose(numpy.ones(bins) / float(bins),
                                          hist / float(bins),
                                          atol=0.01)

    def test_cmi_symb(self):
        ci_cmi_symb = CMIsymb(use_mask=False,
                              mask_type=None,
                              significance='shuffle_test',
                              fixed_thres=None,
                              sig_samples=10000,
                              sig_blocklength=3,
                              confidence='bootstrap',
                              conf_lev=0.9,
                              conf_samples=10000,
                              conf_blocklength=1,
                              verbosity=0)

        val_ana = 0.6
        T = 100000
        numpy.random.seed(None)
        array = numpy.random.randn(3, T)
        cov = numpy.array([[1., val_ana],
                           [val_ana, 1.]])
        array[:2, :] = numpy.random.multivariate_normal(
            mean=numpy.zeros(2), cov=cov, size=T).T

        # Generate some confounding
        if len(array) > 2:
            array[0] += 0.5 * array[2:].sum(axis=0)
            array[1] += 0.7 * array[2:].sum(axis=0)

        # Transform to symbolic data
        array = pp.quantile_bin_array(array.T, bins=16).T

        dim, T = array.shape
        xyz = numpy.array([0, 1, 2, 2, 2])

        val_est = ci_cmi_symb.get_dependence_measure(array, xyz)

        print(val_est)
        print(_par_corr_to_cmi(val_ana))
        numpy.testing.assert_allclose(numpy.array(_par_corr_to_cmi(val_ana)),
                                      numpy.array(val_est),
                                      atol=0.02)
# Maximum number of parents of X to condition on in MCI step, leave this
# to None to condition on all estimated parents.
max_conds_px = None

# Selected links may be used to restrict estimation to given links.
selected_links = None

# Alpha level for MCI tests (just used for printing since all p-values are
# stored anyway)
alpha_level = 0.05

# Verbosity level. Note that slaves will output on top of each other.
verbosity = 0

# Chosen conditional independence test
cond_ind_test = ParCorr()  # confidence='analytic')

# Store results in file
file_name = os.path.expanduser('~') + '/test/test_results.dat'

#
#  Start of the script
#
if COMM.rank == 0:
    # Only the master node (rank=0) runs this
    if verbosity > -1:
        print("\n##\n## Running Parallelized Tigramite PC algorithm\n##"
              "\n\nParameters:")
        print("\nindependence test = %s" % cond_ind_test.measure
              + "\ntau_min = %d" % tau_min
def test(dataframes, max_lags=[4], alpha=[None], tests=['ParCorr'], limit=1):
    '''
    This function performs the PCMCI algorithm for all the dataframes
    received as parameters, given the hyper-parameters of the conditional
    independence test.

    Args:
        dataframes: A list of TIGRAMITE dataframes
        max_lags: Maximum number of lags to consider for the lagged time
            series
        alpha: Significance level to perform the parent test
        tests: A list of conditional independence tests to be performed
        limit: A limit for the instances to be considered

    Returns:
    '''
    test_results = []
    random.shuffle(dataframes)
    total = limit * len(max_lags) * len(alpha) * len(tests)
    data_frame_iter = iter(dataframes)

    tests_to_evaluate = []
    if 'RCOT' in tests:
        rcot = RCOT()
        tests_to_evaluate.append(['RCOT', rcot])
    if 'GPDC' in tests:
        gpdc = GPDC()
        tests_to_evaluate.append(['GPDC', gpdc])
    if 'ParCorr' in tests:
        parcorr = ParCorr(significance='analytic')
        tests_to_evaluate.append(['ParCorr', parcorr])
    if 'CMIknn' in tests:
        cmiknn = CMIknn()
        tests_to_evaluate.append(['CMIknn', cmiknn])

    unique_complexities = list(set(l[1] for l in dataframes))
    counts = {}
    for i in unique_complexities:
        counts[i] = 0

    for test in tests_to_evaluate:
        stop = False
        for l in max_lags:
            for a in alpha:
                while not stop:
                    try:
                        i = random.sample(dataframes, 1)[0]
                        if counts[i[1]] < limit:
                            print('evaluating: ' + str(i[3]))
                            start = time.time()
                            pcmci = PCMCI(dataframe=i[2],
                                          cond_ind_test=test[1],
                                          verbosity=0)
                            # correlations = pcmci.get_lagged_dependencies(tau_max=20)
                            pcmci.verbosity = 1
                            results = pcmci.run_pcmci(tau_max=l, pc_alpha=a)
                            time_lapse = round(time.time() - start, 2)

                            q_matrix = pcmci.get_corrected_pvalues(
                                p_matrix=results['p_matrix'],
                                fdr_method='fdr_bh')
                            valid_parents = list(
                                pcmci.return_significant_parents(
                                    pq_matrix=q_matrix,
                                    val_matrix=results['val_matrix'],
                                    alpha_level=a)['parents'].values())
                            flat_list = []
                            for sublist in valid_parents:
                                for item in sublist:
                                    flat_list.append(item)
                            valid_links = len(flat_list)

                            test_results.append([i[3], i[0], i[1], l,
                                                 test[0], a, valid_links,
                                                 time_lapse])
                            results_df = pd.DataFrame(
                                test_results,
                                columns=['representation', 'complexity',
                                         'sample_size', 'max_lag', 'test',
                                         'alpha', 'valid_links_at_alpha',
                                         'learning_time'])
                            print('results ready to be saved')
                            results_df.to_csv(
                                'results/performance_sample_sizes.csv',
                                index=False)
                            counts[i[1]] += 1
                            if all(value == limit
                                   for value in counts.values()):
                                stop = True

                    except:
                        print('Hoopla!')
                        pass

        for i in unique_complexities:
            counts[i] = 0
def df_data_remove_z(df_data, z_keys=[str, list], lag_z: [int, list] = [0],
                     keys=None, standardize: bool = True,
                     plot: bool = True):
    '''
    Parameters
    ----------
    df_data : pd.DataFrame
        DataFrame containing timeseries.
    z_keys : str or list, optional
        Variable(s) z whose influence will be removed from the columns
        in keys. The default is str.

    Returns
    -------
    df_new : pd.DataFrame (and the figure, if plot=True).
    '''
    method = ParCorr()
    if type(z_keys) is str:
        z_keys = [z_keys]
    if keys is None:
        discard = ['TrainIsTrue', 'RV_mask'] + z_keys
        keys = [k for k in df_data.columns if k not in discard]
    if hasattr(df_data.index, 'levels') == False:
        # create fake multi-index for standard data format
        df_data.index = pd.MultiIndex.from_product([[0], df_data.index])
    if type(lag_z) is int:
        lag_z = [lag_z]

    max_lag = max(lag_z)
    dates = df_data.index.levels[1]
    df_z = df_data[z_keys]
    zlist = []
    if 0 in lag_z:
        zlist = [df_z.loc[pd.IndexSlice[:, dates[max_lag:]], :]]
    [zlist.append(df_z.loc[pd.IndexSlice[:, dates[max_lag - l:-l]], :])
     for l in lag_z if l != 0]
    # update df_data to account for lags (first dates have no lag):
    df_data = df_data.loc[pd.IndexSlice[:, dates[max_lag:]], :]
    # align index dates for easy merging
    for d_ in zlist:
        d_.index = df_data.index
    df_z = pd.concat(zlist, axis=1).loc[pd.IndexSlice[:, dates[max_lag:]], :]
    # update columns with lag indication
    df_z.columns = [
        f'{c[0]}_lag{c[1]}'
        for c in np.array(np.meshgrid(z_keys, lag_z)).T.reshape(-1, 2)
    ]

    npstore = np.zeros(shape=(len(keys),
                              df_data.index.levels[0].size,
                              dates[max_lag:].size))
    for i, orig in enumerate(keys):
        orig = keys[i]
        # create fake X, Y format, needed for function _get_single_residuals
        dfxy = df_data[[orig]].merge(
            df_data[[orig]].copy().rename({orig: 'copy'}, axis=1),
            left_index=True, right_index=True).copy()
        # Append Z timeseries
        dfxyz = dfxy.merge(df_z, left_index=True, right_index=True)
        for s in df_data.index.levels[0]:
            dfxyz_s = dfxyz.loc[s].copy()
            if all(dfxyz_s[orig].isna().values):
                npstore[i, s, :] = dfxyz_s[orig].values  # fill in all nans
            else:
                npstore[i, s, :] = method._get_single_residuals(
                    np.moveaxis(dfxyz_s.values, 0, 1), 0,
                    standardize=standardize,
                    return_means=True)[0]

    df_new = pd.DataFrame(np.moveaxis(npstore, 0, 2).reshape(-1, len(keys)),
                          index=df_data.index, columns=keys)
    if plot:
        fig, axes = plt.subplots(len(keys), 1,
                                 figsize=(10, 2.5 * len(keys)),
                                 sharex=True)
        if len(keys) == 1:
            axes = [axes]
        for i, k in enumerate(keys):
            df_data[k].loc[0].plot(ax=axes[i], label=f'{k} original',
                                   legend=False, color='green',
                                   lw=1, alpha=.8)
            cols = ', '.join(c.replace('_', ' ') for c in df_z.columns)
            df_new[k].loc[0].plot(ax=axes[i], label=cols + ' regressed out',
                                  legend=False, color='blue', lw=1)
            axes[i].legend()
        out = (df_new, fig)
    else:
        out = (df_new)
    return out
T, N = data.shape

# Initialize dataframe object, specify time axis and variable names
# var_names = [r'$X^0$', r'$X^1$', r'$X^2$', r'$X^3$']
dataframe = pp.DataFrame(data,
                         datatime=np.arange(len(data)),
                         var_names=headers)
if verbose > 0:
    plot = tp.plot_timeseries(dataframe)[0]
    if display_images:
        plot.show()
    if save_images:
        plot.savefig("timeseries.png")

parcorr = ParCorr(significance='analytic')
pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr, verbosity=1)

correlations = pcmci.get_lagged_dependencies(tau_max=3)
lag_func_matrix = tp.plot_lagfuncs(val_matrix=correlations,
                                   setup_args={'var_names': headers,
                                               'x_base': 5,
                                               'y_base': .5})
if verbose > 1:
    if display_images:
        lag_func_matrix.savefig()
    if save_images:
        lag_func_matrix.savefig("lag_func.png")
def run_pcmci(data, data_mask, var_names, path_outsub2, s,
              tau_min=0, tau_max=1, pc_alpha=None, alpha_level=0.05,
              max_conds_dim=4, max_combinations=1,
              max_conds_py=None, max_conds_px=None,
              verbosity=4):
    #%%
    if path_outsub2 is not False:
        txt_fname = os.path.join(path_outsub2, f'split_{s}_PCMCI_out.txt')
        # from contextlib import redirect_stdout
        orig_stdout = sys.stdout
        # buffer print statement output to f
        sys.stdout = f = io.StringIO()
    #%%
    # =================================================================
    # tigramite 4
    # =================================================================
    T, N = data.shape  # Time, Regions

    # =================================================================
    # Initialize dataframe object (needed for tigramite functions)
    # =================================================================
    dataframe = pp.DataFrame(data=data, mask=data_mask,
                             var_names=var_names)

    # =================================================================
    # pc algorithm: only parents for selected_variables are calculated
    # =================================================================
    parcorr = ParCorr(significance='analytic',
                      mask_type='y',
                      verbosity=verbosity)
    # =================================================================
    # multiple testing problem:
    # =================================================================
    pcmci = PCMCI(dataframe=dataframe,
                  cond_ind_test=parcorr,
                  selected_variables=None,
                  verbosity=verbosity)
    # selected_variables : list of integers, optional (default: range(N))
    #     Specify to estimate parents only for selected variables. If None
    #     is passed, parents are estimated for all variables.

    # selected_links = dictionary/None
    results = pcmci.run_pcmci(tau_max=tau_max,
                              pc_alpha=pc_alpha,
                              tau_min=tau_min,
                              max_conds_dim=max_conds_dim,
                              max_combinations=max_combinations,
                              max_conds_px=max_conds_px,
                              max_conds_py=max_conds_py)

    q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                           fdr_method='fdr_bh')
    pcmci.print_significant_links(p_matrix=results['p_matrix'],
                                  q_matrix=q_matrix,
                                  val_matrix=results['val_matrix'],
                                  alpha_level=alpha_level)
    #%%
    if path_outsub2 is not False:
        file = io.open(txt_fname, mode='w+')
        file.write(f.getvalue())
        file.close()
        f.close()
        sys.stdout = orig_stdout

    return pcmci, q_matrix, results