def test_pcmci(self):
        # Setting up strict test level
        pc_alpha = 0.05  #[0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
        tau_max = 2
        alpha_level = 0.01

        dataframe = pp.DataFrame(self.data)

        cond_ind_test = ParCorr(verbosity=verbosity)

        pcmci = PCMCI(dataframe=dataframe,
                      cond_ind_test=cond_ind_test,
                      verbosity=verbosity)

        results = pcmci.run_pcmci(
            tau_max=tau_max,
            pc_alpha=pc_alpha,
        )

        parents = pcmci._return_significant_parents(
            pq_matrix=results['p_matrix'],
            val_matrix=results['val_matrix'],
            alpha_level=alpha_level)['parents']

        # print parents
        # print self.true_parents
        assert_graphs_equal(parents, self.true_parents)

    def test_mci(self):

        # Setting up strict test level
        pc_alpha = 0.05  #[0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
        tau_max = 2
        alpha_level = 0.01

        dataframe = pp.DataFrame(self.data)

        cond_ind_test = ParCorr(verbosity=verbosity)

        pcmci = PCMCI(selected_variables=None,
                      dataframe=dataframe,
                      cond_ind_test=cond_ind_test,
                      verbosity=verbosity)

        results = pcmci.run_mci(
            selected_links=None,
            tau_min=1,
            tau_max=tau_max,
            parents=self.true_parents,
            max_conds_py=None,
            max_conds_px=None,
        )

        parents = pcmci._return_significant_parents(
            pq_matrix=results['p_matrix'],
            val_matrix=results['val_matrix'],
            alpha_level=alpha_level,
        )['parents']
        # print parents
        # print _get_parent_graph(true_parents)
        assert_graphs_equal(parents, self.true_parents)

    def test_pc_stable_max_conds_dim(self):

        # Setting up strict test level
        pc_alpha = 0.05  #[0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
        tau_max = 2
        alpha_level = 0.01

        # true_parents_here = {0: [],
        #                1: [(1, -1), (0, -1)],
        #                2: []
        #                }

        dataframe = pp.DataFrame(self.data)

        cond_ind_test = ParCorr(verbosity=verbosity)

        pcmci = PCMCI(selected_variables=None,
                      dataframe=dataframe,
                      cond_ind_test=cond_ind_test,
                      verbosity=verbosity)

        pcmci.run_pc_stable(
            selected_links=None,
            tau_min=1,
            tau_max=tau_max,
            save_iterations=False,
            pc_alpha=pc_alpha,
            max_conds_dim=2,
            max_combinations=1,
        )

        parents = pcmci.all_parents
        # print parents
        # print _get_parent_graph(true_parents)
        assert_graphs_equal(parents, self.true_parents)

def create_tigramite_dataframe(dataset, exclude=None):
    '''Creates a TIGRAMITE dataframe from a pandas dataframe.

    Args:
        dataset: A pandas dataframe with a timestamp column and numeric measures.
        exclude: A list of columns to be excluded from the TIGRAMITE dataframe.

    Returns: A TIGRAMITE dataframe and the list of variable names.

    '''
    var_list = list(dataset)
    if exclude is not None:
        if all(i in var_list for i in exclude):
            for i in exclude:
                var_list.remove(i)
        else:
            raise ValueError('Not all columns in exclude exist in the dataframe; check the spelling.')

    data = dataset[var_list]

    if 'timestamp' in list(dataset):
        datatime = dataset["timestamp"]
    else:
        raise ValueError('The dataframe must contain a <timestamp> column; check the spelling.')

    dataframe = pp.DataFrame(data.values, datatime=datatime.values, var_names=var_list)
    return dataframe, var_list
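
A minimal usage sketch for the helper above; the CSV file name and the excluded column are hypothetical placeholders.

import pandas as pd

raw = pd.read_csv('sensor_readings.csv')  # hypothetical file with a 'timestamp' column
tig_dataframe, var_list = create_tigramite_dataframe(raw, exclude=['station_id'])
print(var_list)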
Example 5
def gen_data_frame(links_coeffs, time, seed_val):
    # Set the random seed
    np.random.seed(seed_val)
    # Generate the data
    data, _ = pp.var_process(links_coeffs, T=time)
    # Get the true parents
    true_parents = _get_parent_graph(links_coeffs)
    return pp.DataFrame(data), true_parents
Example 6
def pcmci_setup(data):
	dataframe = pp.DataFrame(data.values, var_names=list(data.columns))
	parcorr = ParCorr(significance='analytic')
	pcmci = PCMCI(
		dataframe=dataframe,
		cond_ind_test=parcorr,
		verbosity=1)
	return pcmci
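
A short usage sketch for pcmci_setup, assuming a numeric pandas DataFrame; the toy data below is illustrative only.

import numpy as np
import pandas as pd

toy = pd.DataFrame(np.random.randn(500, 3), columns=['x', 'y', 'z'])
pcmci = pcmci_setup(toy)
results = pcmci.run_pcmci(tau_max=2, pc_alpha=0.05)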
 def _set_dataframe(self, dataset):
     dataframe = pp.DataFrame(dataset)
     # Set the data for this iteration of the algorithm
     self.dataframe = dataframe
     # Store the shape of the data in the T and N variables
     self.T, self.N = self.dataframe.values.shape
     # Some checks
     if (np.any(np.array(self.selected_variables) < 0)
             or np.any(np.array(self.selected_variables) >= self.N)):
         raise ValueError("selected_variables must be within 0..N-1")
Example 8
def a_sample(request):
    # Set the parameters
    links_coeffs, time, seed_val = request.param
    # Set the random seed
    np.random.seed(seed_val)
    # Generate the data
    data, _ = pp.var_process(links_coeffs, T=time)
    # Get the true parents
    true_parents = _get_parent_graph(links_coeffs)
    return pp.DataFrame(data), true_parents
Example 9
def a_pcmciplus_order_independence(request):
    # Set the parameters
    links_coeffs, time, seed_val = request.param

    # Retrieve lags
    tau_min, tau_max = pp._get_minmax_lag(links_coeffs)
    # Generate the data
    data, _ = pp.structural_causal_process(links=links_coeffs,
                                           T=time,
                                           noises=None,
                                           seed=seed_val)
    # Get the true parents
    # true_parents = pp._get_parents(links_coeffs, exclude_contemp=False)
    true_graph = pp.links_to_graph(links_coeffs, tau_max=tau_max)
    return pp.DataFrame(data), true_graph, links_coeffs, tau_min, tau_max
Example 10
def bootstrapping_ar_model(model, num_bs=200, seed=52):
    """Residual bootstrap of a fitted VAR model; returns the refitted models.

    Note: `parent_dict` (the parents used for the original fit) is assumed to
    be available in the enclosing scope when refitting below.
    """
    T, N = model.data.shape
    
    std_data = np.zeros_like(model.data)
    #standardize
    for i in range(N):
        std_data[:,i] = (model.data[:,i] - model.data[:,i].mean())/model.data[:,i].std()
        
    # initial model coeffs
    phi = model.phi
    
    tau_max = phi.shape[0] - 1 
    
    residuals = np.zeros((T-tau_max, N))
    
    for i in range(T-tau_max):
        
        model_eval = np.zeros((1,N))
        for tau in range(1, tau_max+1):
            model_eval += np.dot(phi[tau],std_data[i+tau_max-tau])
            
        residuals[i,:] = std_data[i+tau_max,:] - model_eval
    
    # generate bootstrap data
    bs_models = []
    ts_indexes = np.arange(residuals.shape[0])
    np.random.seed(seed)
    for _ in range(num_bs):
        bs_residuals = residuals[np.random.choice(ts_indexes, size=T, replace=True),:]
        # bs model
        bs_x = np.zeros((T, N))
        for t in range(0,T):
            if t < tau_max:
                bs_x[t,:] = bs_residuals[t,:]
            else:
                model_eval = np.zeros((1,N))
                for tau in range(1, tau_max+1):
                    model_eval += np.dot(phi[tau],bs_x[t-tau])
                    
                bs_x[t,:] = model_eval + bs_residuals[t,:]
        #fit bs data
        bs_med = LinearMediation(dataframe=pp.DataFrame(data=bs_x)) #, data_transform=False)
        bs_med.fit_model(all_parents=parent_dict['parents'])
        bs_models.append(bs_med)
    
    return bs_models
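
A sketch of how the returned ensemble might be summarized; `med` stands in for a previously fitted LinearMediation model, and each bootstrap model exposes the same `phi` coefficient array used above.

bs_models = bootstrapping_ar_model(med, num_bs=200)
# Stack the (tau_max+1, N, N) coefficient arrays and take percentile intervals
phis = np.stack([m.phi for m in bs_models])
ci_lower, ci_upper = np.percentile(phis, [2.5, 97.5], axis=0)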
Example 11
def calculate(para_setup):

    para_setup_string, sam = para_setup

    paras = para_setup_string.split('-')
    paras = [w.replace("'", "") for w in paras]

    model = str(paras[0])
    N = int(paras[1])
    n_links = int(paras[2])
    min_coeff = float(paras[3])
    coeff = float(paras[4])
    auto = float(paras[5])
    contemp_fraction = float(paras[6])
    frac_unobserved = float(paras[7])
    max_true_lag = int(paras[8])
    T = int(paras[9])

    ci_test = str(paras[10])
    method = str(paras[11])
    pc_alpha = float(paras[12])
    tau_max = int(paras[13])

    #############################################
    ##  Data
    #############################################

    def lin_f(x):
        return x

    def f2(x):
        return (x + 5. * x**2 * np.exp(-x**2 / 20.))

    if model == 'autobidirected':
        if verbosity > 999:
            model_seed = verbosity - 1000
        else:
            model_seed = sam

        random_state = np.random.RandomState(model_seed)

        links = {
            0: [((0, -1), auto, lin_f), ((1, -1), coeff, lin_f)],
            1: [],
            2: [((2, -1), auto, lin_f), ((1, -1), coeff, lin_f)],
            3: [((3, -1), auto, lin_f), ((2, -1), min_coeff, lin_f)],
        }
        observed_vars = [0, 2, 3]

        noises = [random_state.randn for j in range(len(links))]

        data_all, nonstationary = mod.generate_nonlinear_contemp_timeseries(
            links=links, T=T, noises=noises, random_state=random_state)
        data = data_all[:, observed_vars]

    elif 'random' in model:
        if 'lineargaussian' in model:

            coupling_funcs = [lin_f]

            noise_types = ['gaussian']  #, 'weibull', 'uniform']
            noise_sigma = (0.5, 2)

        elif 'nonlinearmixed' in model:

            coupling_funcs = [lin_f, f2]

            noise_types = ['gaussian', 'gaussian', 'weibull']
            noise_sigma = (0.5, 2)

        if coeff < min_coeff:
            min_coeff = coeff
        couplings = list(np.arange(min_coeff, coeff + 0.1, 0.1))
        couplings += [-c for c in couplings]

        auto_deps = list(np.arange(max(0., auto - 0.3), auto + 0.01, 0.05))

        # Models may be non-stationary. Hence, we iterate over a number of seeds
        # to find a stationary one regarding network topology, noises, etc
        if verbosity > 999:
            model_seed = verbosity - 1000
        else:
            model_seed = sam

        for ir in range(1000):
            # np.random.seed(model_seed)
            random_state = np.random.RandomState(model_seed)

            N_all = math.floor((N / (1. - frac_unobserved)))
            n_links_all = math.ceil(n_links / N * N_all)
            observed_vars = np.sort(
                random_state.choice(range(N_all),
                                    size=math.ceil(
                                        (1. - frac_unobserved) * N_all),
                                    replace=False)).tolist()

            links = mod.generate_random_contemp_model(
                N=N_all,
                L=n_links_all,
                coupling_coeffs=couplings,
                coupling_funcs=coupling_funcs,
                auto_coeffs=auto_deps,
                tau_max=max_true_lag,
                contemp_fraction=contemp_fraction,
                # num_trials=1000,
                random_state=random_state)

            class noise_model:
                def __init__(self, sigma=1):
                    self.sigma = sigma

                def gaussian(self, T):
                    # Get zero-mean unit variance gaussian distribution
                    return self.sigma * random_state.randn(T)

                def weibull(self, T):
                    # Get zero-mean sigma variance weibull distribution
                    a = 2
                    mean = scipy.special.gamma(1. / a + 1)
                    variance = scipy.special.gamma(
                        2. / a + 1) - scipy.special.gamma(1. / a + 1)**2
                    return self.sigma * (random_state.weibull(a=a, size=T) -
                                         mean) / np.sqrt(variance)

                def uniform(self, T):
                    # Get zero-mean sigma variance uniform distribution
                    mean = 0.5
                    variance = 1. / 12.
                    return self.sigma * (random_state.uniform(size=T) -
                                         mean) / np.sqrt(variance)

            noises = []
            for j in links:
                noise_type = random_state.choice(noise_types)
                sigma = noise_sigma[0] + (
                    noise_sigma[1] - noise_sigma[0]) * random_state.rand()
                noises.append(getattr(noise_model(sigma), noise_type))

            if 'discretebinom' in model:
                if 'binom2' in model:
                    n_binom = 2
                elif 'binom4' in model:
                    n_binom = 4

                data_all_check, nonstationary = discretized_scp(
                    links=links,
                    T=T + 10000,
                    n_binom=n_binom,
                    random_state=random_state)
            else:
                data_all_check, nonstationary = mod.generate_nonlinear_contemp_timeseries(
                    links=links,
                    T=T + 10000,
                    noises=noises,
                    random_state=random_state)

            # If the model is stationary, break the loop
            if not nonstationary:
                data_all = data_all_check[:T]
                data = data_all[:, observed_vars]
                break
            else:
                print("Trial %d: Not a stationary model" % ir)
                model_seed += 10000
    else:
        raise ValueError("model %s not known" % model)

    if nonstationary:
        raise ValueError("No stationary model found: %s" % model)

    true_graph = utilities._get_pag_from_dag(links,
                                             observed_vars=observed_vars,
                                             tau_max=tau_max,
                                             verbosity=verbosity)[1]

    if verbosity > 0:
        print("True Links")
        for j in links:
            print(j, links[j])
        print("observed_vars = ", observed_vars)
        print("True PAG")
        if tau_max > 0:
            for lag in range(tau_max + 1):
                print(true_graph[:, :, lag])
        else:
            print(true_graph.squeeze())

    if plot_data:
        print("PLOTTING")
        for j in range(N):
            # ax = fig.add_subplot(N,1,j+1)
            pyplot.plot(data[:, j])

        pyplot.show()

    computation_time_start = time.time()

    dataframe = pp.DataFrame(data)

    #############################################
    ##  Methods
    #############################################

    # Specify conditional independence test object
    if ci_test == 'par_corr':
        cond_ind_test = ParCorr(significance='analytic',
                                recycle_residuals=True)
    elif ci_test == 'cmi_knn':
        cond_ind_test = CMIknn(knn=0.1, sig_samples=500, sig_blocklength=1)
    elif ci_test == 'gp_dc':
        cond_ind_test = GPDC(recycle_residuals=True)
    elif ci_test == 'discg2':
        cond_ind_test = DiscG2()
    else:
        raise ValueError("CI test not recognized.")

    if 'lpcmci' in method:
        method_paras = method.split('_')
        n_preliminary_iterations = int(method_paras[1][7:])

        prelim_only = 'prelimonly' in method

        lpcmci = LPCMCI(dataframe=dataframe, cond_ind_test=cond_ind_test)

        lpcmcires = lpcmci.run_lpcmci(
            tau_max=tau_max,
            pc_alpha=pc_alpha,
            max_p_non_ancestral=3,
            n_preliminary_iterations=n_preliminary_iterations,
            prelim_only=prelim_only,
            verbosity=verbosity)

        graph = lpcmci.graph
        val_min = lpcmci.val_min_matrix
        max_cardinality = lpcmci.cardinality_matrix

    elif method == 'svarfci':
        svarfci = SVARFCI(dataframe=dataframe, cond_ind_test=cond_ind_test)
        svarfcires = svarfci.run_svarfci(
            tau_max=tau_max,
            pc_alpha=pc_alpha,
            max_cond_px=0,
            max_p_dsep=3,
            fix_all_edges_before_final_orientation=True,
            verbosity=verbosity)

        graph = svarfci.graph
        val_min = svarfci.val_min_matrix
        max_cardinality = svarfci.cardinality_matrix

    elif method == 'svarrfci':
        svarrfci = SVARRFCI(dataframe=dataframe, cond_ind_test=cond_ind_test)

        svarrfcires = svarrfci.run_svarrfci(
            tau_max=tau_max,
            pc_alpha=pc_alpha,
            fix_all_edges_before_final_orientation=True,
            verbosity=verbosity)

        graph = svarrfci.graph
        val_min = svarrfci.val_min_matrix
        max_cardinality = svarrfci.cardinality_matrix
    else:
        raise ValueError("%s not implemented." % method)

    computation_time_end = time.time()
    computation_time = computation_time_end - computation_time_start

    return {
        'true_graph': true_graph,
        'val_min': val_min,
        'max_cardinality': max_cardinality,

        # Method results
        'computation_time': computation_time,
        'graph': graph,
    }
np.random.seed(42)
links_coeffs = {
    0: [((0, -1), 0.7), ((1, -1), -0.8)],
    1: [((1, -1), 0.8), ((3, -1), 0.8)],
    2: [((2, -1), 0.5), ((1, -2), 0.5), ((3, -3), 0.6)],
    3: [((3, -1), 0.4)],
}
T = 1000
data, true_parents_neighbors = pp.var_process(links_coeffs, T=T)

# In[3]:

T, N = data.shape
var_names = [r'$X^0$', r'$X^1$', r'$X^2$', r'$X^3$']
dataframe = pp.DataFrame(data,
                         datatime=np.arange(len(data)),
                         var_names=var_names)

# In[4]:

data.shape

# In[5]:

tp.plot_timeseries(dataframe)
plt.show()

# In[6]:

parcorr = ParCorr(significance='analytic')
pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr, verbosity=1)
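
Continuing this setup, a sketch of the usual next steps seen in the other examples here: run PCMCI, correct the p-values, and print the significant links.

results = pcmci.run_pcmci(tau_max=3, pc_alpha=None)
q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                       fdr_method='fdr_bh')
pcmci.print_significant_links(p_matrix=results['p_matrix'],
                              q_matrix=q_matrix,
                              val_matrix=results['val_matrix'],
                              alpha_level=0.01)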
Example 13
def run_pcmci(data, data_mask, var_names, path_outsub2, s, tau_min=0, tau_max=1, 
              pc_alpha=None, alpha_level=0.05, max_conds_dim=4, max_combinations=1, 
              max_conds_py=None, max_conds_px=None, verbosity=4):
    

    
    #%%
    if path_outsub2 is not False:
        txt_fname = os.path.join(path_outsub2, f'split_{s}_PCMCI_out.txt')
#        from contextlib import redirect_stdout
        orig_stdout = sys.stdout
        # buffer print statement output to f
        sys.stdout = f = io.StringIO()
    #%%            
    # ======================================================================================================================
    # tigramite 4
    # ======================================================================================================================

    T, N = data.shape # Time, Regions
    # ======================================================================================================================
    # Initialize dataframe object (needed for tigramite functions)
    # ======================================================================================================================
    dataframe = pp.DataFrame(data=data, mask=data_mask, var_names=var_names)
    # ======================================================================================================================
    # pc algorithm: only parents for selected_variables are calculated
    # ======================================================================================================================

    parcorr = ParCorr(significance='analytic',
                      mask_type='y',
                      verbosity=verbosity)
    #==========================================================================
    # multiple testing problem:
    #==========================================================================
    pcmci   = PCMCI(dataframe=dataframe,
                    cond_ind_test=parcorr,
                    selected_variables=None,
                    verbosity=verbosity)

    # selected_variables : list of integers, optional (default: range(N))
    #    Specify to estimate parents only for selected variables. If None is
    #    passed, parents are estimated for all variables.

    # ======================================================================================================================
    #selected_links = dictionary/None
    results = pcmci.run_pcmci(tau_max=tau_max, pc_alpha=pc_alpha, tau_min=tau_min,
                              max_conds_dim=max_conds_dim, 
                              max_combinations=max_combinations,
                              max_conds_px=max_conds_px,
                              max_conds_py=max_conds_py)

    q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'], fdr_method='fdr_bh')

    pcmci.print_significant_links(p_matrix=results['p_matrix'],
                                  q_matrix=q_matrix,
                                  val_matrix=results['val_matrix'],
                                  alpha_level=alpha_level)
    #%%
    if path_outsub2 is not False:
        file = io.open(txt_fname, mode='w+')
        file.write(f.getvalue())
        file.close()
        f.close()

        sys.stdout = orig_stdout


    return pcmci, q_matrix, results
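
A hypothetical call sketch for run_pcmci above; the data array, mask, and variable names are placeholders with the shapes the function expects, and path_outsub2=False skips the text-file output.

data = np.random.randn(200, 4)
data_mask = np.zeros(data.shape, dtype=bool)
var_names = ['v0', 'v1', 'v2', 'v3']
pcmci, q_matrix, results = run_pcmci(data, data_mask, var_names,
                                     path_outsub2=False, s=0,
                                     tau_max=2, pc_alpha=0.05)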
Example 14
def test_order_independence_pcmciplus(a_pcmciplus_order_independence,
                                      a_pcmciplus_params_order_independence):
    # Unpack the pcmci and the true parents, and common parameters
    dataframe, true_graph, links_coeffs, tau_min, tau_max = \
        a_pcmciplus_order_independence

    data = dataframe.values
    T, N = data.shape

    # Unpack the parameters
    (
        pc_alpha,
        contemp_collider_rule,
        conflict_resolution,
        reset_lagged_links,
        cond_ind_test_class,
    ) = a_pcmciplus_params_order_independence

    if cond_ind_test_class == 'oracle_ci':
        cond_ind_test = OracleCI(links_coeffs)
    elif cond_ind_test_class == 'par_corr':
        cond_ind_test = ParCorr()

    # Run the PCMCI algorithm with the given parameters
    pcmci = PCMCI(dataframe=dataframe,
                  cond_ind_test=cond_ind_test,
                  verbosity=1)
    print("************************")
    print("\nTrue Graph")
    pcmci.print_significant_links(p_matrix=(true_graph == 0),
                                  val_matrix=true_graph,
                                  conf_matrix=None,
                                  q_matrix=None,
                                  graph=true_graph,
                                  ambiguous_triples=None,
                                  alpha_level=0.05)

    results = pcmci.run_pcmciplus(
        selected_links=None,
        tau_min=tau_min,
        tau_max=tau_max,
        pc_alpha=pc_alpha,
        contemp_collider_rule=contemp_collider_rule,
        conflict_resolution=conflict_resolution,
        reset_lagged_links=reset_lagged_links,
        max_conds_dim=None,
        max_conds_py=None,
        max_conds_px=None,
    )
    correct_results = results['graph']

    for perm in itertools.permutations(range(N)):

        print(perm)
        data_new = np.copy(data[:, perm])
        dataframe = pp.DataFrame(data_new, var_names=list(perm))
        pcmci = PCMCI(dataframe=dataframe,
                      cond_ind_test=cond_ind_test,
                      verbosity=1)
        results = pcmci.run_pcmciplus(
            selected_links=None,
            tau_min=tau_min,
            tau_max=tau_max,
            pc_alpha=pc_alpha,
            contemp_collider_rule=contemp_collider_rule,
            conflict_resolution=conflict_resolution,
            reset_lagged_links=reset_lagged_links,
            max_conds_dim=None,
            max_conds_py=None,
            max_conds_px=None,
        )

        tmp = np.take(correct_results, perm, axis=0)
        back_converted_result = np.take(tmp, perm, axis=1)

        for tau in range(tau_max + 1):
            if not np.allclose(results['graph'][:, :, tau],
                               back_converted_result[:, :, tau]):
                print(tau)
                print(results['graph'][:, :, tau])
                print(back_converted_result[:, :, tau])
                print(back_converted_result[:, :, tau] -
                      results['graph'][:, :, tau])
                print(perm)

        # np.allclose(results['graph'], back_converted_result)
        np.testing.assert_equal(results['graph'], back_converted_result)
Example 15
def run_PCMCI(ex, outdic_actors, s, df_splits, map_proj):
    #=====================================================================================
    #
    # 4) PCMCI-algorithm
    #
    #=====================================================================================

    # save output
    if ex['SaveTF']:
        #        from contextlib import redirect_stdout
        orig_stdout = sys.stdout
        # buffer print statement output to f
        if sys.version[:1] == '3':
            sys.stdout = f = io.StringIO()
        elif sys.version[:1] == '2':
            sys.stdout = f = open(os.path.join(ex['fig_subpath'], 'old.txt'),
                                  'w+')

    #%%
    # amount of text printed:
    verbosity = 3

    # alpha level for independence test within the pc procedure (finding parents)
    pc_alpha = ex['pcA_sets'][ex['pcA_set']]
    # alpha level for the multiple linear regression model while conditioning
    # on parents of parents
    alpha_level = ex['alpha_level_tig']
    print('run tigramite 4, run.pcmci')
    print(('alpha level(s) for independence tests within the pc procedure '
           '(finding parents): {}'.format(pc_alpha)))
    print((
        'alpha level for multiple linear regression model while conditioning on parents of '
        'parents: {}'.format(ex['alpha_level_tig'])))

    # Retrieve traintest info
    traintest = df_splits

    # load Response Variable class
    RV = ex[ex['RV_name']]
    # create list with all actors, these will be merged into the fulldata array
    allvar = ex['vars'][0]
    var_names_corr = []
    actorlist = []
    cols = [[RV.name]]

    for var in allvar[:]:
        print(var)
        actor = outdic_actors[var]
        if actor.ts_corr[s].size != 0:
            ts_train = actor.ts_corr[s].values
            actorlist.append(ts_train)
            # create array which numbers the regions
            var_idx = allvar.index(var)
            n_regions = actor.ts_corr[s].shape[1]
            actor.var_info = [[i + 1, actor.ts_corr[s].columns[i], var_idx]
                              for i in range(n_regions)]
            # Array of corresponding regions with var_names_corr (first entry is RV)
            var_names_corr = var_names_corr + actor.var_info
            cols.append(list(actor.ts_corr[s].columns))
            index_dates = actor.ts_corr[s].index
    var_names_corr.insert(0, RV.name)

    # stack actor time-series together:
    fulldata = np.concatenate(tuple(actorlist), axis=1)

    print(('There are {} regions in total'.format(fulldata.shape[1])))
    # add the full 1D time series of interest as first entry:

    fulldata = np.column_stack((RV.RVfullts, fulldata))
    df_data = pd.DataFrame(fulldata, columns=flatten(cols), index=index_dates)

    if ex['import_prec_ts']:
        var_names_full = var_names_corr.copy()
        for d in ex['precursor_ts']:
            path_data = d[1]
            if len(path_data) > 1:
                path_data = ''.join(list(path_data))
            # skip first col because it is the RV ts
            df_data_ext = func_fc.load_hdf5(
                path_data)['df_data'].iloc[:, 1:].loc[s]
            cols_ts = np.logical_or(df_data_ext.dtypes == 'float64',
                                    df_data_ext.dtypes == 'float32')
            cols_ext = list(df_data_ext.columns[cols_ts])
            # cols_ext must be of format '{}_{int}_{}'
            lab_int = 100
            for i, c in enumerate(cols_ext):
                char = c.split('_')[1]
                if char.isdigit():
                    pass
                else:
                    cols_ext[i] = c.replace(char, str(lab_int)) + char
                    lab_int += 1

            df_data_ext = df_data_ext[cols_ext]
            to_freq = ex['tfreq']
            if to_freq != 1:
                start_end_date = (ex['sstartdate'], ex['senddate'])
                start_end_year = (ex['startyear'], ex['endyear'])
            df_data_ext = functions_pp.time_mean_bins(df_data_ext,
                                                      to_freq,
                                                      start_end_date,
                                                      start_end_year,
                                                      seldays='part')[0]
            #            df_data_ext = functions_pp.time_mean_bins(df_data_ext,
            #                                                     ex, ex['tfreq'],
            #                                                     seldays='part')[0]
            # Expand var_names_corr
            n = var_names_full[-1][0] + 1
            add_n = n + len(cols_ext)
            n_var_idx = var_names_full[-1][-1] + 1
            for i in range(n, add_n):
                var_names_full.append([i, cols_ext[i - n], n_var_idx])
            df_data = df_data.merge(df_data_ext,
                                    left_index=True,
                                    right_index=True)
    else:
        var_names_full = var_names_corr

    bool_train = traintest.loc[s]['TrainIsTrue']
    bool_RV_train = np.logical_and(bool_train, traintest.loc[s]['RV_mask'])
    dates_train = traintest.loc[s]['TrainIsTrue'][bool_train].index
    dates_RV_train = traintest.loc[s]['TrainIsTrue'][bool_RV_train].index

    RVfull_train = RV.RVfullts.sel(time=dates_train)
    datesfull_train = pd.to_datetime(RVfull_train.time.values)
    data = df_data.loc[datesfull_train].values
    print(data.shape)

    # get RV datamask (same shape als data)
    data_mask = [
        True if d in dates_RV_train else False for d in datesfull_train
    ]
    data_mask = np.repeat(data_mask, data.shape[1]).reshape(data.shape)

    # add traintest mask to fulldata
    #    dates_all = pd.to_datetime(RV.RVfullts.index)
    #    dates_RV  = pd.to_datetime(RV.RV_ts.index)
    dates_all = pd.to_datetime(RV.RVfullts.time.values)
    dates_RV = pd.to_datetime(RV.RV_ts.time.values)
    df_data['TrainIsTrue'] = [
        True if d in datesfull_train else False for d in dates_all
    ]
    df_data['RV_mask'] = [True if d in dates_RV else False for d in dates_all]

    # ======================================================================================================================
    # tigramite 3
    # ======================================================================================================================

    T, N = data.shape  # Time, Regions
    # ======================================================================================================================
    # Initialize dataframe object (needed for tigramite functions)
    # ======================================================================================================================
    dataframe = pp.DataFrame(data=data,
                             mask=data_mask,
                             var_names=var_names_full)
    # ======================================================================================================================
    # pc algorithm: only parents for selected_variables are calculated
    # ======================================================================================================================

    parcorr = ParCorr(significance='analytic',
                      mask_type='y',
                      verbosity=verbosity)
    #==========================================================================
    # multiple testing problem:
    #==========================================================================
    pcmci = PCMCI(dataframe=dataframe,
                  cond_ind_test=parcorr,
                  selected_variables=None,
                  verbosity=4)

    # selected_variables : list of integers, optional (default: range(N))
    #    Specify to estimate parents only for selected variables. If None is
    #    passed, parents are estimated for all variables.

    # ======================================================================================================================
    #selected_links = dictionary/None
    results = pcmci.run_pcmci(tau_max=ex['tigr_tau_max'],
                              pc_alpha=pc_alpha,
                              tau_min=0,
                              max_combinations=ex['max_comb_actors'])

    q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                           fdr_method='fdr_bh')

    pcmci.print_significant_links(p_matrix=results['p_matrix'],
                                  q_matrix=q_matrix,
                                  val_matrix=results['val_matrix'],
                                  alpha_level=alpha_level)

    # returns all parents, not just causal precursors (of lag>0)
    sig = rgcpd.return_sign_parents(pcmci,
                                    pq_matrix=q_matrix,
                                    val_matrix=results['val_matrix'],
                                    alpha_level=alpha_level)

    all_parents = sig['parents']
    #    link_matrix = sig['link_matrix']

    links_RV = all_parents[0]

    df = rgcpd.bookkeeping_precursors(links_RV, var_names_full)
    #%%

    rgcpd.print_particular_region_new(links_RV, var_names_corr, s,
                                      outdic_actors, map_proj, ex)

    #%%
    if ex['SaveTF']:
        if sys.version[:1] == '3':
            fname = f's{s}_' + ex['params'] + '.txt'
            file = io.open(os.path.join(ex['fig_subpath'], fname), mode='w+')
            file.write(f.getvalue())
            file.close()
            f.close()
        elif sys.version[:1] == '2':
            f.close()
        sys.stdout = orig_stdout

    return df, df_data
Example 16

# Example data, here the real dataset can be loaded as a numpy array of shape
# (T, N)
numpy.random.seed(42)     # Fix random seed
links_coeffs = {0: [((0, -1), 0.7)],
                1: [((1, -1), 0.8), ((0, -1), 0.8)],
                2: [((2, -1), 0.5), ((1, -2), 0.5)],
                }

T = 500     # time series length
data, true_parents_neighbors = pp.var_process(links_coeffs, T=T)
T, N = data.shape

# Initialize dataframe object
dataframe = pp.DataFrame(data)

# Optionally specify variable names (three variables in this process)
var_names = [r'$X^0$', r'$X^1$', r'$X^2$']

# Significance level in condition-selection step. If a list of levels is
# provided or pc_alpha=None, the optimal pc_alpha is automatically chosen via
# model-selection.
pc_alpha = 0.2  # [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
selected_variables = range(N)  # e.g. [2] to restrict to variable 2

# Maximum time lag
tau_max = 3

# Optional minimum time lag in MCI step (in PC-step this is 1)
tau_min = 0
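
A sketch of how these settings would be passed on, mirroring the ParCorr/PCMCI pattern of the other examples:

cond_ind_test = ParCorr(significance='analytic')
pcmci = PCMCI(dataframe=dataframe, cond_ind_test=cond_ind_test, verbosity=1)
results = pcmci.run_pcmci(tau_min=tau_min, tau_max=tau_max, pc_alpha=pc_alpha)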
Example 17
def _get_pag_from_dag(links_coeffs,
                      observed_vars=None,
                      tau_max=None,
                      verbosity=0):
    """Computes PAG over observed variables from DAG on full variable set.
    
       Uses OracleCI tests based on ancestors in DAG to obtain skeleton and sepsets.
       Then applies FCI rules (including collider rule).
    """

    if verbosity > 0:
        print("Running _get_pag_from_dag:\n\n1. Ancestors search")

    N_all = len(links_coeffs)

    # If tau_max is None, compute from links_coeffs
    _, max_lag_links = _get_minmax_lag(links_coeffs)
    if tau_max is None:
        tau_max = max_lag_links
    else:
        if max_lag_links > tau_max:
            raise ValueError(
                "tau_max must be >= maximum lag in links_coeffs; choose tau_max=None"
            )

    if observed_vars is None:
        observed_vars = range(N_all)
    else:
        if not set(observed_vars).issubset(set(range(N_all))):
            raise ValueError("observed_vars must be subset of range(N_all).")

    N = len(observed_vars)
    # Init cond_ind_test class
    cond_ind_test = OracleCI(links_coeffs)

    # Init graph and sepsets
    graph_dict = {
        j: {(i, -tau): "o-o"
            for i in range(N) for tau in range(tau_max + 1)
            if tau > 0 or j != i}
        for j in range(N)
    }
    sepsets = {
        j: {(i, -tau): {}
            for i in range(N) for tau in range(tau_max + 1)
            if (tau > 0 or i < j)}
        for j in range(N)
    }

    sepset_answers = {}

    # We will enumerate the observed variables with (i, j), which refers to
    # the index in the PAG graph, while x, y iterate through the observed
    # variables in the underlying DAG.

    # Loop over the observed variables
    for j, y in enumerate(observed_vars):

        for i, x in enumerate(observed_vars):
            for tau in range(0, tau_max + 1):
                if (x, -tau) != (y, 0):

                    dag_anc_y, _ = cond_ind_test._get_non_blocked_ancestors(
                        Y=[(y, 0)],
                        conds=None,
                        mode='max_lag',
                        max_lag=tau_max)
                    # Only consider observed ancestors
                    pag_anc_y = [
                        anc for anc in dag_anc_y[(y, 0)]
                        if anc[0] in observed_vars
                    ]

                    dag_anc_x, _ = cond_ind_test._get_non_blocked_ancestors(
                        Y=[(x, -tau)],
                        conds=None,
                        mode='max_lag',
                        max_lag=tau_max)

                    # Only consider observed ancestors
                    pag_anc_x = [
                        anc for anc in dag_anc_x[(x, -tau)]
                        if anc[0] in observed_vars
                    ]

                    Z = list(
                        set([
                            z for z in pag_anc_y + pag_anc_x
                            if z != (y, 0) and z != (x, -tau)
                        ]))

                    separated = cond_ind_test._is_dsep(X=[(x, -tau)],
                                                       Y=[(y, 0)],
                                                       Z=Z,
                                                       max_lag=None)

                    # If X and Y are connected given Z, mark a link
                    if not separated and tau == 0:
                        graph_dict[j][(i, -tau)] = "o-o"
                    elif not separated and tau > 0:
                        graph_dict[j][(i, -tau)] = "o->"
                    # If X and Y are separated given Z, mark absence of links and store sepset
                    else:
                        graph_dict[j][(i, -tau)] = ""

                        # Translate sepset to (i,j)-space
                        S = frozenset((observed_vars.index(cond[0]), cond[1])
                                      for cond in Z)
                        #sepsets[j][(i, -tau)] = {(S, "wm")}

                        sepsets[j][(i, -tau)] = {(S, "")}
                        if tau == 0:
                            #sepsets[i][(j, 0)] = {(S, "wm")}
                            sepsets[i][(j, 0)] = {(S, "")}

                        if tau > 0 or (tau == 0 and i < j):
                            X_type = (i, -tau)
                            Y_type = (j, 0)
                        else:
                            X_type = (j, 0)
                            Y_type = (i, 0)

                        for s in S:
                            sepset_answers[(X_type, s, Y_type)] = False

                        for k, tau in product(range(N), range(0, tau_max + 1)):
                            if sepset_answers.get(
                                (X_type, (k, -tau), Y_type)) is None:
                                sepset_answers[(X_type, (k, -tau),
                                                Y_type)] = True

    if verbosity > 0:
        print("2. FCI orientation rules")

    # Initialize SVARFCI with dummy data
    svarfci = SVARFCI(dataframe=pp.DataFrame(np.zeros((N + 1, N))),
                      cond_ind_test=cond_ind_test)
    svarfci._initialize(tau_max=tau_max,
                        pc_alpha=0.01,
                        max_cond_px=np.inf,
                        max_p_global=np.inf,
                        max_p_dsep=np.inf,
                        max_q_global=np.inf,
                        max_pds_set=np.inf,
                        fix_all_edges_before_final_orientation=False,
                        verbosity=verbosity)
    svarfci._oracle = True

    # Update graph_dict and sepsets
    svarfci.graph_dict = graph_dict
    svarfci.sepsets = sepsets

    # Run *all* rules
    svarfci._B_not_in_SepSet_AC_given_answers = sepset_answers
    svarfci._run_fci_orientation_phase()

    # Also return array version of pag graph
    pag_graph = svarfci._dict2graph()

    return svarfci.graph_dict, pag_graph
Example 18
            fulldata = pp.time_bin_with_mask(
                fulldata, time_bin_length=time_bin_length)[0]
            fulldata_mask = pp.time_bin_with_mask(
                fulldata_mask, time_bin_length=time_bin_length)[0] > 0.
            print("Fulldata after binning shape = %s" % str(fulldata.shape))
            print("Fulldata after binning masked shape = %s" %
                  str(fulldata_mask.shape))

            # # Only use selected indices
            # selected_comps_indices=[]
            # for i in selected_components:
            #     selected_comps_indices.append(int(comps_order_file['comps'][i]))
            #
            # fulldata = fulldata[:, selected_comps_indices]
            # fulldata_mask = fulldata_mask[:, selected_comps_indices]

            dataframe = pp.DataFrame(fulldata, mask=fulldata_mask)

            print("Fulldata shape = %s" % str(dataframe.values.shape))
            print("Unmasked samples %d" %
                  (dataframe.mask[:, 0] == False).sum())

            # sys.exit(0)
            T, N = dataframe.values.shape
            #print(N)

            resdict = {
                "CI_params": {
                    'significance': 'analytic',
                    'use_mask': True,
                    'mask_type': ['y'],
                    'recycle_residuals': False,
Example 19
def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag):

    T = T_data
    N = N_data
    tau_max = maxlag

    # Verbosity:
    # 0 - nothing
    # 1 - final graph only
    # 2 - everything
    verbose_max = 2
    verbose = 2
    print("======")
    # print(list(data))  # got 100 records as itertools.chain object, not numpy df

    data = np.array(list(data))
    print("data len is ")
    print(len(data))
    # data = np.fromiter(data, float)
    # print(data)
    # Initialize dataframe object, specify time axis and variable names
    dataframe = pp.DataFrame(data, datatime=dt, var_names=headers)
    print(dataframe.var_names)
    rcot = RCOT(significance='analytic')
    pcmci_rcot = PCMCI(dataframe=dataframe, cond_ind_test=rcot, verbosity=0)

    pcmci_rcot.verbosity = 1
    results = pcmci_rcot.run_pcmci(tau_max=tau_max, pc_alpha=0.05)

    # Print results
    print("p-values")
    print(results['p_matrix'].round(3))
    print("MCI partial correlations")
    print(results['val_matrix'].round(2))

    # print("inside def pcmci_causality")

    # output edges
    result_arr = []
    # result_arr.append(["effect","cause"])

    for index_cause, item in enumerate(results['p_matrix']):
        print("index is")
        print(index)
        print("item is")
        print(item)
        print("cause is")
        cause = headers[index_cause]
        print(headers[index_cause])
        for index_effect, arr in enumerate(item):
            print("effect arr is ")
            print(arr)
            print("effect name is")
            effect = headers[index_effect]
            print(headers[index_effect])
            for arrItem in arr:
                if arrItem < 0.05 and cause != effect:
                    result_arr.append([effect, cause, index])
                    print("{} caused by {}".format(effect, cause))
                    break

        with open("pcmci_para_out{}.csv".format(index), "w", newline='') as f:
            for row in result_arr:
                f.write("%s\n" % ','.join(str(col) for col in row))
    # print(pcmci)
    return result_arr
Example 20
numpy.random.seed(42)  # Fix random seed
links_coeffs = {
    0: [((0, -1), 0.7)],
    1: [((1, -1), 0.8), ((0, -1), 0.8)],
    2: [((2, -1), 0.5), ((1, -2), 0.5)],
}

T = 500  # time series length
data, true_parents_neighbors = pp.var_process(links_coeffs, T=T)
T, N = data.shape

# Optionally specify variable names (three variables in this process)
var_names = [r'$X^0$', r'$X^1$', r'$X^2$']

# Initialize dataframe object
dataframe = pp.DataFrame(data, var_names=var_names)

# Significance level in condition-selection step. If a list of levels is
# provided or pc_alpha=None, the optimal pc_alpha is automatically chosen via
# model-selection.
pc_alpha = 0.2  # [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
selected_variables = range(N)  # e.g. [2] to restrict to variable 2

# Maximum time lag
tau_max = 3

# Optional minimum time lag in MCI step (in PC-step this is 1)
tau_min = 0

# Maximum cardinality of conditions in PC condition-selection step. The
# recommended default choice is None to leave it unrestricted.
max_conds_dim = None

def build_link_pcmci_noself(p_data_values, p_agent_names, p_var_sou,
                            p_var_tar):
    """
    build links by n column data
    """
    [times_num, agent_num] = p_data_values.shape
    # set the data for PCMCI
    data_frame = pp.DataFrame(p_data_values,
                              var_names=p_agent_names,
                              missing_flag=BaseConfig.BACKGROUND_VALUE)
    # new PCMCI
    pcmci = PCMCI(dataframe=data_frame, cond_ind_test=ParCorr())
    # run PCMCI
    alpha_level = 0.01
    results_pcmci = pcmci.run_pcmciplus(tau_min=0,
                                        tau_max=2,
                                        pc_alpha=alpha_level)
    # get the result
    graph_pcmci = results_pcmci['graph']
    q_matrix = results_pcmci['q_matrix']
    p_matrix = results_pcmci['p_matrix']
    val_matrix = results_pcmci['val_matrix']
    conf_matrix = results_pcmci['conf_matrix']
    ambiguous_triples = results_pcmci['ambiguous_triples']
    # filter these links
    links_df = pd.DataFrame(columns=('VarSou', 'VarTar', 'Source', 'Target',
                                     'TimeLag', 'Strength', 'Unoriented'))
    if graph_pcmci is not None:
        sig_links = (graph_pcmci != "") * (graph_pcmci != "<--")
    elif q_matrix is not None:
        sig_links = (q_matrix <= alpha_level)
    else:
        sig_links = (p_matrix <= alpha_level)
    for j in range(agent_num):
        links = {(p[0], -p[1]): np.abs(val_matrix[p[0], j, abs(p[1])])
                 for p in zip(*np.where(sig_links[:, j, :]))}
        # Sort by value
        sorted_links = sorted(links, key=links.get, reverse=True)
        for p in sorted_links:
            VarSou = p_var_sou
            VarTar = p_var_tar
            Source = p_agent_names[j]
            Target = p_agent_names[p[0]]
            TimeLag = p[1]
            Strength = val_matrix[p[0], j, abs(p[1])]
            Unoriented = None
            if graph_pcmci is not None:
                if p[1] == 0 and graph_pcmci[j, p[0], 0] == "o-o":
                    Unoriented = 1
                    # "unoriented link"
                elif graph_pcmci[p[0], j, abs(p[1])] == "x-x":
                    Unoriented = 1
                    # "unclear orientation due to conflict"
                else:
                    Unoriented = 0
            links_df = links_df.append(pd.DataFrame({
                'VarSou': [VarSou],
                'VarTar': [VarTar],
                'Source': [Source],
                'Target': [Target],
                'TimeLag': [TimeLag],
                'Strength': [Strength],
                'Unoriented': [Unoriented]
            }),
                                       ignore_index=True)
    # remove the self correlation edges
    links_df = links_df.loc[links_df['Source'] != links_df['Target']]
    return links_df
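
A hypothetical usage sketch for build_link_pcmci_noself; the array, agent names, and source/target labels are placeholders.

values = np.random.randn(300, 3)
names = ['agent_0', 'agent_1', 'agent_2']
links_df = build_link_pcmci_noself(values, names, 'var_source', 'var_target')
print(links_df.head())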
senegal_millet_file = 'pricedata/SenegalGEIWSMillet.csv'
millet_prices = GEIWS_prices(senegal_millet_file)
millet_prices = subtract_rolling_mean(adjust_seasonality(millet_prices))

study_data = millet_prices

# give a custom NaN value for tigramite to interpret
mssng = 99999
study_data = study_data.copy().fillna(mssng)

dataframe = pp.DataFrame(study_data.values, var_names=study_data.columns,
                         missing_flag=mssng)
tp.plot_timeseries(dataframe)
parcorr = ParCorr(significance='analytic')

gpdc = GPDC(significance='analytic', gp_params=None)

pcmci_gpdc = PCMCI(
    dataframe=dataframe, 
    cond_ind_test=gpdc,
    verbosity=0)

pcmci = PCMCI(
    dataframe=dataframe, 
    cond_ind_test=parcorr,
    verbosity=1)
Example 23
def init_pcmci(df_data,
               significance='analytic',
               mask_type='y',
               selected_variables=None,
               verbosity=5):
    '''
    First initializing pcmci object for each training set. This allows to plot
    lagged cross-correlations which help to identity a reasonably tau_max.

    Parameters
    ----------
    df_data : pandas DataFrame
        df_data is retrieved by running rg.get_ts_prec().
    significance : str, optional
        DESCRIPTION. The default is 'analytic'.
    mask_type : str, optional
        DESCRIPTION. The default is 'y'.
    verbosity : int, optional
        DESCRIPTION. The default is 4.
    selected_variables : list of integers, optional (default: None)
        Specify to estimate parents only for selected variables. If None is
        passed, parents are estimated for all variables.

    Returns
    -------
    dictionary of format {split:pcmci}.

    '''
    splits = df_data.index.levels[0]
    pcmci_dict = {}
    RV_mask = df_data['RV_mask']
    for s in range(splits.size):

        TrainIsTrue = df_data['TrainIsTrue'].loc[s]
        df_data_s = df_data.loc[s][TrainIsTrue == True]
        df_data_s = df_data_s.dropna(axis=1, how='all')
        if any(df_data_s.isna().values.flatten()):
            if verbosity > 0:
                print('Warning: NaNs detected')


#        print(np.unique(df_data_s.isna().values))
        var_names = [
            k for k in df_data_s.columns
            if k not in ['TrainIsTrue', 'RV_mask']
        ]
        df_data_s = df_data_s.loc[:, var_names]
        data = df_data_s.values
        data_mask = ~RV_mask.loc[s][TrainIsTrue == True].values
        # indices with mask == False are used (with mask_type 'y')
        data_mask = np.repeat(data_mask, data.shape[1]).reshape(data.shape)

        # create dataframe in Tigramite format
        dataframe = pp.DataFrame(data=data,
                                 mask=data_mask,
                                 var_names=var_names)

        parcorr = ParCorr(significance=significance,
                          mask_type=mask_type,
                          verbosity=0)
        parcorr.verbosity = verbosity  # to avoid print init text each time

        # ======================================================================================================================
        # pc algorithm: only parents for selected_variables are calculated
        # ======================================================================================================================
        pcmci = PCMCI(dataframe=dataframe,
                      cond_ind_test=parcorr,
                      verbosity=verbosity)
        pcmci_dict[s] = pcmci
    return pcmci_dict
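
A sketch following the docstring above: the returned objects can be used per split to inspect lagged dependencies and pick a reasonable tau_max (df_data from rg.get_ts_prec() is assumed).

pcmci_dict = init_pcmci(df_data, verbosity=0)
for s, pcmci in pcmci_dict.items():
    correlations = pcmci.get_lagged_dependencies(tau_max=10)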
Example 24
 def run(self):
     data, _ = pp.var_process(self.links_coeffs, T=1000)
     dataframe = pp.DataFrame(data)
     cond_ind_test = ParCorr()
     self.pcmciobj = PCMCI(dataframe=dataframe, cond_ind_test=cond_ind_test)
     self.results = self.pcmciobj.run_pcmci(tau_max=2, pc_alpha=None)
        # ======================================================================================================================
        # new mask
        # ======================================================================================================================
        print(data.shape)

        data_mask = np.ones(data.shape, dtype='bool')
        for i in range(4):  # take into account 4 months starting from june=5
            data_mask[5 + i::12, :] = False  # [22+i:: 52,:]
        ##
        T, N = data.shape

        # ======================================================================================================================
        # Initialize dataframe object (needed for tigramite functions)
        # ======================================================================================================================
        dataframe = pp.DataFrame(data=data, mask=data_mask)

        # Specify time axis and variable names
        datatime = np.arange(len(data))

        # ======================================================================================================================
        # pc algorithm: only parents for selected_variables are calculated (here entry[0] = PoV)
        # ======================================================================================================================

        parcorr = ParCorr(significance='analytic',
                          use_mask=True,
                          mask_type='y',
                          verbosity=2)
        pcmci = PCMCI(
            dataframe=dataframe,
            cond_ind_test=parcorr)
    def plot__gpdc_get_single_residuals(self):

        #######
        ci_test = self.ci_gpdc
        # ci_test = self.ci_par_corr

        a = 0.
        c = .3
        T = 500
        # Each key refers to a variable and the incoming links are supplied as a
        # list of format [((driver, lag), coeff), ...]
        links_coeffs = {
            0: [((0, -1), a)],
            1: [((1, -1), a), ((0, -1), c)],
        }

        numpy.random.seed(42)
        data, true_parents_neighbors = pp.var_process(links_coeffs,
                                                      use='inv_inno_cov',
                                                      T=T)
        dataframe = pp.DataFrame(data)
        ci_test.set_dataframe(dataframe)

        # ci_test.set_tau_max(1)

        # X=[(1, -1)]
        # Y=[(1, 0)]
        # Z=[(0, -1)] + [(1, -tau) for tau in range(1, 2)]
        # array, xyz, XYZ = ci_test.get_array(X, Y, Z,
        #     verbosity=0)]
        # ci_test.run_test(X, Y, Z,)
        def func(x):
            return x * (1. - 4. * x**0 * numpy.exp(-x**2 / 2.))

        true_residual = numpy.random.randn(3, T)
        array = numpy.copy(true_residual)
        array[1] += c * func(array[2])  #.sum(axis=0)
        xyz = numpy.array([0, 1] + [2 for i in range(array.shape[0] - 2)])

        print('xyz ', xyz, numpy.where(xyz == 1))
        target_var = 1

        dim, T = array.shape
        # array -= array.mean(axis=1).reshape(dim, 1)
        c_std = c  #/array[1].std()
        # array /= array.std(axis=1).reshape(dim, 1)
        array_orig = numpy.copy(array)

        import matplotlib
        from matplotlib import pyplot
        (est_residual, pred) = ci_test._get_single_residuals(array,
                                                             target_var,
                                                             standardize=False,
                                                             return_means=True)
        (resid_, pred_parcorr) = self.ci_par_corr._get_single_residuals(
            array, target_var, standardize=False, return_means=True)

        fig = pyplot.figure()
        ax = fig.add_subplot(111)
        ax.scatter(array_orig[2], array_orig[1])
        ax.scatter(array_orig[2], pred, color='red')
        ax.scatter(array_orig[2], pred_parcorr, color='green')
        ax.plot(numpy.sort(array_orig[2]),
                c_std * func(numpy.sort(array_orig[2])),
                color='black')

        pyplot.savefig('/home/jakobrunge/test/gpdctest.pdf')
Example 27
    m_y_indices = [
        m_y_data.columns.get_loc('Month'),
        m_y_data.columns.get_loc('Year')
    ]
    m_y_data = m_y_data.interpolate(
        method='linear',
        limit=inter_max_gap) if interpolate else m_y_data
    data_filled = m_y_data.fillna(mssng)

else:
    pass

data_filled = data_filled[study_vars] if use_study_vars else data_filled
T, N = data_filled.shape
dataframe = pp.DataFrame(data_filled.values,
                         var_names=data_filled.columns,
                         missing_flag=mssng)

#links to study
selected_links = {
    i: list(
        chain.from_iterable([[(j, k)
                              for k in range(-tau_max, -steps_ahead + 1)]
                             for j in range(N) if j != target]))
    for i in range(N)
}
#    remove month and year conditions

#data_stationary_filled = data_stationary.fillna(mssng)
#data_filled = data_filled.dropna()
#data_stationary_filled = data_stationary_filled[study_vars] if use_study_vars == True else data_stationary_filled
Example 28
def test_construct_array(cstrct_array_params):
    # Unpack the parameters
    (x_nds, y_nds, z_nds), tau_max, missing_vals, mask_type =\
        cstrct_array_params
    # Make some fake data
    data = np.arange(1000).reshape(10, 100).T
    # Get the needed parameters from the data
    T, N = data.shape
    max_lag = 2 * tau_max
    n_times = T - max_lag

    # When testing masking and missing value flags, we will remove time slices,
    # starting with the earliest slice.  This counter keeps track of how many
    # rows have been masked.
    n_rows_masked = 0

    # Make a fake mask
    data_mask = np.zeros_like(data, dtype='bool')
    if mask_type is not None:
        for var, nodes in zip(['x', 'y', 'z'], [x_nds, y_nds, z_nds]):
            if var in mask_type:
                # Get the first node
                a_nd, a_tau = nodes[0]
                # Mask the first value of this node
                data_mask[a_tau - n_times + n_rows_masked, a_nd] = True
                n_rows_masked += 1

    # Choose the fake missing value as the earliest time entry in the first
    # z-node from the original (non-shifted) data that is not cut off by
    # max_lag or masked values from the first z-node
    missing_flag = None
    if missing_vals:
        # Get the node index
        a_nd, _ = z_nds[0]
        # Select the earliest non-cutoff entry from the unshifted data set
        earliest_time = max_lag + n_rows_masked
        missing_flag = data[earliest_time, a_nd]
        # Record that the row with this value and all rows up to max_lag after
        # this value have been cut off as well
        n_rows_masked += max_lag + 1

    # Construct the array
    data_f = pp.DataFrame(data, data_mask, missing_flag)
    array, xyz = data_f.construct_array(x_nds,
                                        y_nds,
                                        z_nds,
                                        tau_max=tau_max,
                                        mask_type=mask_type,
                                        verbosity=VERBOSITY)
    # Ensure x_nds, y_nds, z_ndes are unique
    x_nds = list(OrderedDict.fromkeys(x_nds))
    y_nds = list(OrderedDict.fromkeys(y_nds))
    z_nds = list(OrderedDict.fromkeys(z_nds))
    z_nds = [
        node for node in z_nds if (node not in x_nds) and (node not in y_nds)
    ]

    # Get the expected results
    expect_array = np.array([
        list(
            range(data[time - n_times, node],
                  data[time - n_times, node] + n_times))
        for node, time in x_nds + y_nds + z_nds
    ])
    expect_xyz = np.array([0 for _ in x_nds] +\
                          [1 for _ in y_nds] +\
                          [2 for _ in z_nds])
    # Apply the mask, which always blocks the latest time of the 0th node of the
    # masked variable, which removes the first n time slices in the returned
    # array
    expect_array = expect_array[:, n_rows_masked:]
    # Test the results
    np.testing.assert_almost_equal(array, expect_array)
    np.testing.assert_almost_equal(xyz, expect_xyz)
Example 29
    with open("test.csv", "w", newline="") as csvfile:
        data_writer = csv.writer(csvfile,
                                 delimiter=delimiter,
                                 quotechar=quotechar)
        for line in data:
            #data_writer.writerow("|".join([str(s) for s in line]))
            data_writer.writerow(line)

    exit()

T, N = data.shape

# Initialize dataframe object, specify time axis and variable names
#var_names = [r'$X^0$', r'$X^1$', r'$X^2$', r'$X^3$']
dataframe = pp.DataFrame(data,
                         datatime=np.arange(len(data)),
                         var_names=headers)

if verbose > 0:
    plot = tp.plot_timeseries(dataframe)[0]
    if display_images:
        plot.show()
    if save_images:
        plot.savefig("timeseries.png")

parcorr = ParCorr(significance='analytic')
pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr, verbosity=1)

correlations = pcmci.get_lagged_dependencies(tau_max=3)
lag_func_matrix = tp.plot_lagfuncs(val_matrix=correlations,
                                   setup_args={
Example 30
def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag):
    T = T_data
    N = N_data
    # Run settings
    # there is another tau_max in lagged dependencies that might be much longer!
    tau_max = maxlag

    # Verbosity:
    # 0 - nothing
    # 1 - final graph only
    # 2 - everything
    verbose_max = 2
    verbose = 2
    print("======")
    # print(list(data))  # got 100 records as itertools.chain object, not numpy df

    # Initialize dataframe object, specify time axis and variable names
    dataframe = pp.DataFrame(data, datatime=dt, var_names=headers)
    print(dataframe.var_names)
    rcot = RCOT(significance='analytic')
    pcmci_rcot = PCMCI(dataframe=dataframe, cond_ind_test=rcot, verbosity=0)

    pcmci_rcot.verbosity = 1
    results = pcmci_rcot.run_pcmci(tau_max=tau_max, pc_alpha=0.05)

    # Print results
    print("p-values")
    print(results['p_matrix'].round(3))
    print("MCI partial correlations")
    print(results['val_matrix'].round(2))

    # Save results to file
    # p_matrix = results['p_matrix']
    # with open("p-values_baseline.csv", "w") as csv_file:
    #     writer = csv.writer(csv_file, delimiter=",", quotechar="|", quoting=csv.QUOTE_MINIMAL)
    #     # [[[1 2 3]]] Three brackets to get through.
    #     for sector in p_matrix:
    #         print("sector: ", sector)
    #         for row in sector:
    #             print("row: ", row)
    #             writer.writerow(row)
    #         writer.writerow([])
    #
    # print("inside def pcmci_causality")

    # output edges
    result_arr = []

    for index_cause, item in enumerate(results['p_matrix']):
        # print("index is")
        # print(index)
        # print("item is")
        # print(item)
        # print("cause is")
        cause = headers[index_cause]
        # print(headers[index_cause])
        for index_effect, arr in enumerate(item):
            # print("effect arr is ")
            # print(arr)
            # print("effect name is")
            effect = headers[index_effect]
            # print(headers[index_effect])
            for arrItem in arr:
                if arrItem < 0.05 and cause != effect:
                    result_arr.append([effect, cause, index])
                    print("{} caused by {}".format(effect, cause))
                    break

    with open("pcmci_baseline_out.csv", "w", newline='') as f:
        for row in result_arr:
            f.write("%s\n" % ','.join(str(col) for col in row))
    # print(pcmci)
    print(result_arr)

    return result_arr