def test_cmi_knn(self):
    """Compare the CMIknn dependence measure with a reference value.

    Five series of length 10000 are drawn with a fixed seed. The first two
    are replaced by a bivariate Gaussian sample whose off-diagonal covariance
    is ``val_ana``; both then receive an additive contribution from the sum
    of the remaining three series. The estimate returned by
    ``get_dependence_measure`` must agree with ``_par_corr_to_cmi(val_ana)``
    within an absolute tolerance of 0.02.

    Both values are also printed. The ``print`` calls use the function-call
    form with a single argument, which is valid on Python 3 and prints the
    same text on Python 2.
    """
    ci_cmi_knn = CMIknn(use_mask=False,
                        mask_type=None,
                        significance='shuffle_test',
                        fixed_thres=None,
                        sig_samples=10000,
                        sig_blocklength=3,
                        knn=10,
                        confidence='bootstrap',
                        conf_lev=0.9,
                        conf_samples=10000,
                        conf_blocklength=1,
                        verbosity=0)

    val_ana = 0.6
    T = 10000
    # Fixed seed; the order of the random draws below must not change.
    numpy.random.seed(42)
    array = numpy.random.randn(5, T)

    cov = numpy.array([[1., val_ana], [val_ana, 1.]])
    array[:2, :] = numpy.random.multivariate_normal(mean=numpy.zeros(2),
                                                    cov=cov,
                                                    size=T).T

    # Generate some confounding: rows 2.. are added to both of the first
    # two rows (with different weights).
    if len(array) > 2:
        array[0] += 0.5 * array[2:].sum(axis=0)
        array[1] += 0.7 * array[2:].sum(axis=0)

    # One label per row of ``array``; presumably 0 = X, 1 = Y, 2 = Z
    # (conditioning set) -- confirm against the CMIknn API.
    xyz = numpy.array([0, 1, 2, 2, 2])

    val_est = ci_cmi_knn.get_dependence_measure(array, xyz)

    print(val_est)
    print(_par_corr_to_cmi(val_ana))

    numpy.testing.assert_allclose(numpy.array(_par_corr_to_cmi(val_ana)),
                                  numpy.array(val_est),
                                  atol=0.02)
def cmi_knn(request):
    """Return a CMIknn instance built with a fixed set of keyword arguments.

    ``request`` is accepted but not used by this function.
    """
    settings = dict(
        mask_type=None,
        significance='shuffle_test',
        fixed_thres=None,
        sig_samples=10000,
        sig_blocklength=3,
        knn=10,
        confidence='bootstrap',
        conf_lev=0.9,
        conf_samples=10000,
        conf_blocklength=1,
        verbosity=0,
    )
    return CMIknn(**settings)
# Example #3
# 0
def issue38():
    """Run PCMCI with a default CMIknn test on the issue-38 example CSV.

    The CSV is read from the directory containing this file, using its first
    column as the index. Only the columns whose label contains the letter
    ``'i'`` are passed to PCMCI as ``selected_variables``. Progress is
    reported with ``print``; nothing is returned.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(here, 'tigramite_issue_38_input_example.csv')
    frame = pd.read_csv(csv_path, index_col=0)
    print(frame)

    tdf = DataFrame(
        data=frame.values,
        mask=None,
        missing_flag=None,
        var_names=frame.columns,
        datatime=None,
    )

    # Every CMIknn parameter is left at its default.
    indp_test = CMIknn()

    # Positional indices of the columns whose label contains 'i'.
    selected_variables_ix = [
        frame.columns.get_loc(label)
        for label in frame.columns
        if 'i' in label
    ]

    print(("Init PCMCI with:"
           f"dataframe={tdf},"
           f"cond_ind_test={indp_test},"
           f"selected_variables={selected_variables_ix},"
           f"verbosity=10,"))
    runner = PCMCI(
        dataframe=tdf,
        cond_ind_test=indp_test,
        selected_variables=selected_variables_ix,
        verbosity=10,
    )

    print("Running PCMCI...")
    runner.run_pcmci(tau_max=24, pc_alpha=0.1)

    print("Done successfully! No errors!")
def calculate(para_setup):
    """Generate one synthetic dataset and run a causal discovery method on it.

    ``para_setup`` is unpacked into ``(para_setup_string, sam)``. The string
    is split on ``'-'`` (single quotes are stripped from every piece) and the
    first 14 pieces are read positionally as::

        model, N, n_links, min_coeff, coeff, auto, contemp_fraction,
        frac_unobserved, max_true_lag, T, ci_test, method, pc_alpha, tau_max

    ``sam`` is used as the model seed unless ``verbosity > 999``.

    The names ``verbosity`` and ``plot_data`` are read here but never
    assigned in this function, so they have to be provided by an outer scope
    that is not visible in this block.

    NOTE(review): because the separator is ``'-'``, a negative number inside
    the parameter string would be split into two pieces -- confirm callers
    never encode negative values.

    Returns:
        dict with the keys ``'true_graph'``, ``'val_min'``,
        ``'max_cardinality'``, ``'computation_time'`` (seconds, measured with
        ``time.time()``) and ``'graph'``.

    Raises:
        ValueError: if ``model`` is neither ``'autobidirected'`` nor contains
            ``'random'``; if the generated model is flagged nonstationary;
            if ``ci_test`` is not recognised; or if ``method`` is not one of
            the implemented methods.
    """

    para_setup_string, sam = para_setup

    paras = para_setup_string.split('-')
    paras = [w.replace("'", "") for w in paras]

    # Data-generation parameters (positions 0-9).
    model = str(paras[0])
    N = int(paras[1])
    n_links = int(paras[2])
    min_coeff = float(paras[3])
    coeff = float(paras[4])
    auto = float(paras[5])
    contemp_fraction = float(paras[6])
    frac_unobserved = float(paras[7])
    max_true_lag = int(paras[8])
    T = int(paras[9])

    # Method parameters (positions 10-13).
    ci_test = str(paras[10])
    method = str(paras[11])
    pc_alpha = float(paras[12])
    tau_max = int(paras[13])

    #############################################
    ##  Data
    #############################################

    def lin_f(x):
        """Identity coupling function."""
        return x

    def f2(x):
        """Nonlinear coupling function: x + 5 x^2 exp(-x^2 / 20)."""
        return (x + 5. * x**2 * np.exp(-x**2 / 20.))

    if model == 'autobidirected':
        # A verbosity above 999 doubles as an explicit seed (verbosity - 1000);
        # otherwise ``sam`` is the seed.
        if verbosity > 999:
            model_seed = verbosity - 1000
        else:
            model_seed = sam

        random_state = np.random.RandomState(model_seed)

        # Fixed four-variable model. Variable 1 appears as a parent of
        # variables 0 and 2 but is left out of ``observed_vars`` below.
        links = {
            0: [((0, -1), auto, lin_f), ((1, -1), coeff, lin_f)],
            1: [],
            2: [((2, -1), auto, lin_f), ((1, -1), coeff, lin_f)],
            3: [((3, -1), auto, lin_f), ((2, -1), min_coeff, lin_f)],
        }
        observed_vars = [0, 2, 3]

        # One noise generator (the same ``randn`` method) per entry of links.
        noises = [random_state.randn for j in range(len(links))]

        data_all, nonstationary = mod.generate_nonlinear_contemp_timeseries(
            links=links, T=T, noises=noises, random_state=random_state)
        # Keep only the observed columns.
        data = data_all[:, observed_vars]

    elif 'random' in model:
        # NOTE(review): if ``model`` contains neither 'lineargaussian' nor
        # 'nonlinearmixed', then coupling_funcs, noise_types and noise_sigma
        # are never assigned and the code below fails with a NameError.
        if 'lineargaussian' in model:

            coupling_funcs = [lin_f]

            noise_types = ['gaussian']  #, 'weibull', 'uniform']
            noise_sigma = (0.5, 2)

        elif 'nonlinearmixed' in model:

            coupling_funcs = [lin_f, f2]

            # 'gaussian' is listed twice, so it is drawn twice as often as
            # 'weibull' by ``random_state.choice`` below.
            noise_types = ['gaussian', 'gaussian', 'weibull']
            noise_sigma = (0.5, 2)

        # Candidate coupling coefficients: a 0.1-spaced grid starting at
        # min_coeff (lowered to coeff if it was larger), plus the negated grid.
        if coeff < min_coeff:
            min_coeff = coeff
        couplings = list(np.arange(min_coeff, coeff + 0.1, 0.1))
        couplings += [-c for c in couplings]

        # Candidate autodependency coefficients: a 0.05-spaced grid starting
        # at max(0, auto - 0.3).
        auto_deps = list(np.arange(max(0., auto - 0.3), auto + 0.01, 0.05))

        # Models may be non-stationary. Hence, we iterate over a number of seeds
        # to find a stationary one regarding network topology, noises, etc
        if verbosity > 999:
            model_seed = verbosity - 1000
        else:
            model_seed = sam

        # At most 1000 attempts; each failed attempt moves the seed on by 10000.
        for ir in range(1000):
            # np.random.seed(model_seed)
            random_state = np.random.RandomState(model_seed)

            # Enlarge the system so that N variables remain after removing the
            # unobserved fraction, and scale the number of links accordingly.
            N_all = math.floor((N / (1. - frac_unobserved)))
            n_links_all = math.ceil(n_links / N * N_all)
            # Sorted random subset (without replacement) of the variables that
            # stay observed.
            observed_vars = np.sort(
                random_state.choice(range(N_all),
                                    size=math.ceil(
                                        (1. - frac_unobserved) * N_all),
                                    replace=False)).tolist()

            links = mod.generate_random_contemp_model(
                N=N_all,
                L=n_links_all,
                coupling_coeffs=couplings,
                coupling_funcs=coupling_funcs,
                auto_coeffs=auto_deps,
                tau_max=max_true_lag,
                contemp_fraction=contemp_fraction,
                # num_trials=1000,
                random_state=random_state)

            class noise_model:
                """Noise generators scaled by ``sigma``.

                The methods draw from the ``random_state`` of the enclosing
                function, i.e. the generator of the current attempt.
                """

                def __init__(self, sigma=1):
                    self.sigma = sigma

                def gaussian(self, T):
                    # Standard normal draws scaled by sigma
                    return self.sigma * random_state.randn(T)

                def weibull(self, T):
                    # Weibull draws (shape a = 2), centred and standardised
                    # with the analytic mean and variance, scaled by sigma
                    a = 2
                    mean = scipy.special.gamma(1. / a + 1)
                    variance = scipy.special.gamma(
                        2. / a + 1) - scipy.special.gamma(1. / a + 1)**2
                    return self.sigma * (random_state.weibull(a=a, size=T) -
                                         mean) / np.sqrt(variance)

                def uniform(self, T):
                    # Uniform [0, 1) draws, centred and standardised with
                    # mean 1/2 and variance 1/12, scaled by sigma.
                    # Not referenced by either noise_types list above.
                    mean = 0.5
                    variance = 1. / 12.
                    return self.sigma * (random_state.uniform(size=T) -
                                         mean) / np.sqrt(variance)

            # One noise generator per entry of links: random type, and a
            # sigma drawn between noise_sigma[0] and noise_sigma[1].
            noises = []
            for j in links:
                noise_type = random_state.choice(noise_types)
                sigma = noise_sigma[0] + (
                    noise_sigma[1] - noise_sigma[0]) * random_state.rand()
                noises.append(getattr(noise_model(sigma), noise_type))

            if 'discretebinom' in model:
                # NOTE(review): n_binom stays unassigned (NameError below) if
                # the model name contains neither 'binom2' nor 'binom4'.
                if 'binom2' in model:
                    n_binom = 2
                elif 'binom4' in model:
                    n_binom = 4

                # ``discretized_scp`` is not defined in this block.
                data_all_check, nonstationary = discretized_scp(
                    links=links,
                    T=T + 10000,
                    n_binom=n_binom,
                    random_state=random_state)
            else:
                # Generate T + 10000 samples; only the first T are kept below.
                data_all_check, nonstationary = mod.generate_nonlinear_contemp_timeseries(
                    links=links,
                    T=T + 10000,
                    noises=noises,
                    random_state=random_state)

            # If the model is stationary, break the loop
            if not nonstationary:
                data_all = data_all_check[:T]
                data = data_all[:, observed_vars]
                break
            else:
                print("Trial %d: Not a stationary model" % ir)
                model_seed += 10000
    else:
        raise ValueError("model %s not known" % model)

    # Still flagged after the branch above (for the random models: after all
    # 1000 attempts).
    if nonstationary:
        raise ValueError("No stationary model found: %s" % model)

    # Only the second element of the helper's return value is used.
    true_graph = utilities._get_pag_from_dag(links,
                                             observed_vars=observed_vars,
                                             tau_max=tau_max,
                                             verbosity=verbosity)[1]

    if verbosity > 0:
        print("True Links")
        for j in links:
            print(j, links[j])
        print("observed_vars = ", observed_vars)
        print("True PAG")
        if tau_max > 0:
            for lag in range(tau_max + 1):
                print(true_graph[:, :, lag])
        else:
            print(true_graph.squeeze())

    if plot_data:
        print("PLOTTING")
        for j in range(N):
            # ax = fig.add_subplot(N,1,j+1)
            pyplot.plot(data[:, j])

        pyplot.show()

    # The timed section covers the dataframe construction, the construction
    # of the CI test and the method run.
    computation_time_start = time.time()

    dataframe = pp.DataFrame(data)

    #############################################
    ##  Methods
    #############################################

    # Specify conditional independence test object
    if ci_test == 'par_corr':
        cond_ind_test = ParCorr(significance='analytic',
                                recycle_residuals=True)
    elif ci_test == 'cmi_knn':
        cond_ind_test = CMIknn(knn=0.1, sig_samples=500, sig_blocklength=1)
    elif ci_test == 'gp_dc':
        cond_ind_test = GPDC(recycle_residuals=True)
    elif ci_test == 'discg2':
        cond_ind_test = DiscG2()
    else:
        raise ValueError("CI test not recognized.")

    if 'lpcmci' in method:
        # The second '_'-separated token carries the number of preliminary
        # iterations after a 7-character prefix (presumably something like
        # 'nprelim4' -- confirm against the callers).
        method_paras = method.split('_')
        n_preliminary_iterations = int(method_paras[1][7:])

        if 'prelimonly' in method: prelim_only = True
        else: prelim_only = False

        lpcmci = LPCMCI(dataframe=dataframe, cond_ind_test=cond_ind_test)

        # The return value is not used; results are read from the object's
        # attributes below.
        lpcmcires = lpcmci.run_lpcmci(
            tau_max=tau_max,
            pc_alpha=pc_alpha,
            max_p_non_ancestral=3,
            n_preliminary_iterations=n_preliminary_iterations,
            prelim_only=prelim_only,
            verbosity=verbosity)

        graph = lpcmci.graph
        val_min = lpcmci.val_min_matrix
        max_cardinality = lpcmci.cardinality_matrix

    elif method == 'svarfci':
        svarfci = SVARFCI(dataframe=dataframe, cond_ind_test=cond_ind_test)
        svarfcires = svarfci.run_svarfci(
            tau_max=tau_max,
            pc_alpha=pc_alpha,
            max_cond_px=0,
            max_p_dsep=3,
            fix_all_edges_before_final_orientation=True,
            verbosity=verbosity)

        graph = svarfci.graph
        val_min = svarfci.val_min_matrix
        max_cardinality = svarfci.cardinality_matrix

    elif method == 'svarrfci':
        svarrfci = SVARRFCI(dataframe=dataframe, cond_ind_test=cond_ind_test)

        svarrfcires = svarrfci.run_svarrfci(
            tau_max=tau_max,
            pc_alpha=pc_alpha,
            fix_all_edges_before_final_orientation=True,
            verbosity=verbosity)

        graph = svarrfci.graph
        val_min = svarrfci.val_min_matrix
        max_cardinality = svarrfci.cardinality_matrix
    else:
        raise ValueError("%s not implemented." % method)

    computation_time_end = time.time()
    computation_time = computation_time_end - computation_time_start

    return {
        'true_graph': true_graph,
        'val_min': val_min,
        'max_cardinality': max_cardinality,

        # Method results
        'computation_time': computation_time,
        'graph': graph,
    }
def test(dataframes, max_lags=[4], alpha=[None], tests=['ParCorr'], limit=1):
    '''Run PCMCI on randomly drawn datasets for every test / lag / alpha combination.

    For each requested conditional independence test, each entry of
    ``max_lags`` and each entry of ``alpha``, datasets are drawn at random
    from ``dataframes`` until ``limit`` runs have succeeded for every
    distinct value of ``entry[1]``. After every successful run the
    accumulated results are written to
    ``results/performance_sample_sizes.csv``.

    Args:
        dataframes: A list of indexable records. ``entry[2]`` is the
            TIGRAMITE dataframe handed to PCMCI; ``entry[3]``, ``entry[0]``
            and ``entry[1]`` are stored in the ``representation``,
            ``complexity`` and ``sample_size`` columns of the output.
            ``entry[1]`` is also the key the ``limit`` is counted against.
            The list is shuffled in place.
        max_lags: Maximum lags (``tau_max``) to evaluate.
        alpha: Significance levels; each is used both as ``pc_alpha`` and as
            the ``alpha_level`` for selecting significant parents.
        tests: Names of the conditional independence tests to run. Recognised
            names are 'RCOT', 'GPDC', 'ParCorr' and 'CMIknn'.
        limit: Number of successful runs per distinct ``entry[1]`` value and
            per (test, lag, alpha) combination.

    Returns:
        None. Results are only written to the CSV file.

    The list defaults are never mutated here, so sharing them between calls
    is harmless.
    '''
    test_results = []
    # Kept from the original implementation: reorders the caller's list.
    random.shuffle(dataframes)

    # Instantiate only the requested tests, always in this order.
    tests_to_evaluate = []
    if 'RCOT' in tests:
        tests_to_evaluate.append(('RCOT', RCOT()))
    if 'GPDC' in tests:
        tests_to_evaluate.append(('GPDC', GPDC()))
    if 'ParCorr' in tests:
        tests_to_evaluate.append(('ParCorr', ParCorr(significance='analytic')))
    if 'CMIknn' in tests:
        tests_to_evaluate.append(('CMIknn', CMIknn()))

    # Successful runs per distinct entry[1] value.
    counts = {entry[1]: 0 for entry in dataframes}

    for test_name, cond_ind_test in tests_to_evaluate:
        for max_lag in max_lags:
            for alpha_level in alpha:
                # Fresh quota for every combination. The original kept one
                # `stop` flag per test, so every combination after the first
                # was silently skipped.
                counts = dict.fromkeys(counts, 0)

                # `any(... < limit)` also terminates immediately for an empty
                # `dataframes` list and for limit <= 0, where the original
                # `== limit` check looped forever.
                while any(done < limit for done in counts.values()):
                    entry = random.sample(dataframes, 1)[0]
                    if counts[entry[1]] >= limit:
                        # Quota for this key is already filled; draw again.
                        continue

                    print('evaluating: ' + str(entry[3]))
                    start = time.time()
                    try:
                        pcmci = PCMCI(
                            dataframe=entry[2],
                            cond_ind_test=cond_ind_test,
                            verbosity=0)
                        pcmci.verbosity = 1
                        results = pcmci.run_pcmci(tau_max=max_lag,
                                                  pc_alpha=alpha_level)
                        time_lapse = round(time.time() - start, 2)

                        q_matrix = pcmci.get_corrected_pvalues(
                            p_matrix=results['p_matrix'],
                            fdr_method='fdr_bh')
                        parents = pcmci.return_significant_parents(
                            pq_matrix=q_matrix,
                            val_matrix=results['val_matrix'],
                            alpha_level=alpha_level)['parents']
                        valid_links = sum(
                            len(found) for found in parents.values())
                    except Exception as exc:
                        # Best effort, as before: report the failure and draw
                        # another dataset. KeyboardInterrupt / SystemExit are
                        # no longer swallowed.
                        # NOTE(review): a dataset that fails on every attempt
                        # is still retried indefinitely if it is the only one
                        # for its key.
                        print('Hoopla! %r' % (exc,))
                        continue

                    # Bookkeeping happens outside the try block so that a
                    # failing CSV write surfaces as an error instead of
                    # duplicating rows and retrying forever.
                    test_results.append([entry[3], entry[0], entry[1],
                                         max_lag, test_name, alpha_level,
                                         valid_links, time_lapse])
                    counts[entry[1]] += 1

                    results_df = pd.DataFrame(
                        test_results,
                        columns=['representation', 'complexity',
                                 'sample_size', 'max_lag', 'test', 'alpha',
                                 'valid_links_at_alpha', 'learning_time'])
                    print('results ready to be saved')
                    results_df.to_csv(
                        'results/performance_sample_sizes.csv',
                        index=False)