Example no. 1
import csv

val_matrix = results['val_matrix']
with open("val-values.csv", "w") as csv_file:
    writer = csv.writer(csv_file,
                        delimiter=",",
                        quotechar="|",
                        quoting=csv.QUOTE_MINIMAL)
    # val_matrix is nested three levels deep; loop over the outer two and let writerow handle the innermost.
    for sector in val_matrix:
        print("sector: ", sector)
        for row in sector:
            print("row: ", row)
            writer.writerow(row)
        writer.writerow([])

q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                       fdr_method='fdr_bh')
pcmci.print_significant_links(p_matrix=results['p_matrix'],
                              q_matrix=q_matrix,
                              val_matrix=results['val_matrix'],
                              alpha_level=0.01)

link_matrix = pcmci.return_significant_parents(
    pq_matrix=q_matrix, val_matrix=results['val_matrix'],
    alpha_level=0.01)['link_matrix']

graph = tp.plot_graph(
    val_matrix=results['val_matrix'],
    link_matrix=link_matrix,
    var_names=headers,
    link_colorbar_label='cross-MCI',
    node_colorbar_label='auto-MCI',
    )
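
# The snippet above assumes an existing PCMCI run providing `results`, `pcmci`,
# `tp` (tigramite.plotting) and `headers`. A minimal sketch of that setup, using
# the same (older) tigramite 4 API as the example; the variable names and random
# data below are illustrative placeholders, not part of the original code:
import numpy as np
from tigramite import data_processing as pp
from tigramite import plotting as tp
from tigramite.pcmci import PCMCI
from tigramite.independence_tests import ParCorr

headers = ['X0', 'X1', 'X2']                        # placeholder variable names
data = np.random.randn(500, len(headers))           # placeholder (T, N) array
dataframe = pp.DataFrame(data, var_names=headers)
pcmci = PCMCI(dataframe=dataframe, cond_ind_test=ParCorr())
results = pcmci.run_pcmci(tau_max=3, pc_alpha=0.05)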
Example no. 2
def run_PCMCI(ex, outdic_actors, s, df_splits, map_proj):
    #=====================================================================================
    #
    # 4) PCMCI-algorithm
    #
    #=====================================================================================

    # save output
    if ex['SaveTF']:
        #        from contextlib import redirect_stdout
        orig_stdout = sys.stdout
        # buffer print statement output to f
        if sys.version[:1] == '3':
            sys.stdout = f = io.StringIO()
        elif sys.version[:1] == '2':
            sys.stdout = f = open(os.path.join(ex['fig_subpath'], 'old.txt'),
                                  'w+')

#%%
# amount of text printed:
    verbosity = 3

    # alpha level for independence test within the pc procedure (finding parents)
    pc_alpha = ex['pcA_sets'][ex['pcA_set']]
    # alpha level for the multiple linear regression model while conditioning on parents of
    # parents
    alpha_level = ex['alpha_level_tig']
    print('run tigramite 4, run.pcmci')
    print(('alpha level(s) for independence tests within the pc procedure '
           '(finding parents): {}'.format(pc_alpha)))
    print((
        'alpha level for multiple linear regression model while conditioning on parents of '
        'parents: {}'.format(ex['alpha_level_tig'])))

    # Retrieve traintest info
    traintest = df_splits

    # load Response Variable class
    RV = ex[ex['RV_name']]
    # create a list with all actors; these will be merged into the fulldata array
    allvar = ex['vars'][0]
    var_names_corr = []
    actorlist = []
    cols = [[RV.name]]

    for var in allvar[:]:
        print(var)
        actor = outdic_actors[var]
        if actor.ts_corr[s].size != 0:
            ts_train = actor.ts_corr[s].values
            actorlist.append(ts_train)
            # create array which numbers the regions
            var_idx = allvar.index(var)
            n_regions = actor.ts_corr[s].shape[1]
            actor.var_info = [[i + 1, actor.ts_corr[s].columns[i], var_idx]
                              for i in range(n_regions)]
            # Array of corresponding regions for var_names_corr (first entry is the RV)
            var_names_corr = var_names_corr + actor.var_info
            cols.append(list(actor.ts_corr[s].columns))
            index_dates = actor.ts_corr[s].index
    var_names_corr.insert(0, RV.name)

    # stack actor time-series together:
    fulldata = np.concatenate(tuple(actorlist), axis=1)

    print(('There are {} regions in total'.format(fulldata.shape[1])))
    # add the full 1D time series of interest as first entry:

    fulldata = np.column_stack((RV.RVfullts, fulldata))
    df_data = pd.DataFrame(fulldata, columns=flatten(cols), index=index_dates)

    if ex['import_prec_ts']:
        var_names_full = var_names_corr.copy()
        for d in ex['precursor_ts']:
            path_data = d[1]
            if len(path_data) > 1:
                path_data = ''.join(list(path_data))
            # skip first col because it is the RV ts
            df_data_ext = func_fc.load_hdf5(
                path_data)['df_data'].iloc[:, 1:].loc[s]
            cols_ts = np.logical_or(df_data_ext.dtypes == 'float64',
                                    df_data_ext.dtypes == 'float32')
            cols_ext = list(df_data_ext.columns[cols_ts])
            # cols_ext must be of format '{}_{int}_{}'
            lab_int = 100
            for i, c in enumerate(cols_ext):
                char = c.split('_')[1]
                if char.isdigit():
                    pass
                else:
                    cols_ext[i] = c.replace(char, str(lab_int)) + char
                    lab_int += 1

            df_data_ext = df_data_ext[cols_ext]
            to_freq = ex['tfreq']
            if to_freq != 1:
                start_end_date = (ex['sstartdate'], ex['senddate'])
                start_end_year = (ex['startyear'], ex['endyear'])
                df_data_ext = functions_pp.time_mean_bins(df_data_ext,
                                                          to_freq,
                                                          start_end_date,
                                                          start_end_year,
                                                          seldays='part')[0]
            #            df_data_ext = functions_pp.time_mean_bins(df_data_ext,
            #                                                     ex, ex['tfreq'],
            #                                                     seldays='part')[0]
            # Expand var_names_corr
            n = var_names_full[-1][0] + 1
            add_n = n + len(cols_ext)
            n_var_idx = var_names_full[-1][-1] + 1
            for i in range(n, add_n):
                var_names_full.append([i, cols_ext[i - n], n_var_idx])
            df_data = df_data.merge(df_data_ext,
                                    left_index=True,
                                    right_index=True)
    else:
        var_names_full = var_names_corr

    bool_train = traintest.loc[s]['TrainIsTrue']
    bool_RV_train = np.logical_and(bool_train, traintest.loc[s]['RV_mask'])
    dates_train = traintest.loc[s]['TrainIsTrue'][bool_train].index
    dates_RV_train = traintest.loc[s]['TrainIsTrue'][bool_RV_train].index

    RVfull_train = RV.RVfullts.sel(time=dates_train)
    datesfull_train = pd.to_datetime(RVfull_train.time.values)
    data = df_data.loc[datesfull_train].values
    print((data.shape))

    # get RV datamask (same shape as data)
    data_mask = [d in dates_RV_train for d in datesfull_train]
    data_mask = np.repeat(data_mask, data.shape[1]).reshape(data.shape)

    # add traintest mask to fulldata
    #    dates_all = pd.to_datetime(RV.RVfullts.index)
    #    dates_RV  = pd.to_datetime(RV.RV_ts.index)
    dates_all = pd.to_datetime(RV.RVfullts.time.values)
    dates_RV = pd.to_datetime(RV.RV_ts.time.values)
    df_data['TrainIsTrue'] = [d in datesfull_train for d in dates_all]
    df_data['RV_mask'] = [d in dates_RV for d in dates_all]

    # ======================================================================================================================
    # tigramite 4
    # ======================================================================================================================

    T, N = data.shape  # Time, Regions
    # ======================================================================================================================
    # Initialize dataframe object (needed for tigramite functions)
    # ======================================================================================================================
    dataframe = pp.DataFrame(data=data,
                             mask=data_mask,
                             var_names=var_names_full)
    # ======================================================================================================================
    # pc algorithm: only parents for selected_variables are calculated
    # ======================================================================================================================

    parcorr = ParCorr(significance='analytic',
                      mask_type='y',
                      verbosity=verbosity)
    #==========================================================================
    # multiple testing problem:
    #==========================================================================
    pcmci = PCMCI(dataframe=dataframe,
                  cond_ind_test=parcorr,
                  selected_variables=None,
                  verbosity=4)

    # selected_variables : list of integers, optional (default: range(N))
    #    Specify to estimate parents only for selected variables. If None is
    #    passed, parents are estimated for all variables.

    # ======================================================================================================================
    #selected_links = dictionary/None
    results = pcmci.run_pcmci(tau_max=ex['tigr_tau_max'],
                              pc_alpha=pc_alpha,
                              tau_min=0,
                              max_combinations=ex['max_comb_actors'])

    q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                           fdr_method='fdr_bh')

    pcmci.print_significant_links(p_matrix=results['p_matrix'],
                                  q_matrix=q_matrix,
                                  val_matrix=results['val_matrix'],
                                  alpha_level=alpha_level)

    # returns all parents, not just causal precursors (of lag>0)
    sig = rgcpd.return_sign_parents(pcmci,
                                    pq_matrix=q_matrix,
                                    val_matrix=results['val_matrix'],
                                    alpha_level=alpha_level)

    all_parents = sig['parents']
    #    link_matrix = sig['link_matrix']

    links_RV = all_parents[0]

    df = rgcpd.bookkeeping_precursors(links_RV, var_names_full)
    #%%

    rgcpd.print_particular_region_new(links_RV, var_names_corr, s,
                                      outdic_actors, map_proj, ex)

    #%%
    if ex['SaveTF']:
        if sys.version[:1] == '3':
            fname = f's{s}_' + ex['params'] + '.txt'
            file = io.open(os.path.join(ex['fig_subpath'], fname), mode='w+')
            file.write(f.getvalue())
            file.close()
            f.close()
        elif sys.version[:1] == '2':
            f.close()
        sys.stdout = orig_stdout

    return df, df_data
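
# run_PCMCI reads its configuration from the `ex` dictionary. The keys below are
# the ones the function body accesses; every value is an illustrative placeholder,
# not a setting from the original project. In addition, ex[ex['RV_name']] must
# hold the response-variable object (with RVfullts and RV_ts attributes).
ex = {'SaveTF': False,                                  # capture printed output to a file
      'fig_subpath': './figures', 'params': 'example_run',
      'pcA_sets': {'pcA_none': None}, 'pcA_set': 'pcA_none',   # pc_alpha lookup
      'alpha_level_tig': 0.05,                          # MCI significance level
      'tigr_tau_max': 1, 'max_comb_actors': 1,
      'RV_name': 'RV', 'vars': [['sst', 'z500']],
      'import_prec_ts': False, 'precursor_ts': [],
      'tfreq': 1, 'sstartdate': '06-01', 'senddate': '08-31',
      'startyear': 1979, 'endyear': 2018}
# df, df_data = run_PCMCI(ex, outdic_actors, s, df_splits, map_proj)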
Example no. 3
T, N = data.shape

# Initialize dataframe object
dataframe = pp.DataFrame(data)

#%%

rcot = RCOT2(significance=parameters['cond_ind_test.significance'],
             num_f=parameters['cond_ind_test.num_f'])
pcmci = PCMCI(dataframe,
              cond_ind_test=rcot,
              selected_variables=parameters['selected_variables'],
              var_names=parameters['var_names'],
              verbosity=10)
              
q_matrix = pcmci.get_corrected_pvalues(p_matrix=p_matrix, fdr_method='fdr_bh')
q_matrix_tsbh = pcmci.get_corrected_pvalues(p_matrix=p_matrix, fdr_method='fdr_tsbh')

#%% print results
pcmci._print_significant_links(p_matrix=p_matrix,
                               q_matrix=q_matrix,
                               val_matrix=val_matrix,
                               alpha_level=0.1)


#%% get selected parents and fit linear model
q_0 = 0.05

parameters['q_0'] = q_0
parameters['q_matrix'] = q_matrix
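
#%% sketch: get selected parents and fit linear model
# The cell above stores q_0 and q_matrix, but the example stops before the step
# its comment announces. A minimal sketch of that step, assuming the pcmci and
# dataframe objects from above, the val_matrix of the run that produced p_matrix,
# and tigramite's LinearMediation as the linear model; tau_max below is an
# illustrative value, not taken from the original code.
from tigramite.models import LinearMediation

all_parents = pcmci.return_significant_parents(pq_matrix=q_matrix,
                                               val_matrix=val_matrix,
                                               alpha_level=q_0)['parents']

med = LinearMediation(dataframe=dataframe)
med.fit_model(all_parents=all_parents, tau_max=4)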
def test(dataframes, max_lags=[4], alpha=[None], tests=['ParCorr'], limit=1):
    '''Perform the PCMCI algorithm for all the dataframes received as parameters,
        given the hyper-parameters of the conditional independence test
        (a usage sketch follows the function definition).
    Args:
        dataframes: A list of TIGRAMITE dataframes
        max_lags: A list of maximum lags to consider for the lagged time series
        alpha: A list of significance levels for the parent test
        tests: A list of conditional independence tests to be performed
        limit: A limit for the instances to be considered

    Returns:
        None; the accumulated results are written to results/performance_sample_sizes.csv
    '''
    test_results = []
    random.shuffle(dataframes)
    total = limit*len(max_lags)*len(alpha)*len(tests)
    data_frame_iter = iter(dataframes)

    tests_to_evaluate=[]
    if 'RCOT' in tests:
        rcot = RCOT()
        tests_to_evaluate.append(['RCOT',rcot])
    if 'GPDC' in tests:
        gpdc = GPDC()
        tests_to_evaluate.append(['GPDC', gpdc])
    if 'ParCorr' in tests:
        parcorr = ParCorr(significance='analytic')
        tests_to_evaluate.append(['ParCorr',parcorr])
    if 'CMIknn' in tests:
        cmiknn = CMIknn()
        tests_to_evaluate.append(['CMIknn',cmiknn])


    unique_complexities = list(set(l[1] for l in dataframes))
    counts = {}
    for i in unique_complexities:
        counts[i] = 0

    for test in tests_to_evaluate:
        stop = False
        for l in max_lags:
            for a in alpha:
                while not stop:
                    try:
                        i = random.sample(dataframes,1)[0]
                        if counts[i[1]] < limit:
                            print('evaluating: ' + str(i[3]))
                            start = time.time()
                            pcmci = PCMCI(
                                    dataframe=i[2],
                                    cond_ind_test=test[1],
                                    verbosity=0)
                             # correlations = pcmci.get_lagged_dependencies(tau_max=20)
                            pcmci.verbosity = 1
                            results = pcmci.run_pcmci(tau_max=l, pc_alpha=a)
                            time_lapse = round(time.time() - start, 2)

                            q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'], fdr_method='fdr_bh')
                            valid_parents = list(pcmci.return_significant_parents(pq_matrix=q_matrix,
                                                                                  val_matrix=results['val_matrix'],
                                                                                  alpha_level=a)['parents'].values())

                            flat_list = []
                            for sublist in valid_parents:
                                for item in sublist:
                                    flat_list.append(item)

                            valid_links = len(flat_list)

                            test_results.append([i[3], i[0], i[1], l,test[0],a,valid_links,time_lapse])

                            results_df = pd.DataFrame(
                                test_results,
                                columns=['representation', 'complexity',
                                         'sample_size', 'max_lag', 'test',
                                         'alpha', 'valid_links_at_alpha',
                                         'learning_time'])
                            print('results ready to be saved')
                            results_df.to_csv(
                                        'results/performance_sample_sizes.csv',
                                        index=False)

                            counts[i[1]] += 1
                            if all(value == limit for value in counts.values()):
                                stop = True

                    except Exception as e:
                        print('Hoopla!', e)

                for i in unique_complexities:
                    counts[i] = 0
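
# A hypothetical call, inferred from how test() indexes each entry of
# `dataframes` (i[0]=complexity, i[1]=sample size, i[2]=TIGRAMITE dataframe,
# i[3]=name); the data is illustrative, and a results/ directory must exist for
# the CSV written inside the loop.
import numpy as np
from tigramite import data_processing as pp

toy_data = np.random.randn(250, 3)
dataframes = [(1, 250, pp.DataFrame(toy_data), 'toy_linear_250')]
test(dataframes, max_lags=[4], alpha=[0.05], tests=['ParCorr'], limit=1)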
Example no. 5
def run_pcmci(data, data_mask, var_names, path_outsub2, s, tau_min=0, tau_max=1, 
              pc_alpha=None, alpha_level=0.05, max_conds_dim=4, max_combinations=1, 
              max_conds_py=None, max_conds_px=None, verbosity=4):
    

    
    #%%
    if path_outsub2 is not False:
        txt_fname = os.path.join(path_outsub2, f'split_{s}_PCMCI_out.txt')
#        from contextlib import redirect_stdout
        orig_stdout = sys.stdout
        # buffer print statement output to f
        sys.stdout = f = io.StringIO()
    #%%            
    # ======================================================================================================================
    # tigramite 4
    # ======================================================================================================================

    T, N = data.shape # Time, Regions
    # ======================================================================================================================
    # Initialize dataframe object (needed for tigramite functions)
    # ======================================================================================================================
    dataframe = pp.DataFrame(data=data, mask=data_mask, var_names=var_names)
    # ======================================================================================================================
    # pc algorithm: only parents for selected_variables are calculated
    # ======================================================================================================================

    parcorr = ParCorr(significance='analytic',
                      mask_type='y',
                      verbosity=verbosity)
    #==========================================================================
    # multiple testing problem:
    #==========================================================================
    pcmci   = PCMCI(dataframe=dataframe,
                    cond_ind_test=parcorr,
                    selected_variables=None,
                    verbosity=verbosity)

    # selected_variables : list of integers, optional (default: range(N))
    #    Specify to estimate parents only for selected variables. If None is
    #    passed, parents are estimated for all variables.

    # ======================================================================================================================
    #selected_links = dictionary/None
    results = pcmci.run_pcmci(tau_max=tau_max, pc_alpha=pc_alpha, tau_min=tau_min,
                              max_conds_dim=max_conds_dim, 
                              max_combinations=max_combinations,
                              max_conds_px=max_conds_px,
                              max_conds_py=max_conds_py)

    q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'], fdr_method='fdr_bh')

    pcmci.print_significant_links(p_matrix=results['p_matrix'],
                                  q_matrix=q_matrix,
                                  val_matrix=results['val_matrix'],
                                  alpha_level=alpha_level)
    #%%
    if path_outsub2 is not False:
        file = io.open(txt_fname, mode='w+')
        file.write(f.getvalue())
        file.close()
        f.close()

        sys.stdout = orig_stdout


    return pcmci, q_matrix, results
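
# A minimal, hypothetical call; the arrays and names below are placeholders for
# the data, mask and variable names built elsewhere in the pipeline. Passing
# path_outsub2=False skips writing the captured PCMCI output to a text file.
import numpy as np

var_names = ['RV', 'prec1', 'prec2']                  # placeholder names
data = np.random.randn(400, len(var_names))           # placeholder (T, N) array
data_mask = np.zeros(data.shape, dtype=bool)          # placeholder mask, same shape as data
pcmci, q_matrix, results = run_pcmci(data, data_mask, var_names,
                                     path_outsub2=False, s=0,
                                     tau_max=1, pc_alpha=None, alpha_level=0.05)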
Example no. 6
def caus_gpdc(data, var_names):
    import numpy as np
    import matplotlib as mpl
    from matplotlib import pyplot as plt
    import sklearn

    import tigramite
    from tigramite import data_processing as pp
    from tigramite import plotting as tp
    from tigramite.pcmci import PCMCI
    from tigramite.independence_tests import ParCorr, GPDC, CMIknn, CMIsymb
    from tigramite.models import LinearMediation, Prediction

    data_mask_row = np.zeros(len(data))
    for i in range(len(data)):          # 68904 time steps in the original data
        if (i % 72) < 30 or (i % 72) > 47:
            data_mask_row[i] = True
    data_mask = np.zeros(data.shape)

    data_mask[:, 0] = data_mask_row
    data_mask[:, 1] = data_mask_row
    data_mask[:, 2] = data_mask_row
    data_mask[:, 9] = data_mask_row
    data_mask[:, 10] = data_mask_row
    data_mask[:, 11] = data_mask_row

    dataframe = pp.DataFrame(data, mask=data_mask)
    datatime = np.arange(len(data))

    # tp.plot_timeseries(data, datatime, var_names, use_mask=True,
    #                    mask=data_mask, grey_masked_samples='data')

    gpdc = GPDC(significance='analytic',
                gp_params=None,
                use_mask=True,
                mask_type='y')
    gpdc.generate_and_save_nulldists(sample_sizes=range(495, 501),
                                     null_dist_filename='dc_nulldists.npz')
    gpdc.null_dist_filename = 'dc_nulldists.npz'
    pcmci_gpdc = PCMCI(dataframe=dataframe,
                       cond_ind_test=gpdc,
                       var_names=var_names,
                       verbosity=1)

    # correlations = pcmci.get_lagged_dependencies(tau_max=20)
    # lag_func_matrix = tp.plot_lagfuncs(val_matrix=correlations,
    #                                    setup_args={'var_names':var_names,
    #                                    'x_base':5, 'y_base':.5})

    results = pcmci_gpdc.run_pcmci(tau_max=6, tau_min=1, pc_alpha=0.01)

    # print("p-values")
    # print (results['p_matrix'].round(3))
    # print("MCI partial correlations")
    # print (results['val_matrix'].round(2))

    q_matrix = pcmci_gpdc.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                                fdr_method='fdr_bh')
    pcmci_gpdc._print_significant_links(p_matrix=results['p_matrix'],
                                        q_matrix=q_matrix,
                                        val_matrix=results['val_matrix'],
                                        alpha_level=0.01)

    link_matrix = pcmci_gpdc._return_significant_parents(
        pq_matrix=q_matrix, val_matrix=results['val_matrix'],
        alpha_level=0.01)['link_matrix']

    tp.plot_time_series_graph(
        val_matrix=results['val_matrix'],
        link_matrix=link_matrix,
        var_names=var_names,
        link_colorbar_label='MCI',
    )
    return results, link_matrix
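
# Hypothetical usage of caus_gpdc; the array shape and variable names are
# placeholders. The mask built inside the function flags every time step outside
# positions 30-47 of each 72-step cycle, for columns 0-2 and 9-11 only, and the
# call is slow because GPDC null distributions are generated for sample sizes
# 495-500 before the PCMCI run.
import numpy as np

var_names = ['var%d' % i for i in range(12)]          # at least 12 columns expected
data = np.random.randn(68904, len(var_names))         # placeholder (T, N) array
results, link_matrix = caus_gpdc(data, var_names)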