def get_significance_and_changes_for_months(months = range(1, 13),
                                            folder_path = "data/streamflows/hydrosheds_euler9"
                                            ):
    """
    Compare the current and the future means over the given months.

    For each member of members.all_current and for its future counterpart
    (members.current2future) the per-year means over `months` are pooled into
    two samples, which are then compared with a two-sample t-test
    (stats.ttest_ind) along the first axis.

    :param months: months forwarded to
        data_select.get_means_over_months_for_each_year
    :param folder_path: folder with the data files; a file is attributed to a
        member when its name starts with the member id (if several files match,
        the last one listed wins)
    :return: (is_sign, change), where
        is_sign - boolean vector of the size n_grid_cells, True means
            significant change (p-value < 0.05) and False not significant,
        change - (future mean - current mean) / current mean * 100, i.e. the
            percentage of change with respect to the current mean. No guard is
            applied against a zero current mean.
    :raises KeyError: if no file in folder_path starts with one of the needed ids
    """

    # The folder content does not depend on the member, so list it only once.
    file_names = os.listdir(folder_path)

    id_to_file_path = {}
    for the_id in members.all_members:
        for file_name in file_names:
            if file_name.startswith(the_id):
                id_to_file_path[the_id] = os.path.join(folder_path, file_name)

    current_means = []
    future_means = []
    for the_id in members.all_current:
        current_file = id_to_file_path[the_id]
        streamflows, times, _, _ = data_select.get_data_from_file(current_file)
        current_mean_dict = data_select.get_means_over_months_for_each_year(times, streamflows, months=months)
        # The order of the yearly means is irrelevant for the independent t-test.
        current_means.extend(current_mean_dict.values())

        future_file = id_to_file_path[members.current2future[the_id]]
        streamflows, times, _, _ = data_select.get_data_from_file(future_file)
        future_mean_dict = data_select.get_means_over_months_for_each_year(times, streamflows, months=months)
        future_means.extend(future_mean_dict.values())

    current_means = np.array(current_means)
    future_means = np.array(future_means)

    print(future_means.shape)

    # Only the p-value is used; index 1 of the result is the p-value.
    p = stats.ttest_ind(current_means, future_means, axis = 0)[1]

    is_sign = p < 0.05 #significance to the 5% confidence level

    # From here on the names hold the means over all pooled yearly values.
    current_means = np.mean(current_means, axis=0)
    future_means = np.mean(future_means, axis=0)

    print(future_means.shape)
    print("%s %s" % ("number of significant points = ", int(np.count_nonzero(is_sign))))
    return is_sign, (future_means - current_means) / current_means * 100.0
def do_bootstrap_for_simulation_mean(sim_id = "aet", folder_path = "data/streamflows/hydrosheds_euler9",
                                     months = range(1, 13), n_samples = 1000):

    """
    Estimate, by bootstrap, the standard deviation of the mean over the
    simulated years for one simulation.

    The result is cached on disk: if the file given by _get_cache_file_path
    exists, the pickled object is returned without any computation.

    :param sim_id: simulation id; the first file in folder_path whose name
        starts with it is used
    :param folder_path: folder with the data files
    :param months: months forwarded to
        data_select.get_means_over_months_for_each_year
    :param n_samples: number of bootstrap samples
    :return: MeansAndDeviation object built from sim_id, the means of the
        yearly means (means_for_domain) and the standard deviations of the
        bootstrapped means (standard_devs_for_domain)
    :raises IOError: if no file in folder_path starts with sim_id
    """
    cache_file = _get_cache_file_path(sim_id=sim_id, months=months)
    if os.path.isfile(cache_file):
        # NOTE(review): the cache is opened in text mode, as it was written;
        # switch both open() calls to binary mode together if this is ported
        # to Python 3.
        with open(cache_file) as cache:
            return pickle.load(cache)


    #determine path to the file with data
    file_path = None
    for file_name in os.listdir(folder_path):
        if file_name.startswith(sim_id):
            file_path = os.path.join(folder_path, file_name)
            break

    if file_path is None:
        raise IOError("No file starting with '%s' found in %s" % (sim_id, folder_path))

    streamflow, times, _, _ = data_select.get_data_from_file(file_path)

    #for each year and for each gridcell get mean value for the period
    means_dict = data_select.get_means_over_months_for_each_year(times, streamflow, months = months)

    # rows are ordered by the sorted keys of means_dict
    data_matrix = np.array([means_dict[key] for key in sorted(means_dict)])
    print("%s %s" % ("data_matrix.shape = ", data_matrix.shape))

    #generate indices
    # randint draws every row index with the same probability; rounding
    # rand() * (n - 1) gave the first and the last row only half the weight
    # of the others.
    n_rows = data_matrix.shape[0]
    index_matrix = np.random.randint(0, n_rows, size=(n_samples, n_rows))

    means_matrix = np.zeros((n_samples, data_matrix.shape[1])) #n_samples x n_points
    for i in range(n_samples):
        # mean of one resample (with replacement) of the rows
        means_matrix[i,:] = np.mean(data_matrix[index_matrix[i,:],:], axis = 0)


    m_holder = MeansAndDeviation(sim_id=sim_id,
                                 means_for_domain=np.mean(data_matrix, axis = 0),
                                 standard_devs_for_domain=np.std(means_matrix, axis = 0))

    with open(cache_file, mode="w") as cache:
        pickle.dump(m_holder, cache)
    return m_holder
def get_std_and_mean_using_bootstrap_for_merged_means(sim_ids = None, folder_path = "data/streamflows/hydrosheds_euler9",
                                     months = range(1, 13), n_samples = 1000):

    """
    Bootstrap the standard deviation of the mean averaged over several
    simulations (members).

    For each member the rows of its matrix of yearly means are resampled with
    replacement n_samples times and averaged; the k-th bootstrapped means of
    all the members are then averaged together, and the standard deviation is
    taken over the n_samples merged means.

    The result is cached on disk: if the file given by _get_cache_file_path
    exists, the pickled pair is returned without any computation.

    :param sim_ids: container of simulation ids; a file is used when the part
        of its name before the first "_" is in sim_ids
    :param folder_path: folder with the data files
    :param months: months forwarded to
        data_select.get_means_over_months_for_each_year
    :param n_samples: number of bootstrap samples
    :return: tuple (std_result, mean_result): the standard deviation of the
        merged bootstrapped means and the mean of the yearly means over all
        the years and all the members
    :raises IOError: if no file in folder_path belongs to sim_ids
    """
    cache_file = _get_cache_file_path(months=months, sim_ids = sim_ids)
    if os.path.isfile(cache_file):
        # NOTE(review): the cache is opened in text mode, as it was written;
        # switch both open() calls to binary mode together if this is ported
        # to Python 3.
        with open(cache_file) as cache:
            std_result, mean_result = pickle.load(cache)
        # The cache stores a list; return a tuple, as after a fresh computation.
        return std_result, mean_result


    #determine paths to the files with data
    file_paths = [os.path.join(folder_path, file_name)
                  for file_name in os.listdir(folder_path)
                  if file_name.split("_")[0] in sim_ids]

    if not file_paths:
        raise IOError("No files for the simulations %s found in %s" % (sim_ids, folder_path))

    real_means = []
    members_boot_means = []
    for file_path in file_paths:
        streamflow, times, _, _ = data_select.get_data_from_file(file_path)

        #for each year and for each gridcell get mean value for the period
        means_dict = data_select.get_means_over_months_for_each_year(times, streamflow, months = months)

        # rows are ordered by the sorted keys of means_dict
        data_matrix = np.array([means_dict[key] for key in sorted(means_dict)])

        real_means.append(data_matrix) #save modelled means, in order to calculate mean of the merged data

        n_rows = data_matrix.shape[0]
        boot_means = []
        for sample_index in range(n_samples):
            #generate indices (resampling of the rows with replacement)
            index_vector = np.random.randint(0, n_rows, n_rows)

            #average the bootstrapped annual means
            boot_means.append( np.mean(data_matrix[index_vector,:], axis = 0) )

        members_boot_means.append( boot_means )

    #take average over members
    members_boot_means = np.array(members_boot_means)
    print(members_boot_means.shape)
    boot_means = members_boot_means.mean(axis = 0) #nsamples x npoints

    # The former debug prints of the columns 499 and 19 were removed: they
    # raised IndexError for domains with fewer grid points.
    assert boot_means.shape[0] == n_samples, boot_means.shape

    print("%s %s" % ("boot_means.shape = ", boot_means.shape))
    std_result = np.std(boot_means, axis = 0)
    mean_result = np.array(real_means).mean(axis = 0).mean(axis = 0)
    with open(cache_file, mode="w") as cache:
        pickle.dump([std_result, mean_result], cache)
    return std_result, mean_result