Example #1
def find_pvalue_violation_indices_continuous(n, U, S, R, max_pvalue,
                                             max_pvalue_policy):
    pvalue_violation_indices = []

    if max_pvalue_policy == "all":
        for i in range(n - 1):
            for t in range(i + 1):
                u = U[i][t]
                s = S[i][t]
                r = R[i][t]
                for j in range(i + 1, n):
                    for k in range(i + 1, j + 1):
                        u2 = U[j][k]
                        s2 = S[j][k]
                        r2 = R[j][k]
                        if stats.ttest_ind_from_stats(u, s, r, u2, s2, r2,
                                                      False)[1] > max_pvalue:
                            pvalue_violation_indices.append(([i, t], [j, k]))

    elif max_pvalue_policy == "consecutive":
        for i in range(n - 1):
            for k in range(i + 1):
                u = U[i][k]
                s = S[i][k]
                r = R[i][k]
                for j in range(i + 1, n):
                    u2 = U[j][i + 1]
                    s2 = S[j][i + 1]
                    r2 = R[j][i + 1]
                    if stats.ttest_ind_from_stats(u, s, r, u2, s2, r2,
                                                  False)[1] > max_pvalue:
                        pvalue_violation_indices.append(([i, k], [j, i + 1]))

    return pvalue_violation_indices
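A minimal usage sketch of the routine above. From the way u, s and r are passed to ttest_ind_from_stats, U, S and R appear to hold per-bin means, standard deviations and record counts in triangular lists (row i has entries 0..i); the data below is made up.

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
n = 4
U = [[rng.normal(0.2 * i, 0.05) for _ in range(i + 1)] for i in range(n)]  # means
S = [[0.1] * (i + 1) for i in range(n)]                                    # stds
R = [[50] * (i + 1) for i in range(n)]                                     # record counts

violations = find_pvalue_violation_indices_continuous(
    n, U, S, R, max_pvalue=0.05, max_pvalue_policy="consecutive")
print(violations)   # pairs of bin indices whose p-value exceeds max_pvalue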
Example #2
def t_test(group1, group2):
    mean1 = np.mean(group1)
    mean2 = np.mean(group2)
    std1 = np.std(group1)
    std2 = np.std(group2)
    nobs1 = len(group1)
    nobs2 = len(group2)

    modified_std1 = np.sqrt(np.float32(nobs1) / np.float32(nobs1 - 1)) * std1
    modified_std2 = np.sqrt(np.float32(nobs2) / np.float32(nobs2 - 1)) * std2
    # F-test on the sample variances (equal-variance check)
    f1 = np.square(modified_std1) / np.square(modified_std2)
    fp = 1 - f.cdf(f1, nobs1 - 1, nobs2 - 1)
    if fp > 0.05:
        (statistic, pvalue) = stats.ttest_ind_from_stats(mean1=mean1,
                                                         std1=modified_std1,
                                                         nobs1=nobs1,
                                                         mean2=mean2,
                                                         std2=modified_std2,
                                                         nobs2=nobs2,
                                                         equal_var=True)
    else:
        (statistic, pvalue) = stats.ttest_ind_from_stats(mean1=mean1,
                                                         std1=modified_std1,
                                                         nobs1=nobs1,
                                                         mean2=mean2,
                                                         std2=modified_std2,
                                                         nobs2=nobs2,
                                                         equal_var=False)
    return [mean1, std1, mean2, std2, fp, statistic, pvalue]
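A minimal usage sketch of t_test() above (the sample data and import aliases are assumptions, not from the original source): the preliminary F-test on the sample variances selects between the pooled and Welch variants of ttest_ind_from_stats.

import numpy as np
from scipy import stats
from scipy.stats import f

group1 = [4.1, 3.9, 4.4, 4.0, 4.2, 3.8]   # made-up measurements
group2 = [3.5, 3.7, 3.6, 3.9, 3.4, 3.8]

mean1, std1, mean2, std2, fp, statistic, pvalue = t_test(group1, group2)
variant = "pooled" if fp > 0.05 else "Welch"
print("F-test p = %.3f -> %s t-test, p = %.3f" % (fp, variant, pvalue))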
Example #3
def compute_statistics(dataframe, n_iterations, run_test=True, csv_out='comparison_models.csv'):
    """Compares the performance of models at inference time on a common testing dataset using paired t-tests.

    It uses a dataframe generated by ``scripts/automate_training.py`` with the parameter ``--run-test`` (used to run the
    models on the testing dataset). It outputs dataframes that store the different statistics (average, std and p-value
    between runs). All can be combined and stored in a csv.

    .. csv-table:: Example of dataframe
       :file: ../../images/df_compare.csv

    Usage example::

        ivadomed_compare_models -df results.csv -n 2 --run_test

    Args:
        dataframe (pandas.Dataframe): Dataframe of results generated by automate_training. Flag: ``--dataframe``, ``-df``
        n_iterations (int): Indicates the number of times that each experiment (i.e. set of parameters) was run.
                            Flag: ``--n_iteration``, ``-n``
        run_test (bool): Indicates whether the comparison is done on the performance on the testing subdataset (True)
            or on the training/validation subdatasets (False). Flag: ``--run_test``
        csv_out (string): Output csv name to store computed values (e.g., df.csv). Default: comparison_models.csv. Flag ``-o``, ``--output``
    """
    avg = dataframe.groupby(['path_output']).mean()
    std = dataframe.groupby(['path_output']).std()

    print("Average dataframe")
    print(avg)
    print("Standard deviation dataframe")
    print(std)

    config_logs = list(avg.index.values)
    p_values = np.zeros((len(config_logs), len(config_logs)))
    i, j = 0, 0
    for confA in config_logs:
        j = 0
        for confB in config_logs:
            if run_test:
                p_values[i, j] = ttest_ind_from_stats(mean1=avg.loc[confA]["test_dice"],
                                                      std1=std.loc[confA]["test_dice"],
                                                      nobs1=n_iterations, mean2=avg.loc[confB]["test_dice"],
                                                      std2=std.loc[confB]["test_dice"], nobs2=n_iterations).pvalue
            else:
                p_values[i, j] = ttest_ind_from_stats(mean1=avg.loc[confA]["best_validation_dice"],
                                                      std1=std.loc[confA]["best_validation_dice"],
                                                      nobs1=n_iterations, mean2=avg.loc[confB]["best_validation_dice"],
                                                      std2=std.loc[confB]["best_validation_dice"],
                                                      nobs2=n_iterations).pvalue
            j += 1
        i += 1

    p_df = pd.DataFrame(p_values, index=config_logs, columns=config_logs)
    print("P-values dataframe")
    print(p_df)
    if csv_out is not None:
        # The 'Unnamed: 0' column corresponds to the run number, so we remove it and add prefixes for readability
        df_concat = pd.concat([avg.add_prefix('avg_').drop(['avg_Unnamed: 0'], axis=1),
                               std.add_prefix('std_').drop(['std_Unnamed: 0'], axis=1), p_df.add_prefix('p-value_')],
                              axis=1)
        df_concat.to_csv(csv_out)
Example #4
def gradeStats(coursetitle, dataframe):
    index = dataframe.columns.values.tolist().index(coursetitle)
    
    green_total = 0
    gold_total = 0
    black_total = 0

    green_count = 0
    gold_count = 0
    black_count = 0

    green_values = []
    gold_values = []
    black_values = []

    for row in grid:
        # NaN check (NaN != NaN skips missing grades);
        # add "and row[index] != 0" to exclude pass/fail grades
        if row[index] == row[index]:
            if row[0] == 'green':
                green_total += row[index]
                green_count += 1
                green_values += [row[index]]
            elif row[0] == 'gold':
                gold_total += row[index]
                gold_count += 1
                gold_values += [row[index]]
            elif row[0] == 'black':
                black_total += row[index]
                black_count += 1
                black_values += [row[index]]

    try: green_avg = green_total / green_count
    except ZeroDivisionError: green_avg = 0
    try: gold_avg = gold_total / gold_count
    except ZeroDivisionError: gold_avg = 0
    try: black_avg = black_total / black_count
    except ZeroDivisionError: black_avg = 0
    
    green_stdev = np.std(green_values)
    gold_stdev = np.std(gold_values)
    black_stdev = np.std(black_values)

    green_gold_ttest = stats.ttest_ind_from_stats(green_avg, green_stdev, green_count, gold_avg, gold_stdev, gold_count)
    green_black_ttest = stats.ttest_ind_from_stats(green_avg, green_stdev, green_count, black_avg, black_stdev, black_count)
    gold_black_ttest = stats.ttest_ind_from_stats(gold_avg, gold_stdev, gold_count, black_avg, black_stdev, black_count)

    #print('green average grade in %sis ' % coursetitle, green_avg)
    #print('gold  average grade in %sis ' % coursetitle, gold_avg)
    #print('black average grade in %sis ' % coursetitle, black_avg)
    print('green stdev in %s is ' % coursetitle, green_stdev)
    print('gold  stdev in %s is ' % coursetitle, gold_stdev)
    print('black stdev in %s is ' % coursetitle, black_stdev)
    print()
    print('green/gold has p = ', green_gold_ttest[1])
    print('green/black has p = ', green_black_ttest[1])
    print('gold/black has p = ', gold_black_ttest[1])
    print()
Example #5
def calculate_stats():
    positive_stats = ps = pd.read_csv('data/positive.csv').drop(['Unnamed: 0'],
                                                                axis=1)
    negative_stats = ns = pd.read_csv('data/negative.csv').drop(['Unnamed: 0'],
                                                                axis=1)
    neither_stats = nes = pd.read_csv('data/neither.csv').drop(['Unnamed: 0'],
                                                               axis=1)

    print()
    names = [
        'num_retweets', 'num_retweets_norm', 'num_faves', 'num_faves_norm',
        'steps', 'followers', 'sentiment'
    ]
    # print('-------------------------Mean:-------------------------')
    # for i in range(0, 6):
    #     print('###', names[i], '###')
    #     print('Positive:', np.mean((np.array(positive_stats)[:, i]).astype(np.float)))
    #     print('Negative:', np.mean((np.array(negative_stats)[:, i]).astype(np.float)))
    #     print('Neutral', np.mean((np.array(neither_stats)[:, i]).astype(np.float)))
    #
    # print('-------------------------Total:-------------------------')
    # for i in range(0, 6):
    #     print('###', names[i], '###')
    #     print('Positive:', np.sum((np.array(positive_stats)[:, i]).astype(np.float)))
    #     print('Negative:', np.sum((np.array(negative_stats)[:, i]).astype(np.float)))
    #     print('Neutral', np.sum((np.array(neither_stats)[:, i]).astype(np.float)))

    # print('-------------------------Variance:-------------------------')
    # for i in range(0, 6):
    #     print('###', names[i], '###')
    #     print('Positive:', np.var((np.array(positive_stats)[:, i]).astype(np.float)))
    #     print('Negative:', np.var((np.array(negative_stats)[:, i]).astype(np.float)))
    #     print('Neutral', np.var((np.array(neither_stats)[:, i]).astype(np.float)))

    print('-------------------------T - test:-------------------------')
    for i in range(0, 6):
        print('###', names[i], '###')

        m1 = np.mean((np.array(positive_stats)[:, i]).astype(float))
        std1 = np.std((np.array(positive_stats)[:, i]).astype(float))
        num1 = len(np.array(positive_stats))

        m2 = np.mean((np.array(negative_stats)[:, i]).astype(float))
        std2 = np.std((np.array(negative_stats)[:, i]).astype(float))
        num2 = len(np.array(negative_stats))

        m3 = np.mean((np.array(neither_stats)[:, i]).astype(float))
        std3 = np.std((np.array(neither_stats)[:, i]).astype(float))
        num3 = len(np.array(neither_stats))

        print('1', ss.ttest_ind_from_stats(m1, std1, num1, m2, std2, num2))
        print('2', ss.ttest_ind_from_stats(m1, std1, num1, m3, std3, num3))
        print('3', ss.ttest_ind_from_stats(m2, std2, num2, m3, std3, num3))
Example #6
def extract_poly_values(site, sensor, folder, all_touched=True, print_p=False):
    from scipy.stats import ttest_ind_from_stats
    # Extract values within plots from images
    # test cultural vs natural
    viFolder = os.path.join('/Volumes/RASMUS_1/Satellite/remains_sites',site,sensor,folder)
    imgList = glob.glob(viFolder + '/*.tif')

    results_natural = {}
    results_cultural = {}

    for img in imgList:
        # calculate natural background
        inShpNat = '/Volumes/RASMUS_1/Satellite/analysis/shapefiles/'+site+'_natural_'+sensor+'_poly.shp'
        natural = extract_from_img(inShpNat, img, band=1, all_touched=all_touched)
        # calculate cultural
        inShpCul = '/Volumes/RASMUS_1/Satellite/analysis/shapefiles/'+site+'_cultural_'+sensor+'_poly.shp'
        cultural = extract_from_img(inShpCul, img, band=1, all_touched=all_touched)

        imgSplit = img.split('_')
        vi = imgSplit[len(imgSplit)-1][:-4]
        results_natural[vi] = natural[0]
        results_cultural[vi] = cultural[0]
    pList = []

    for key in results_natural:
        if print_p:
            print(key)
        t, p = ttest_ind_from_stats(results_natural[key]['mean'], results_natural[key]['std'],
                                          results_natural[key]['count'], results_cultural[key]['mean'],
                                          results_cultural[key]['std'], results_cultural[key]['count'], equal_var=False)
        pList.append(p)
    return results_natural,results_cultural,pList
Example #7
def _is_significant_slice(slice_metric: float, slice_std_dev: float,
                          slice_weight: float, base_metric: float,
                          base_std_dev: float, base_weight: float,
                          comparison_type: Text,
                          alpha: float) -> Tuple[bool, float]:
    """Perform statistical significance testing."""
    assert base_std_dev > 0, ('base_std_dev must be positive, but got '
                              '{}.'.format(base_std_dev))
    assert slice_std_dev > 0, ('slice_std_dev must be positive, but got '
                               '{}.'.format(slice_std_dev))
    assert base_weight > 1, ('base_weight must be greater than 1, but got '
                             '{}.'.format(base_weight))
    assert slice_weight > 1, ('slice_weight must be greater than 1, but got '
                              '{}.'.format(slice_weight))

    try:
        _, p_value_two_sided = stats.ttest_ind_from_stats(slice_metric,
                                                          slice_std_dev,
                                                          slice_weight,
                                                          base_metric,
                                                          base_std_dev,
                                                          base_weight,
                                                          equal_var=False)
    except ZeroDivisionError:
        raise ZeroDivisionError(
            'invalid ttest for params: slice_metric={}, '
            'slice_std_dev={}, slice_weight={}, '
            'base_metric={}, base_std_dev={}, base_weight={}, '.format(
                slice_metric, slice_std_dev, slice_weight, base_metric,
                base_std_dev, base_weight))

    metric_diff = slice_metric - base_metric
    one_sided_p_value = _two_sided_to_one_sided_pvalue(
        p_value_two_sided, metric_diff, comparison_type=comparison_type)
    return (one_sided_p_value < alpha, one_sided_p_value)
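The helper _two_sided_to_one_sided_pvalue is not shown above. A plausible sketch is given below; it is an assumption (including the 'HIGHER'/'LOWER' values of comparison_type), not the library's actual implementation: the two-sided p-value is halved when the observed difference points in the hypothesized direction, and mirrored otherwise.

from typing import Text


def _two_sided_to_one_sided_pvalue(p_two_sided: float, metric_diff: float,
                                   comparison_type: Text) -> float:
    """Hypothetical sketch of the one-sided conversion used above."""
    # Assumed convention: 'HIGHER' means the slice metric should exceed the
    # base metric, 'LOWER' means it should fall below it.
    in_direction = (metric_diff > 0 if comparison_type == 'HIGHER'
                    else metric_diff < 0)
    return p_two_sided / 2 if in_direction else 1 - p_two_sided / 2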
Example #8
def calc_t_pvalue(test_group_average: np.float64, test_group_stdev: np.float64,
                  test_group_nobs: np.float64,
                  control_group_average: np.float64,
                  control_group_stdev: np.float64,
                  control_group_nobs: np.float64) -> np.float64:
    """Performs the T-test to compare two averages.

  Args:
    test_group_average: Average KPI value for the test group.
    test_group_stdev: Standard deviation of KPI value for the test group.
    test_group_nobs: Number of observations in the test group.
    control_group_average: Average KPI value for the control group.
    control_group_stdev: Standard deviation of KPI value for the control group.
    control_group_nobs: Number of observations in the control group.

  Returns:
    p-value from the T-test.
  """
    _, p_val = stats.ttest_ind_from_stats(mean1=test_group_average,
                                          std1=test_group_stdev,
                                          nobs1=test_group_nobs,
                                          mean2=control_group_average,
                                          std2=control_group_stdev,
                                          nobs2=control_group_nobs,
                                          equal_var=False)
    return p_val
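A hypothetical call to calc_t_pvalue() above with made-up KPI summary statistics (the snippet assumes scipy.stats is imported as stats and numpy as np):

from scipy import stats   # used inside calc_t_pvalue above
import numpy as np        # used by the np.float64 annotations above

p = calc_t_pvalue(test_group_average=10.4, test_group_stdev=2.1,
                  test_group_nobs=500,
                  control_group_average=10.0, control_group_stdev=2.0,
                  control_group_nobs=500)
print("p-value:", p)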
Example #9
def print_individual_p_values():
    F_WT_err, F_WT_mean, N_WT, _, F_PC_mean, F_PC_err, N_PC, _ = errs_and_N()
    # Since we have approximately normally distributed rupture force for a
    # fixed loading rate, we can
    # use Welch's formula for getting the t-test value with different
    # population variances. See: Welch, Biometrika, 1947
    # t = (mean_WT - mean_PC) / sqrt( stdev_WT**2/N_WT + stdev_PC**2/N_PC )
    t_denom = np.sqrt(F_WT_err**2 / N_WT + F_PC_err**2 / N_PC)
    t = (F_WT_mean - F_PC_mean) / t_denom
    # get the degrees of freedom associated with the system using the
    # Welch-Satterthwaite eq. See: Satterthwaite, 1946, Biometrics Bulletin
    v_denom = (F_WT_err**4 / (N_WT**2 * (N_WT - 1)) + F_PC_err**4 /
               (N_PC**2 * (N_PC - 1)))
    v = t_denom**4 / v_denom
    # determine the p value based on degrees of freedom and the t statistic
    p_value_one_sided = 1 - t_distribution.cdf(t, df=v)
    p_value_two_sided = 2 * p_value_one_sided
    # as a check, use scientific python to calculate the same thing
    t_stat_and_p = [ttest_ind_from_stats(\
        mean1=F_WT_mean[i],std1=F_WT_err[i],nobs1=N_WT[i],
        mean2=F_PC_mean[i],std2=F_PC_err[i],nobs2=N_PC[i],equal_var=False)
                    for i in range(3)]
    t_stat = [ele[0] for ele in t_stat_and_p]
    p_values = [ele[1] for ele in t_stat_and_p]
    print("Manually calculated p-values: " + \
          ",".join((["{:.3g}".format(p) for p in p_value_two_sided])))
    print("Automatically calculated p-values: " + \
          ",".join(["{:.3g}".format(p) for p in p_values]))
Example #10
def student_ttest(X,y,threshold = None, percentile = None):
    """
    perform student t-test, returen the features sorted by p-value
    """
    if threshold and percentile or (not threshold and not percentile):
        raise ValueError('error')


    labels = y.unique()
    p_feature_data = X.loc[y == labels[0],:] 
    n_feature_data = X.loc[y == labels[1],:] 

    p_mean, n_mean = p_feature_data.mean(axis = 0), n_feature_data.mean(axis = 0)
    p_std, n_std = p_feature_data.std(axis = 0), n_feature_data.std(axis = 0)

    t_value, p_value = ttest_ind_from_stats(
        p_mean, p_std, p_feature_data.shape[0], n_mean, n_std, n_feature_data.shape[0])
    p_value = pd.Series(data=p_value, index=X.columns)
    sorted_pvalue = p_value.sort_values(ascending=True)
    
    if threshold != None:
        res = sorted_pvalue[sorted_pvalue < threshold].index.tolist()

    if percentile:
        res =  sorted_pvalue.iloc[:int(sorted_pvalue.shape[0] * percentile)].index.tolist()

    return res
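A minimal usage sketch of student_ttest() with a made-up two-class dataset (column names, sizes and values are illustrative):

import numpy as np
import pandas as pd
from scipy.stats import ttest_ind_from_stats

rng = np.random.default_rng(0)
X = pd.DataFrame({
    'informative': np.r_[rng.normal(0.0, 1.0, 20), rng.normal(2.0, 1.0, 20)],
    'noise': rng.normal(0.0, 1.0, 40),
})
y = pd.Series([0] * 20 + [1] * 20)

print(student_ttest(X, y, threshold=0.05))    # features with p < 0.05
print(student_ttest(X, y, percentile=0.5))    # top half of features by p-value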
Example #11
def ttest_sub(mean_1,
              std_1,
              nyears_1,
              mean_2,
              std_2,
              nyears_2,
              equal_var=True):
    """
    Sub-routine to call ttest_ind_from_stats from scipy
    Checks that shapes match and turns integer years into correct format
    returns pvalue.
    """

    # Convert nobs type
    nyears_1 = int(nyears_1)
    nyears_2 = int(nyears_2)

    # Create arrays like others for nobs
    nobs1_arr = (nyears_1 - 1) * np.ones_like(mean_1)
    nobs2_arr = (nyears_2 - 1) * np.ones_like(mean_1)
    """
    # ttest_ind_from_stats
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind_from_stats.html
    """

    ttest_out = ttest_ind_from_stats(mean_1, std_1, nobs1_arr, mean_2, std_2,
                                     nobs2_arr)

    # An array of p-values matching the shape of the input arrays
    pvalue_out = ttest_out[1]

    return pvalue_out
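A hypothetical call to ttest_sub() above on small 2D (gridded) climatologies; the shapes and numbers are made up:

import numpy as np
from scipy.stats import ttest_ind_from_stats

rng = np.random.default_rng(1)
mean_1 = rng.normal(15.0, 1.0, size=(4, 5))            # e.g. present-day climatology
mean_2 = mean_1 + rng.normal(0.5, 0.2, size=(4, 5))    # e.g. future climatology
std_1 = np.full((4, 5), 1.2)
std_2 = np.full((4, 5), 1.4)

pvals = ttest_sub(mean_1, std_1, 30, mean_2, std_2, 30)
print(pvals.shape)   # (4, 5) array of p-values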
Example #12
def simulate_binned_t_test(lmb1, t1, lmb2, t2, bin_size=1.0):
    """
    Simulates data from two Poisson distributions,
    bins them as per the bin-size and finds the p-value
    by passing the binned AIR estimate vectors to a two-
    sided t-test.
    args:
        lmb1: The failure rate for first population.
        t1: The time observed for first population.
        lmb2: The failure rate for second population.
        t2: The time observed for second population
        bin_size: The bins into which data is partitioned.
    """
    num_bins1 = int(t1 / bin_size)
    num_bins2 = int(t2 / bin_size)
    if num_bins1 < 2 or num_bins2 < 2:
        print("Not enough bins!")
        return
    n1 = poisson.rvs(lmb1 * t1 / num_bins1, size=num_bins1)
    n2 = poisson.rvs(lmb2 * t2 / num_bins2, size=num_bins2)
    mean1 = np.mean(n1 / bin_size)
    std1 = np.std(n1 / bin_size)
    mean2 = np.mean(n2 / bin_size)
    std2 = np.std(n2 / bin_size)
    # nobs is the number of binned estimates in each sample
    p_val = stats.ttest_ind_from_stats(mean1=mean1, std1=std1, nobs1=num_bins1,
                                       mean2=mean2, std2=std2, nobs2=num_bins2,
                                       equal_var=False).pvalue / 2
    return p_val
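A hypothetical driver for simulate_binned_t_test() above (it assumes the snippet's numpy/scipy poisson and stats imports are in scope); the rates, observation windows and seed are made up. Repeating the simulation estimates how often the binned t-test flags the difference in failure rates.

import numpy as np

np.random.seed(7)   # poisson.rvs above draws from numpy's global random state
p_vals = [simulate_binned_t_test(lmb1=10.0, t1=30.0, lmb2=14.0, t2=30.0)
          for _ in range(200)]
print("rejection rate at alpha=0.05:", np.mean(np.array(p_vals) < 0.05))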
Example #13
def find_min_errors(summary_df, agg_level_name, ttest_pval_th=0.95):
    """
    Runs a pair-wise Welch's t-test between the minimum 'mean' value of each group of agg_level_name and every row.
    For any row whose mean is NOT significantly different from the minimum error in its group,
    is_min will be set to True.

    Args:
        summary_df: [In/Out] an aggregate on a dataframe. The aggregate should have "mean", "std" and "count" columns.
        agg_level_name: a level name in summary_df
        ttest_pval_th: p-value threshold (default: 0.95)
    """
    from scipy.stats import ttest_ind_from_stats
    summary_df["is_min"] = False

    # Find minimum "mean" for each level and its corresponding std and count
    min_at_level = summary_df.groupby(agg_level_name)[["mean", "std",
                                                       "count"]].transform("min")
    for index, row in summary_df.iterrows():
        t_val, p_val = ttest_ind_from_stats(
            mean1=min_at_level.loc[index]["mean"],
            std1=min_at_level.loc[index]["std"],
            nobs1=min_at_level.loc[index]["count"],
            mean2=row["mean"],
            std2=row["std"],
            nobs2=row["count"],
            equal_var=False)
        if p_val >= ttest_pval_th:
            summary_df.at[index, "is_min"] = True
        else:
            summary_df.at[index, "is_min"] = False
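A minimal usage sketch of find_min_errors() with a hand-built aggregate; the 'dataset'/'model' index levels and the error numbers are made up:

import pandas as pd

summary_df = pd.DataFrame(
    {"mean": [0.20, 0.22, 0.35, 0.10, 0.11],
     "std": [0.02, 0.03, 0.02, 0.01, 0.02],
     "count": [10, 10, 10, 10, 10]},
    index=pd.MultiIndex.from_tuples(
        [("cifar", "net-a"), ("cifar", "net-b"), ("cifar", "net-c"),
         ("mnist", "net-a"), ("mnist", "net-b")],
        names=["dataset", "model"]))

find_min_errors(summary_df, agg_level_name="dataset")
# True where the row is statistically indistinguishable from its group minimum
print(summary_df["is_min"])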
Example #14
def _bivariate_student_orderability_from_moments(mu, var, nTrial, type="stat"):
    '''
    :param mu:       1D Array of temporal means for every channel
    :param var:      1D Array of temporal variances for every channel
    :param nTrial:   number of trials used to compute temporal moments. Necessary for T-test
    :param type:     Depending on output type, returns either T-Test statistic or p-value
    :return:         (nNode, nNode) symmetric matrix of pairwise T-test statistics or p-values (NaN on the diagonal)
    '''

    rezidx = 0 if type == "stat" else 1

    nNode = len(mu)
    std = np.sqrt(var)

    rez = np.full((nNode, nNode), np.nan)
    for i in range(nNode):
        for j in range(i + 1, nNode):
            rez[i][j] = ttest_ind_from_stats(mu[i],
                                             std[i],
                                             nTrial,
                                             mu[j],
                                             std[j],
                                             nTrial,
                                             equal_var=False)[rezidx]
            rez[j][i] = rez[i][j]

    return rez
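A hypothetical call to the orderability helper above; the per-channel means, variances and trial count are made up:

import numpy as np
from scipy.stats import ttest_ind_from_stats

mu = np.array([0.2, 0.5, 0.9])      # temporal means per channel
var = np.array([0.04, 0.05, 0.03])  # temporal variances per channel

pmat = _bivariate_student_orderability_from_moments(mu, var, nTrial=50, type="pval")
print(pmat)   # symmetric (3, 3) matrix of pairwise p-values, NaN on the diagonal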
Example #15
    def end_batch(self, batch_meta):
        pvalue = 1.0
        if self._history.n != 0:
            # Perform Welch's t test
            t, pvalue = stats.ttest_ind_from_stats(self._history.mean,
                                                   self._history.stddev(),
                                                   self._history.n,
                                                   self._batch.mean,
                                                   self._batch.stddev(),
                                                   self._batch.n,
                                                   equal_var=False)

            # Send pvalue point back to Kapacitor
            response = udf_pb2.Response()
            response.point.time = batch_meta.tmax
            response.point.name = batch_meta.name
            response.point.group = batch_meta.group
            response.point.tags.update(batch_meta.tags)
            response.point.fieldsDouble["t"] = t
            response.point.fieldsDouble["pvalue"] = pvalue
            self._agent.write_response(response)

        # Update historical stats with batch, but only if it was normal.
        if pvalue > self._alpha:
            for value in self._batch._window:
                self._history.update(value)
Example #16
def runTTest(con_samples, exp_samples):
    con_means = []
    con_var = []
    con_n = len(con_samples[list(con_samples.keys())[0]])
    for key in con_samples:
        con_means.append(np.mean(con_samples[key]))
        con_var.append(np.var(con_samples[key]))
    con_dof = con_n - 1

    exp_means = []
    exp_var = []
    exp_n = len(exp_samples[list(exp_samples.keys())[0]])
    for key in exp_samples:
        exp_n = len(exp_samples[key])
        exp_means.append(np.mean(exp_samples[key]))
        exp_var.append(np.var(exp_samples[key]))
    exp_dof = exp_n - 1

    tstats = [0] * (len(con_means))
    pvalues = [0] * (len(con_means))

    for i in range(0, len(con_means)):
        tstats[i], pvalues[i] = ttest_ind_from_stats(con_means[i],
                                                     np.sqrt(con_var[i]),
                                                     con_n,
                                                     exp_means[i],
                                                     np.sqrt(exp_var[i]),
                                                     exp_n,
                                                     equal_var=False)

    return tstats, pvalues
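A hypothetical call to runTTest() above; the metric names and samples are made up, and both dicts are assumed to share the same key order.

import numpy as np
from scipy.stats import ttest_ind_from_stats

con_samples = {"latency": [10.2, 9.8, 10.5, 10.1], "errors": [1.0, 2.0, 1.5, 1.2]}
exp_samples = {"latency": [9.1, 9.4, 9.0, 9.3], "errors": [1.1, 1.9, 1.4, 1.3]}

tstats, pvalues = runTTest(con_samples, exp_samples)
print(tstats)
print(pvalues)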
Example #17
def __do_comparison(expression_values1, weights1, day1, expression_values2, weights2, day2, features,
                    fraction_expressed_ratio_add=0.0001):
    mean1 = np.average(expression_values1, weights=weights1, axis=0)
    mean2 = np.average(expression_values2, weights=weights2, axis=0)
    fraction_expressed1 = weights1.dot(expression_values1 > 0)
    fraction_expressed2 = weights2.dot(expression_values2 > 0)
    fraction_expressed_diff = (fraction_expressed1 + fraction_expressed_ratio_add) / (
            fraction_expressed2 + fraction_expressed_ratio_add)

    variance1 = np.average((expression_values1 - mean1) ** 2, weights=weights1, axis=0)
    variance2 = np.average((expression_values2 - mean2) ** 2, weights=weights2, axis=0)
    with np.errstate(invalid="ignore"):
        scores, ttest_pvals = stats.ttest_ind_from_stats(
            mean1=mean1, std1=np.sqrt(variance1), nobs1=len(weights1),
            mean2=mean2, std2=np.sqrt(variance2), nobs2=len(weights2), equal_var=False)  # Welch's
    scores[np.isnan(scores)] = 0
    ttest_pvals[np.isnan(ttest_pvals)] = 1
    fold_change = np.exp(mean1 - mean2)

    results = pd.DataFrame(index=features,
        data={'fold_change': fold_change,
              'mean1': mean1,
              'mean2': mean2,
              'fraction_expressed1': fraction_expressed1,
              'fraction_expressed2': fraction_expressed2,
              't_score': scores,
              't_pval': ttest_pvals,
              't_fdr': statsmodels.stats.multitest.multipletests(ttest_pvals)[1],
              'fraction_expressed_ratio': fraction_expressed_diff,
              'day1': day1,
              'day2': day2})

    return results
Example #18
    def _system_tests_continuous(self, bin_str, n_records_a, mean_a, std_a,
                                 n_records_e, mean_e, std_e):

        self._metric_a = mean_a
        self._metric_e = mean_e

        t_statistics = []
        p_values = []

        n_bins = len(bin_str)
        for i in range(n_bins):
            t, p = stats.ttest_ind_from_stats(mean_a[i], std_a[i],
                                              n_records_a[i], mean_e[i],
                                              std_e[i], n_records_e[i], False)

            t_statistics.append(t)
            p_values.append(p)

        df_tests = pd.DataFrame({
            "Bin": bin_str,
            "Count A": n_records_a,
            "Count E": n_records_e,
            "Mean A": mean_a,
            "Mean E": mean_e,
            "Std A": std_a,
            "Std E": std_e,
            "statistic": t_statistics,
            "p-value": p_values
        })

        self._df_tests = df_tests
Example #19
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--means", nargs=2, type=float)
    parser.add_argument("--stds", nargs=2, type=float)
    parser.add_argument("--observations", nargs=2, type=int)
    parser.add_argument("--alpha", default=0.05, type=float)
    parser.add_argument("--test_type",
                        "--test-type",
                        default="greater-than",
                        choices=("greater-than", "less-than"))
    args = parser.parse_args()

    t, p = stats.ttest_ind_from_stats(args.means[0],
                                      args.stds[0],
                                      args.observations[0],
                                      args.means[1],
                                      args.stds[1],
                                      args.observations[1],
                                      equal_var=False)

    print(t)
    print(p)
    if args.test_type == "greater-than":
        print(greater_than_reject_null(t, p, args.alpha))
    else:
        print(less_than_reject_null(t, p, args.alpha))
Example #20
def compare_tests(n=1e4,
                  alpha=np.array([.01, .25, .3, .4, .45, .5]),
                  lmb=12.0,
                  mu=12.0):
    cnt = np.zeros(len(alpha))
    cnt1 = np.zeros(len(alpha))
    for _ in range(int(n)):
        t = 10e3 / 4 / 1000
        s = 10e3 / 4 / 1000
        n1s = poisson.rvs(lmb * t, size=20)
        n2s = poisson.rvs(mu * s, size=20)
        rate1 = n1s / t
        rate2 = n2s / s
        n1 = sum(n1s)
        n2 = sum(n2s)
        d = n2 / (20 * s) - n1 / (20 * t)
        lmb_mix = (n1 + n2) / (t + s) / 20
        p_val2 = pois_diff_sf(d, lmb_mix, lmb_mix, t, s)
        #if p_val2 < alpha:# and n2/s>n1/t:
        cnt1 += p_val2 < alpha
        mean1 = np.mean(rate1)
        std1 = np.std(rate1)
        mean2 = np.mean(rate2)
        std2 = np.std(rate2)
        #if mean2>mean1:
        t_score = stats.ttest_ind_from_stats(mean1=mean1, std1=std1, nobs1=20, \
                            mean2=mean2, std2=std2, nobs2=20, \
                            equal_var=False)
        #if t_score.pvalue/2 < alpha:
        cnt += t_score.pvalue / 2 < alpha
    print(cnt / n)
    print(cnt1 / n)
Example #21
    def tTestPredictor(self, blueTeamList, redTeamList):  # TODO: Create testing Function
        blueStats = [0, 0, 0]
        redStats = [0, 0, 0]

        for team in blueTeamList:
            T = Team(team, self.year, self.authKey)
            teamStats = T.totalScoreStats(["mean", "std"])
            blueStats[0] += teamStats[0]
            blueStats[1] = np.sqrt(pow(blueStats[1], 2) + pow(teamStats[1], 2))
            blueStats[2] += teamStats[2]

        for team in redTeamList:
            T = Team(team, self.year, self.authKey)
            teamStats = T.totalScoreStats(["mean", "std"])
            redStats[0] += teamStats[0]
            redStats[1] = np.sqrt(pow(redStats[1], 2) + pow(teamStats[1], 2))
            redStats[2] += teamStats[2]

        tVal, p = sStats.ttest_ind_from_stats(blueStats[0], blueStats[1], blueStats[2], redStats[0], redStats[1],
                                              redStats[2])
        if tVal > 0:
            return "blue", p
        elif tVal < 0:
            return "red", p
        elif tVal == 0:
            return "neither", p
        else:
            return "-1", p
Example #22
    def score(self, X, nbhds, nn_matrix=None):
        k = len(nbhds[0])
        if nn_matrix is None:
            data = np.ones(np.sum([len(x) for x in nbhds]))
            col_ind = [item for sublist in nbhds for item in sublist]
            row_ind = [
                i for i, sublist in enumerate(nbhds) for item in sublist
            ]

            # sparse adjacency matrix of NN graph
            nn_matrix = csr_matrix((data, (row_ind, col_ind)),
                                   shape=(len(nbhds), X.shape[0]))

        # get mean gene expressions within each neighborhood; this matrix may be less sparse
        mean_nbhd_exprs = (nn_matrix * X).astype('int').multiply(
            1 / nn_matrix.sum(axis=1)).tocsr()

        vars = np.zeros((len(nbhds), X.shape[1]))
        for i in range(len(nbhds)):  # gotta go cell by cell
            nbrs = np.array(nbhds[i]).flatten()
            gene_diffs = np.power(
                (X[nbrs, :].todense() - mean_nbhd_exprs[i, :].todense()),
                2)  # diffs of gene expression
            vars[i, :] = gene_diffs.mean(axis=0)
        vars = csr_matrix(vars)

        global_means = np.tile(X.mean(axis=0), (len(nbhds), 1))

        # sign is pos if mean is higher, negative otherwise.
        signs = 2 * (mean_nbhd_exprs.todense() >=
                     global_means).astype('int') - 1

        global_var = np.tile(np.var(X.todense(), axis=0), (len(nbhds), 1))
        nobs_global = np.tile(X.shape[0], (len(nbhds), X.shape[1]))
        nobs_local = np.tile(k, (len(nbhds), X.shape[1]))

        wts = ttest_ind_from_stats(
            mean1=mean_nbhd_exprs.todense().flatten(),
            std1=np.array(np.sqrt(vars.todense()).flatten()),
            nobs1=np.array(nobs_local).flatten(),
            mean2=np.array(global_means).flatten(),
            std2=np.array(np.sqrt(global_var)).flatten(),
            nobs2=np.array(nobs_global).flatten()).pvalue.reshape(
                (len(nbhds), X.shape[1]))

        np.nan_to_num(wts, copy=False, nan=1.0)  # nans become pval 1

        wts[wts == 0] = sys.float_info.min  # remove zeros

        if self.corrector is not None:
            wts = self.corrector.correct(wts)

        wts = -1 * np.log(wts)  # convert to info

        np.nan_to_num(wts, copy=False, nan=1.0)  # nans become pval 1

        wts = np.multiply(signs, wts)  # negative if underexpressed

        return (csr_matrix(wts))
Example #23
    def welchsTest(self, mean1, mean2, std1, std2, sampleSize1, sampleSize2):
        try:
            t = stats.ttest_ind_from_stats(
                mean1, std1, sampleSize1, mean2, std2, sampleSize2,
                False).statistic  # False means the variances are unequal (Welch's t-test)
            # np.nan != np.nan is always True, so test with np.isnan instead
            return t if not np.isnan(t) else mean1 > mean2
        except Exception:
            return 0.0
Example #24
def run_studenttest3(dataset_1, dataset_2):
    mean1 = np.mean(dataset_1, axis=0)
    mean2 = np.mean(dataset_2, axis=0)
    std1 = np.std(dataset_1, axis=0)
    std2 = np.std(dataset_2, axis=0)
    nobs1 = len(dataset_1)
    nobs2 = len(dataset_2)
    t, p = stats.ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2, equal_var=True)
    return p
Example #25
def solution(array1, array2, s_level=0.05):
    '''
        True, if array1 and array2 are significantly different.
        False, if they are not significantly different.
    '''
    # Convert into np array
    array1 = np.array(array1)
    array2 = np.array(array2)

    # Compute the descriptive statistics of a and b.
    array1_bar = array1.mean()
    array1_var = array1.var(ddof=1)
    array1_N = array1.size  # number of elements
    array1_dof = array1_N - 1  # Degrees of freedom

    array2_bar = array2.mean()
    array2_var = array2.var(ddof=1)
    array2_N = array2.size
    array2_dof = array2_N - 1

    # Method 1
    # Use scipy.stats.ttest_ind.
    result = ttest_ind(a=array1, b=array2)
    t_statistics = result[0]
    p_value = result[1]
    print("Using scipy.stats.ttest_ind.: t = %g  p = %g" %
          (t_statistics, p_value))

    # Method 2
    # Use scipy.stats.ttest_ind_from_stats.
    t_statistics, p_value = ttest_ind_from_stats(array1_bar,
                                                 np.sqrt(array1_var),
                                                 array1_N,
                                                 array2_bar,
                                                 np.sqrt(array2_var),
                                                 array2_N,
                                                 equal_var=False)
    print("Using ttest_ind_from_stats: t = %g  p = %g" %
          (t_statistics, p_value))

    # Method 3
    # Use the formulas directly.
    t_statistics = (array1_bar - array2_bar) / np.sqrt(array1_var / array1_N +
                                                       array2_var / array2_N)
    dof = (array1_var / array1_N + array2_var / array2_N)**2 / (
        array1_var**2 / (array1_N**2 * array1_dof) + array2_var**2 /
        (array2_N**2 * array2_dof))
    p_value = 2 * stdtr(dof, -np.abs(t_statistics))
    print("Using formulas : t = %g  p = %g" % (t_statistics, p_value))

    if p_value > (s_level / 2):
        # Failed to reject null. Both means are not significantly different.
        return False
    else:
        # Reject Null. Both means are significantly different.
        return True
Example #26
def gen_voxel_msn(centers_list, label_eg, label_cg):
    for center in centers_list:
        mean_eg, std_eg, count_eg = get_center_voxel_msn_by_label(center, label_eg)
        mean_cg, std_cg, count_cg = get_center_voxel_msn_by_label(center, label_cg)
        
        if count_eg and count_cg:
            t, p = ttest_ind_from_stats(mean_eg, std_eg, count_eg,
                                     mean_cg, std_cg, count_cg)

    return t
Example #27
def plot_results(df, sid):
    simple = df.loc[df.arch.str.contains("simple"), ["arch",  "FoldChange_Med", "yerr",
                                         "log2_FCmed", "log2yerr", "Observed",
                                         "fold_change_std"]]
    simMed, simErr = simple.FoldChange_Med, simple.yerr

    complexenh= df.loc[df.arch.str.contains("complex"), ["arch",  "FoldChange_Med", "yerr",
                                                 "log2_FCmed", "log2yerr", "Observed",
                                                "fold_change_std"]]
    comMed, comErr = complexenh.FoldChange_Med, complexenh.yerr

    sids = simple.arch

    ind = np.arange(len(comMed))  # the x locations for the groups
    width = 0.2  # the width of the bars
    barWidth = 0.25


    fig, ax = plt.subplots(figsize = (6,6))
    # Set position of bar on X axis
    r1 = np.arange(len(comMed))
    r2 = [x + barWidth for x in r1]


    # Make the plot
    plt.bar(r1, simMed, color=amber, width=barWidth, edgecolor='white', label='simple', yerr =simErr)
    plt.bar(r2, comMed, color=faded_green, width=barWidth, edgecolor='white', label='complexenh', yerr = comErr)

    result, p = stats.ttest_ind_from_stats(mean1 = simple.FoldChange_Med.item(), std1 =simple.fold_change_std.item(), nobs1 = simple.Observed.item(),
                mean2 = complexenh.FoldChange_Med.item(), std2 = complexenh.fold_change_std.item(), nobs2 = complexenh.Observed.item(),
                                       equal_var = False)
    plt.xlabel("%s, p = %s" % (sid,p))
    plt.ylabel("Fold-change")
    plt.xticks([r + barWidth for r in range(len(comMed))], sids, fontsize = 14)

    from matplotlib.ticker import MultipleLocator
    import matplotlib.ticker as ticker
    #ticks = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(2**x))

    #ax.yaxis.set_major_formatter(ticks)
    for p in ax.patches:
        ax.annotate("%.2fx" % p.get_height(), (p.get_x() + p.get_width() / 2., p.get_height()-0.05),
                 ha='left', va='bottom', color='gray', xytext=(0, 10),
                 textcoords='offset points')

    # Create legend & Show graphic
    plt.legend(bbox_to_anchor = (1.45,1))

    ax.yaxis.set_major_locator(MultipleLocator(1))

    sns.set("poster")

    plt.savefig("%sfig4-ROADMAP_%s_matched_GWAS_2019_LDex_p5e-8_untrimmed.pdf"%(RE, sid))
    plt.show()
Example #28
def two_sample_ttest_descriptive_statistic(mean1, std1, n1, mean2, std2, n2, equal_var=True):
    """
        T-test for means of two independent samples from descriptive statistics.
        This is a two-sided test for the null hypothesis that 2 independent samples have identical average (expected) values.
    :param mean1, std1, n1: descriptive statistics (mean, std, sample size) of the first sample
    :param mean2, std2, n2: descriptive statistics of the second sample
    :return: the test statistic and the p-value

    """
    result_test = stats.ttest_ind_from_stats(mean1, std1, n1, mean2, std2, n2, equal_var)
    return result_test
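A hypothetical call to the wrapper above with made-up descriptive statistics:

from scipy import stats   # used inside the wrapper above

res = two_sample_ttest_descriptive_statistic(mean1=5.2, std1=1.1, n1=30,
                                             mean2=4.8, std2=1.3, n2=28,
                                             equal_var=False)
print(res.statistic, res.pvalue)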
Example #29
def ttest(groups, gm1, gm2, metric):
    df1 = groups.get_group(gm1)
    df2 = groups.get_group(gm2)

    tstats = stats.ttest_ind_from_stats(mean1=df1[metric].mean(),
                                        std1=df1[metric].std(),
                                        nobs1=len(df1),
                                        mean2=df2[metric].mean(),
                                        std2=df2[metric].std(),
                                        nobs2=len(df2))
    return tstats
Example #30
def generate_tables(pickle_files, main_body):

    # get raw results
    raw_table = raw_result_table(pickle_files, main_body)
    raw_table = raw_table.replace('V3AE-Uniform', 'V3AE-MLE')

    # aggregate processed results into a table
    table = None
    for data in raw_table['Data'].unique():

        # clean up the name
        new_name = data.replace('_', ' ')
        raw_table.loc[raw_table.Data == data, 'Data'] = new_name
        data = new_name

        # compute means and standard deviations over methods
        experiment = raw_table[raw_table.Data == data]
        groups = ['Data', 'Method'] if main_body else ['Data', 'Method', 'BatchNorm']
        mean = pd.DataFrame(experiment.groupby(groups, sort=False).mean())
        std = pd.DataFrame(experiment.groupby(groups, sort=False).std(ddof=1))

        # build string table
        df = string_table(mean.copy(deep=True), std.copy(deep=True))

        # bold winners if sufficient trials
        n_trials = max(experiment.index) + 1
        if n_trials >= 2:

            # loop over the metrics
            for (metric, order) in [('LL', 'max'), ('RMSE', 'min')]:  #, ('Entropy', 'min')]:

                # get top performer
                i_best = np.argmax(mean[metric]) if order == 'max' else np.argmin(mean[metric])

                # get null hypothesis
                null_mean = mean[metric].to_numpy()[i_best]
                null_std = std[metric].to_numpy()[i_best]

                # compute p-values
                ms = zip([m for m in mean[metric].to_numpy().tolist()], [s for s in std[metric].to_numpy().tolist()])
                p = [ttest_ind_from_stats(null_mean, null_std, n_trials, m, s, n_trials, False)[-1] for (m, s) in ms]

                # bold statistical ties for best
                for i in range(df.shape[0]):
                    if i == i_best or p[i] >= 0.05:
                        df.loc[mean[metric].index[i], metric] = '\\textbf{' + df.loc[mean[metric].index[i], metric] + '}'

        # concatenate experiment to results table
        if main_body:
            table = pd.concat([table, df.unstack(level=0).T.swaplevel(0, 1)])
        else:
            table = pd.concat([table, df])

    return table.to_latex(escape=False)
Example #31
def maybe_balance(force=False):
	if(force == True):
		for train_dataset, test_dataset in zip(train_datasets, test_datasets):
			print(train_dataset + " " + test_dataset)
			with open(train_dataset, 'rb') as f:
				train_set = pickle.load(f)
				f.close()
			with open(test_dataset, 'rb') as g:
				test_set = pickle.load(g)
				g.close()
				#result = stats.ttest_ind_from_stats(-0.128243,0.443109,52912,-0.132556,0.44502,1873)
			result = stats.ttest_ind_from_stats(np.mean(train_set),np.std(train_set),train_set.shape[0],
									np.mean(test_set),np.std(test_set),test_set.shape[0])
			print(result)
Example #32
def t_test():
    data_sets = ["Airport", "Collaboration", "Congress", "Forum"]
    models = ["pWSBM_m", "pWSBM_s", "ModelR_m", "ModelR_s", "sample_size"]
    errors = pandas.DataFrame(
        [
            [0.0486, 0.0006, 0.0131, 0.001, 25],
            [0.0407, 0.0001, 0.0303, 0.001, 25],
            [0.0571, 0.0004, 0.0369, 0.003, 25],
            [0.0726, 0.0003, 0.0376, 0.001, 25],
        ],
        data_sets,
        models,
    )
    print(errors)
    errors["reduction"] = errors.apply(lambda record: (record[0] - record[2]) / record[0], axis=1)
    errors["p_value"] = errors.apply(
        lambda record: stats.ttest_ind_from_stats(record[0], record[1], record[4], record[2], record[3], record[4])[1],
        axis=1,
    )
    return errors
Example #33
    def end_batch(self, batch_meta):
        pvalue = 1.0
        if self._history.n != 0:
            # Perform Welch's t test
            t, pvalue = stats.ttest_ind_from_stats(
                    self._history.mean, self._history.stddev(), self._history.n,
                    self._batch.mean, self._batch.stddev(), self._batch.n,
                    equal_var=False)


            # Send pvalue point back to Kapacitor
            response = udf_pb2.Response()
            response.point.time = batch_meta.tmax
            response.point.name = batch_meta.name
            response.point.group = batch_meta.group
            response.point.tags.update(batch_meta.tags)
            response.point.fieldsDouble["t"] = t
            response.point.fieldsDouble["pvalue"] = pvalue
            self._agent.write_response(response)

        # Update historical stats with batch, but only if it was normal.
        if pvalue > self._alpha:
            for value in self._batch._window:
                self._history.update(value)
Example #34
def main():
    current_start_year = 1981
    current_end_year = 2010

    future_start_year = 2070
    future_end_year = 2099


    LABEL_CURRENT = "Current"
    LABEL_FUTURE = "Future"

    pval_crit = 0.05



    label_to_period = {
        LABEL_CURRENT: (current_start_year, current_end_year),
        LABEL_FUTURE: (future_start_year, future_end_year)
    }

    season_to_months = OrderedDict()


    selected_months = [11, 12, 1]



    for i in selected_months:
        season_to_months[calendar.month_name[i]] = [i, ]




    print(season_to_months)

    nemo_icefrac_vname = "soicecov"
    nemo_sst_vname = "sosstsst"

    vname = nemo_sst_vname


    exp_label = "cc_canesm2_nemo_offline"

    nemo_managers_coupled_cc_slices_canesm2_rcp85 = OrderedDict([
        (LABEL_CURRENT, NemoYearlyFilesManager(folder="/HOME/huziy/skynet3_rech1/CRCM5_outputs/cc_canesm2_rcp85_gl/coupled-GL-current_CanESM2/CRCMNEMO_GL_CanESM2_RCP85", suffix="grid_T.nc")),
        (LABEL_FUTURE, NemoYearlyFilesManager(folder="/HOME/huziy/skynet3_rech1/CRCM5_outputs/cc_canesm2_rcp85_gl/coupled-GL-future_CanESM2/CRCMNEMO_GL_CanESM2_RCP85_future", suffix="grid_T.nc"))
    ])

    nemo_managers_offline_cc_canesm2_rcp85 = OrderedDict([
        (LABEL_CURRENT, NemoYearlyFilesManager(folder="/HOME/huziy/skynet3_rech1/NEMO_OFFICIAL/Simulations/cc_canesm2_nemo_offline_gathered_corrected_from_guillimin", suffix="grid_T.nc")),
        (LABEL_FUTURE, NemoYearlyFilesManager(folder="/HOME/huziy/skynet3_rech1/NEMO_OFFICIAL/Simulations/cc_canesm2_nemo_offline_gathered_corrected_from_guillimin", suffix="grid_T.nc"))
    ])


    # nemo_managers = OrderedDict([
    #     (LABEL_CURRENT, NemoYearlyFilesManager(folder="/BIG1/huziy/CRCM5_NEMO_coupled_sim_nemo_outputs/NEMO", suffix="grid_T.nc")),
    #     (LABEL_FUTURE, NemoYearlyFilesManager(folder="/BIG1/huziy/CRCM5_NEMO_coupled_sim_nemo_outputs/NEMO", suffix="grid_T.nc")),
    # ])


    nemo_managers = nemo_managers_offline_cc_canesm2_rcp85


    # calculate cc for LSWT and ice cover




    # Calculate seasonal mean projected changes
    label_to_data = OrderedDict()

    lons, lats = None, None

    for label, manager in nemo_managers.items():
        assert isinstance(manager, NemoYearlyFilesManager)

        start_year, end_year = label_to_period[label]
        label_to_data[label] = manager.get_seasonal_clim_fields_with_ttest_data(start_year=start_year, end_year=end_year,
                                                                                season_to_months=season_to_months, varname=vname)

        if lons is None:
            lons, lats = manager.lons, manager.lats






    # ----------- plot the plots
    #
    plot_utils.apply_plot_params(font_size=10, width_cm=8 * len(season_to_months), height_cm=5)

    map = Basemap(llcrnrlon=-93, llcrnrlat=41, urcrnrlon=-73,
                  urcrnrlat=48.5, projection='lcc', lat_1=33, lat_2=45,
                  lon_0=-90, resolution='i', area_thresh=10000)

    xx, yy = map(lons, lats)



    fig = plt.figure()
    gs = GridSpec(nrows=1, ncols=len(season_to_months), wspace=0.02, hspace=0.02)

    for col, season in enumerate(season_to_months):
        mean_c, std_c, nobs_c = label_to_data[LABEL_CURRENT][season]
        mean_f, std_f, nobs_f = label_to_data[LABEL_FUTURE][season]


        cc = mean_f - mean_c

        tval, pval = ttest_ind_from_stats(mean_c, std_c, nobs_c, mean_f, std_f, nobs_f, equal_var=False)

        cc = np.ma.masked_where(pval > pval_crit, cc)

        clevs = vname_to_clevs_diff[vname]
        cmap = cm.get_cmap("bwr", len(clevs) - 1)
        bn = BoundaryNorm(clevs, len(clevs) - 1)

        ax = fig.add_subplot(gs[0, col])
        im = map.pcolormesh(xx, yy, cc, cmap=cmap, norm=bn, ax=ax)
        cb = map.colorbar(im, location="bottom")

        cb.ax.set_visible(col == 0)

        map.drawcoastlines(linewidth=0.3)

        ax.set_frame_on(False)
        ax.set_title(season)

        if col == 0:
            ax.set_ylabel("F - C")



    # create the image folder if it does not exist yet
    if not img_folder.exists():
        img_folder.mkdir()


    fname = "{}_{}_{}vs{}.png".format(exp_label, vname, future_start_year, future_end_year, current_start_year, current_end_year)
    fig.savefig(str(img_folder / fname), bbox_inches="tight", dpi=300)
Example #35
def main():
    start_year = 1980
    end_year = 2009

    HL_LABEL = "CRCM5_HL"
    NEMO_LABEL = "CRCM5_NEMO"

    # critical p-value for the ttest aka significance level
    p_crit = 1

    vars_of_interest = [
        # T_AIR_2M,
        # TOTAL_PREC,
        # SWE,
        default_varname_mappings.LATENT_HF,
        default_varname_mappings.SENSIBLE_HF,
        default_varname_mappings.LWRAD_DOWN,
        default_varname_mappings.SWRAD_DOWN
        #       LAKE_ICE_FRACTION
    ]

    coastline_width = 0.3

    vname_to_seasonmonths_map = {
        SWE: OrderedDict([("November", [11]),
                          ("December", [12]),
                          ("January", [1, ])]),
        LAKE_ICE_FRACTION: OrderedDict([
            ("December", [12]),
            ("January", [1, ]),
            ("February", [2, ]),
            ("March", [3, ]),
            ("April", [4, ])]),
        T_AIR_2M: season_to_months,
        TOTAL_PREC: season_to_months,
    }


    # set season to months mappings
    for vname in vars_of_interest:
        if vname not in vname_to_seasonmonths_map:
            vname_to_seasonmonths_map[vname] = season_to_months


    sim_configs = {
        HL_LABEL: RunConfig(data_path="/RECH2/huziy/coupling/GL_440x260_0.1deg_GL_with_Hostetler/Samples_selected",
                            start_year=start_year, end_year=end_year, label=HL_LABEL),

        NEMO_LABEL: RunConfig(data_path="/RECH2/huziy/coupling/coupled-GL-NEMO1h_30min/selected_fields",
                              start_year=start_year, end_year=end_year, label=NEMO_LABEL),
    }

    sim_labels = [HL_LABEL, NEMO_LABEL]

    vname_to_level = {
        T_AIR_2M: VerticalLevel(1, level_kinds.HYBRID),
        U_WE: VerticalLevel(1, level_kinds.HYBRID),
        V_SN: VerticalLevel(1, level_kinds.HYBRID),
        default_varname_mappings.LATENT_HF: VerticalLevel(5, level_kinds.ARBITRARY),
        default_varname_mappings.SENSIBLE_HF: VerticalLevel(5, level_kinds.ARBITRARY),
    }

    # Try to get the land_fraction for masking if necessary
    land_fraction = None
    try:
        first_ts_file = Path(sim_configs[HL_LABEL].data_path).parent / "pm1979010100_00000000p"

        land_fraction = get_land_fraction(first_timestep_file=first_ts_file)
    except Exception as err:
        raise err

    # Calculations

    # prepare params for interpolation
    lons_t, lats_t, bsmap = get_target_lons_lats_basemap(sim_configs[HL_LABEL])

    # get a subdomain of the simulation domain
    nx, ny = lons_t.shape
    iss = IndexSubspace(i_start=20, j_start=10, i_end=nx // 1.5, j_end=ny / 1.8)
    # just to change basemap limits
    lons_t, lats_t, bsmap = get_target_lons_lats_basemap(sim_configs[HL_LABEL], sub_space=iss)

    xt, yt, zt = lat_lon.lon_lat_to_cartesian(lons_t.flatten(), lats_t.flatten())

    vname_map = {}
    vname_map.update(default_varname_mappings.vname_map_CRCM5)

    # Read and calculate simulated seasonal means
    mod_label_to_vname_to_season_to_std = {}
    mod_label_to_vname_to_season_to_nobs = {}

    sim_data = defaultdict(dict)
    for label, r_config in sim_configs.items():

        store_config = {
            "base_folder": r_config.data_path,
            "data_source_type": data_source_types.SAMPLES_FOLDER_FROM_CRCM_OUTPUT_VNAME_IN_FNAME,
            "varname_mapping": vname_map,
            "level_mapping": vname_to_level,
            "offset_mapping": default_varname_mappings.vname_to_offset_CRCM5,
            "multiplier_mapping": default_varname_mappings.vname_to_multiplier_CRCM5,
        }

        dm = DataManager(store_config=store_config)

        mod_label_to_vname_to_season_to_std[label] = {}
        mod_label_to_vname_to_season_to_nobs[label] = {}

        interp_indices = None
        for vname in vars_of_interest:

            # --
            end_year_for_current_var = end_year
            if vname == SWE:
                end_year_for_current_var = min(1996, end_year)

            # --
            seas_to_year_to_mean = dm.get_seasonal_means(varname_internal=vname,
                                                         start_year=start_year,
                                                         end_year=end_year_for_current_var,
                                                         season_to_months=vname_to_seasonmonths_map[vname])

            # get the climatology
            seas_to_clim = {seas: np.array(list(y_to_means.values())).mean(axis=0) for seas, y_to_means in
                            seas_to_year_to_mean.items()}

            sim_data[label][vname] = seas_to_clim

            if interp_indices is None:
                _, interp_indices = dm.get_kdtree().query(list(zip(xt, yt, zt)))

            season_to_std = {}
            mod_label_to_vname_to_season_to_std[label][vname] = season_to_std

            season_to_nobs = {}
            mod_label_to_vname_to_season_to_nobs[label][vname] = season_to_nobs

            for season in seas_to_clim:
                interpolated_field = seas_to_clim[season].flatten()[interp_indices].reshape(lons_t.shape)
                seas_to_clim[season] = interpolated_field

                # calculate standard deviations of the interpolated fields
                season_to_std[season] = np.asarray([field.flatten()[interp_indices].reshape(lons_t.shape) for field in
                                                    seas_to_year_to_mean[season].values()]).std(axis=0)

                # calculate numobs for the ttest
                season_to_nobs[season] = np.ones_like(lons_t) * len(seas_to_year_to_mean[season])



    # Plotting: interpolate to the same grid and plot obs and biases
    xx, yy = bsmap(lons_t, lats_t)
    lons_t[lons_t > 180] -= 360


    for vname in vars_of_interest:

        field_mask = maskoceans(lons_t, lats_t, np.zeros_like(lons_t), inlands=vname in [SWE]).mask
        field_mask_lakes = maskoceans(lons_t, lats_t, np.zeros_like(lons_t), inlands=True).mask

        plot_utils.apply_plot_params(width_cm=11 * len(vname_to_seasonmonths_map[vname]), height_cm=20, font_size=8)

        fig = plt.figure()



        nrows = len(sim_configs) + 1
        ncols = len(vname_to_seasonmonths_map[vname])
        gs = GridSpec(nrows=nrows, ncols=ncols)




        # plot the fields
        for current_row, sim_label in enumerate(sim_labels):
            for col, season in enumerate(vname_to_seasonmonths_map[vname]):

                field = sim_data[sim_label][vname][season]

                ax = fig.add_subplot(gs[current_row, col])

                if current_row == 0:
                    ax.set_title(season)

                clevs = get_clevs(vname)
                if clevs is not None:
                    bnorm = BoundaryNorm(clevs, len(clevs) - 1)
                    cmap = cm.get_cmap("viridis", len(clevs) - 1)
                else:
                    cmap = "viridis"
                    bnorm = None

                the_mask = field_mask_lakes if vname in [T_AIR_2M, TOTAL_PREC, SWE] else field_mask
                to_plot = np.ma.masked_where(the_mask, field) * internal_name_to_multiplier[vname]



                # temporarily plot the actual values
                cs = bsmap.contourf(xx, yy, to_plot, ax=ax, levels=get_clevs(vname), cmap=cmap, norm=bnorm, extend="both")
                bsmap.drawcoastlines(linewidth=coastline_width)
                bsmap.colorbar(cs, ax=ax)

                if col == 0:
                    ax.set_ylabel("{}".format(sim_label))





        # plot differences between the fields
        for col, season in enumerate(vname_to_seasonmonths_map[vname]):

            field = sim_data[NEMO_LABEL][vname][season] - sim_data[HL_LABEL][vname][season]

            ax = fig.add_subplot(gs[-1, col])

            clevs = get_clevs(vname + "biasdiff")
            if clevs is not None:
                bnorm = BoundaryNorm(clevs, len(clevs) - 1)
                cmap = cm.get_cmap("bwr", len(clevs) - 1)
            else:
                cmap = "bwr"
                bnorm = None


            to_plot = field * internal_name_to_multiplier[vname]
            # to_plot = np.ma.masked_where(field_mask, field) * internal_name_to_multiplier[vname]



            # ttest
            a = sim_data[NEMO_LABEL][vname][season]  # simulation climatology, already interpolated to the common grid
            std_a = mod_label_to_vname_to_season_to_std[NEMO_LABEL][vname][season]
            nobs_a = mod_label_to_vname_to_season_to_nobs[NEMO_LABEL][vname][season]

            b = sim_data[HL_LABEL][vname][season]  # simulation climatology, already interpolated to the common grid
            std_b = mod_label_to_vname_to_season_to_std[HL_LABEL][vname][season]
            nobs_b = mod_label_to_vname_to_season_to_nobs[HL_LABEL][vname][season]


            t, p = ttest_ind_from_stats(mean1=a, std1=std_a, nobs1=nobs_a,
                                        mean2=b, std2=std_b, nobs2=nobs_b, equal_var=False)

            # Mask non-significant differences as given by the ttest
            to_plot = np.ma.masked_where(p > p_crit, to_plot)


            # mask the points with not sufficient land fraction
            if land_fraction is not None and vname in [SWE, ]:
                to_plot = np.ma.masked_where(land_fraction < 0.05, to_plot)


            # print("land fractions for large differences ", land_fraction[to_plot > 30])


            cs = bsmap.contourf(xx, yy, to_plot, ax=ax, extend="both", levels=get_clevs(vname + "biasdiff"), cmap=cmap, norm=bnorm)
            bsmap.drawcoastlines(linewidth=coastline_width)
            bsmap.colorbar(cs, ax=ax)

            if col == 0:
                ax.set_ylabel("{}\n-\n{}".format(NEMO_LABEL, HL_LABEL))


        fig.tight_layout()

        # save a figure per variable
        img_file = "seasonal_differences_noobs_{}_{}_{}-{}.png".format(vname,
                                                            "-".join([s for s in vname_to_seasonmonths_map[vname]]),
                                                            start_year, end_year)
        img_file = img_folder.joinpath(img_file)

        fig.savefig(str(img_file), dpi=300)

        plt.close(fig)


def main():
    img_folder = Path("nei_validation")

    if not img_folder.exists():
        img_folder.mkdir()



    pval_crit = 0.1

    var_names = ["TT", "PR"]
    # var_names = ["PR"]

    seasons = OrderedDict([
        ("DJF", MonthPeriod(12, 3)),
        ("MAM", MonthPeriod(3, 3)),
        ("JJA", MonthPeriod(6, 3)),
        ("SON", MonthPeriod(9, 3)),
    ])

    sim_paths = OrderedDict()


    start_year = 1980
    end_year = 2010

    sim_paths["WC_0.44deg_default"] = Path("/HOME/huziy/skynet3_rech1/CRCM5_outputs/NEI/diags/NEI_WC0.44deg_default/Diagnostics")
    sim_paths["WC_0.44deg_ctem+frsoil+dyngla"] = Path("/HOME/huziy/skynet3_rech1/CRCM5_outputs/NEI/diags/debug_NEI_WC0.44deg_Crr1/Diagnostics")
    sim_paths["WC_0.11deg_ctem+frsoil+dyngla"] = Path("/snow3/huziy/NEI/WC/NEI_WC0.11deg_Crr1/Diagnostics")



    # -- daymet monthly

    daymet_vname_to_path = {
        "prcp": "/HOME/data/Validation/Daymet/Monthly_means/NetCDF/daymet_v3_prcp_monttl_*_na.nc4",
        "tavg": "/HOME/huziy/skynet3_rech1/obs_data/daymet_tavg_monthly/daymet_v3_tavg_monavg_*_na_nc4classic.nc4"
    }


    vname_to_daymet_vname = {
        "PR": "prcp",
        "TT": "tavg"
    }



    plot_utils.apply_plot_params(font_size=14)


    basemap_for_obs = None
    # plot simulation data
    for sim_label, sim_path in sim_paths.items():


        manager_mod = DiagCrcmManager(data_dir=sim_path)



        for vname in var_names:

            daymet_vname = vname_to_daymet_vname[vname]
            manager_obs = HighResDataManager(path=daymet_vname_to_path[daymet_vname], vname=daymet_vname)

            seas_to_clim_mod = manager_mod.get_seasonal_means_with_ttest_stats(
                season_to_monthperiod=seasons,
                start_year=start_year, end_year=end_year, vname=vname,
                vertical_level=var_name_to_level[vname], data_file_prefix=var_name_to_file_prefix[vname]
            )

            seas_to_clim_obs = manager_obs.get_seasonal_means_with_ttest_stats_interpolated_to(
                manager_mod.lons, manager_mod.lats,
                season_to_monthperiod=seasons,
                start_year=start_year, end_year=end_year, convert_monthly_accumulators_to_daily=(vname == "PR")
            )


            season_to_diff = OrderedDict()
            season_to_summary_stats = OrderedDict()

            for season in seas_to_clim_mod:
                mod_mean, mod_std, mod_n = seas_to_clim_mod[season]
                obs_mean, obs_std, obs_n = seas_to_clim_obs[season]


                if vname == "PR":
                    # Convert model precip from m/s to mm/day
                    mod_mean *= 1000 * 3600 * 24
                    mod_std *= 1000 * 3600 * 24


                tval, pval = ttest_ind_from_stats(mod_mean, mod_std, mod_n, obs_mean, obs_std, obs_n, equal_var=False)



                valid_points = ~(obs_mean.mask | np.isnan(obs_mean))
                mod_1d = mod_mean[valid_points]
                obs_1d = obs_mean[valid_points]

                rms = (((mod_1d - obs_1d) ** 2).sum() / len(mod_1d)) ** 0.5
                spat_corr, p_spat_corr = stats.pearsonr(mod_1d, obs_1d)

                season_to_summary_stats[season] = f"RMSE={rms:.1f}\nr={spat_corr:.2f}\nPVr={p_spat_corr:.2f}"



                season_to_diff[season] = []
                season_to_diff[season].append(np.ma.masked_where(pval >= pval_crit, mod_mean - obs_mean))  # mask non-significant biases
                season_to_diff[season].append(mod_std - obs_std)
                season_to_diff[season].append(-1)


            _plot_seasonal_deltas(
                seas_data=season_to_diff, data_label="{}_{}-{}".format(sim_label, start_year, end_year),
                img_dir=img_folder, map=manager_mod.get_basemap(resolution="i", area_thresh=area_thresh_km2),
                lons=manager_mod.lons, lats=manager_mod.lats, vname=vname,
                var_name_to_mul={"TT": 1, "PR": 1}, seas_to_stats=season_to_summary_stats
            )
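
The pattern shared by the examples on this page is a per-gridpoint Welch's t-test computed from climatological statistics (mean, standard deviation and number of years), with non-significant differences masked out before plotting. Below is a minimal, self-contained sketch of that step; the arrays are synthetic stand-ins for the interpolated model and observation climatologies, and the names are illustrative only.

# Sketch: mask non-significant model-minus-obs differences on a grid.
import numpy as np
from scipy.stats import ttest_ind_from_stats

ny, nx, nyears = 50, 60, 30

mod_mean = 1.0 + np.random.randn(ny, nx)
mod_std = 0.5 + np.abs(np.random.randn(ny, nx))
obs_mean = np.random.randn(ny, nx)
obs_std = 0.5 + np.abs(np.random.randn(ny, nx))

# Welch's t-test applied elementwise; nobs may be scalars or arrays
tval, pval = ttest_ind_from_stats(mean1=mod_mean, std1=mod_std, nobs1=nyears,
                                  mean2=obs_mean, std2=obs_std, nobs2=nyears,
                                  equal_var=False)

# keep only the differences that are significant at the chosen level
p_crit = 0.1
bias = mod_mean - obs_mean
bias_masked = np.ma.masked_where(pval > p_crit, bias)

print("{} of {} grid points are significant".format(bias_masked.count(), bias.size))

The resulting masked array can be passed directly to contourf or pcolormesh, exactly as in the scripts above.
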
Exemple #37
0
def main():

    # get the data for basemap
    crcm_data_path = "/RESCUE/skynet3_rech1/huziy/hdf_store/quebec_0.1_crcm5-hcd-rl.hdf5"
    bmp_info = analysis.get_basemap_info_from_hdf(file_path=crcm_data_path)

    season_key = "JJA"
    season_to_months = OrderedDict([(season_key, [6, 7, 8])])

    month_to_ndays = {m: _month_to_ndays(m) for m in range(1, 13)}

    #
    current_filepath = (
        "/RESCUE/skynet3_rech1/huziy/GCM_outputs/CanESM2/pr_Amon_CanESM2_historical_r1i1p1_185001-200512.nc"
    )
    future_filepath = "/RESCUE/skynet3_rech1/huziy/GCM_outputs/CanESM2/pr_Amon_CanESM2_rcp85_r1i1p1_200601-210012.nc"

    Period = namedtuple("Period", ["start_year", "end_year"])

    current = Period(start_year=1980, end_year=2010)
    future = Period(start_year=2070, end_year=2100)

    ds = xr.open_mfdataset([current_filepath, future_filepath])

    # select the season
    ds = ds.isel(time=ds["time.season"] == season_key)

    # select the data for the current and future periods
    years = ds["time.year"]
    pr_current = ds.isel(time=(years >= current.start_year) & (years <= current.end_year)).pr
    pr_future = ds.isel(time=(years >= future.start_year) & (years <= future.end_year)).pr

    assert isinstance(pr_current, xr.DataArray)

    # normalize the month-length weights within each year so that the per-year
    # sums below are proper day-weighted seasonal means
    weights_current = xr.DataArray(
        [month_to_ndays[m] for m in pr_current["time.month"].values], coords=[pr_current.time]
    )
    weights_current = weights_current.groupby("time.year") / weights_current.groupby("time.year").sum()

    weights_future = xr.DataArray([month_to_ndays[m] for m in pr_future["time.month"].values], coords=[pr_future.time])
    weights_future = weights_future.groupby("time.year") / weights_future.groupby("time.year").sum()

    # seasonal means
    pr_current_smean = (pr_current * weights_current).groupby("time.year").sum(dim="time")
    pr_future_smean = (pr_future * weights_future).groupby("time.year").sum(dim="time")

    # climatology and stds
    pr_current_clim = pr_current_smean.mean(dim="year")
    pr_current_std = pr_current_smean.std(dim="year")

    pr_future_clim = pr_future_smean.mean(dim="year")
    pr_future_std = pr_future_smean.std(dim="year")

    # calculate significance
    n_current = current.end_year - current.start_year + 1
    n_future = future.end_year - future.start_year + 1
    tval, pval = stats.ttest_ind_from_stats(
        pr_current_clim.values,
        pr_current_std.values,
        nobs1=n_current,
        mean2=pr_future_clim.values,
        std2=pr_future_std.values,
        nobs2=n_future,
    )

    print(weights_current[:3].values, weights_current[:3].sum())

    print(pr_current_smean.shape)

    print(pr_future.shape)
    print(pr_current.shape)
    print(ds["time.year"][-12:])

    # do the plotting
    plot_utils.apply_plot_params()
    fig = plt.figure()
    b = bmp_info.basemap
    xx, yy = bmp_info.get_proj_xy()

    lons, lats = np.meshgrid(ds.lon, ds.lat)

    xg, yg = b(lons, lats)

    dom_mask = (xg >= xx[0, 0]) & (xg <= xx[-1, -1]) & (yg >= yy[0, 0]) & (yg <= yy[-1, -1])

    i_list, j_list = np.where(dom_mask)

    imax, jmax = i_list.max(), j_list.max()
    imin, jmin = i_list.min(), j_list.min()

    marginx, marginy = 10, 10
    imax += marginx
    jmax += marginy
    imin -= marginx
    jmin -= marginy

    dom_mask[imin:imax, jmin:jmax] = True

    print(pr_current_clim.shape)
    print(ds.lon.shape)

    cchange = (pr_future_clim - pr_current_clim) * 24 * 3600  # Convert to mm/day

    cchange = np.ma.masked_where(~dom_mask, cchange)

    # cchange = np.ma.masked_where(pval > 0.1, cchange)

    plt.title("{}, (mm/day)".format(season_key))
    im = b.contourf(xg, yg, cchange)
    cb = b.colorbar(im)

    sign = np.ma.masked_where(~dom_mask, pval <= 0.05)

    cs = b.contourf(xg, yg, sign, levels=[0, 0.5, 1], hatches=["/", None, None], colors="none")

    b.drawcoastlines()

    # create a legend for the contour set
    artists, labels = cs.legend_elements()
    plt.legend([artists[0]], ["not sign. (pvalue > 0.05)"], handleheight=2)

    img_folder = "cc-paper-comments"
    fig.savefig(
        os.path.join(img_folder, "canesm_cc_{}_precip.png".format(season_key)),
        dpi=common_plot_params.FIG_SAVE_DPI / 2,
        bbox_inches="tight",
    )
    plt.show()
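
For reference, here is a small, self-contained sketch of the day-weighted seasonal (JJA) mean computation used above, on synthetic monthly data. It uses xarray's days_in_month accessor and normalizes the weights within each year, which is the normalization assumed in the corrected weighting above; the array sizes are illustrative only.

# Sketch: per-year, day-weighted JJA means from monthly data with xarray.
import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range("1980-01-01", "2009-12-01", freq="MS")
pr = xr.DataArray(np.random.rand(len(times), 4, 5),
                  coords={"time": times}, dims=("time", "y", "x"))

# keep only the JJA months
pr_jja = pr.isel(time=pr.time.dt.season == "JJA")

# month lengths as weights, normalized within each year
month_length = pr_jja.time.dt.days_in_month
weights = month_length.groupby("time.year") / month_length.groupby("time.year").sum()

# one day-weighted seasonal mean per year, then climatology and std over years
pr_jja_smean = (pr_jja * weights).groupby("time.year").sum(dim="time")
pr_jja_clim = pr_jja_smean.mean(dim="year")
pr_jja_std = pr_jja_smean.std(dim="year")

print(dict(pr_jja_smean.sizes), pr_jja_clim.shape)

The climatology, standard deviation and number of years obtained this way can then be fed to ttest_ind_from_stats for the significance field, as in the script above.
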
def main(vars_of_interest=None):
    # Validation with CRU (temp, precip) and CMC SWE

    # obs_data_path = Path("/RESCUE/skynet3_rech1/huziy/obs_data_for_HLES/interploated_to_the_same_grid/GL_0.1_452x260/anusplin+_interpolated_tt_pr.nc")
    obs_data_path = Path("/HOME/huziy/skynet3_rech1/obs_data/mh_churchill_nelson_obs_fields")
    CRU_PRECIP = True

    sim_id = "mh_0.44"
    add_shp_files = [
        default_domains.MH_BASINS_PATH,
        constants.upstream_station_boundaries_shp_path[sim_id]
    ]


    start_year = 1981
    end_year = 2009

    MODEL_LABEL =  "CRCM5 (0.44)"
    # critical p-value for the ttest aka significance level
    # p_crit = 0.05
    p_crit = 1

    coastlines_width = 0.3

    vars_of_interest_default = [
        # T_AIR_2M,
        TOTAL_PREC,
        # SWE,
        # LAKE_ICE_FRACTION
    ]

    if vars_of_interest is None:
        vars_of_interest = vars_of_interest_default


    vname_to_seasonmonths_map = {
        SWE: OrderedDict([("DJF", [12, 1, 2])]),
        T_AIR_2M: season_to_months,
        TOTAL_PREC: OrderedDict([("Annual", [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])]) # season_to_months,

    }

    sim_configs = {

        MODEL_LABEL: RunConfig(data_path="/RECH2/huziy/BC-MH/bc_mh_044deg/Samples",
                  start_year=start_year, end_year=end_year, label=MODEL_LABEL),

    }


    grid_config = default_domains.bc_mh_044




    sim_labels = [MODEL_LABEL, ]

    vname_to_level = {
        T_AIR_2M: VerticalLevel(1, level_kinds.HYBRID),
        U_WE: VerticalLevel(1, level_kinds.HYBRID),
        V_SN: VerticalLevel(1, level_kinds.HYBRID),
        SWE: VerticalLevel(-1, level_kinds.ARBITRARY)
    }

    vname_map = {
        default_varname_mappings.TOTAL_PREC: "pre",
        default_varname_mappings.T_AIR_2M: "tmp",
        default_varname_mappings.SWE: "SWE"
    }

    filename_prefix_mapping = {
        default_varname_mappings.SWE: "pm",
        default_varname_mappings.TOTAL_PREC: "pm",
        default_varname_mappings.T_AIR_2M: "dm"
    }


    # Try to get the land_fraction for masking if necessary
    land_fraction = None
    try:
        land_fraction = get_land_fraction(sim_configs[MODEL_LABEL])
    except Exception:
        pass



    # Calculations

    # prepare params for interpolation
    lons_t, lats_t, bsmap = get_target_lons_lats_basemap(sim_configs[MODEL_LABEL])

    bsmap, reg_of_interest_mask = grid_config.get_basemap_using_shape_with_polygons_of_interest(lons=lons_t, lats=lats_t,
                                                                                                shp_path=default_domains.MH_BASINS_PATH,
                                                                                                mask_margin=2, resolution="i")

    xt, yt, zt = lat_lon.lon_lat_to_cartesian(lons_t.flatten(), lats_t.flatten())










    obs_multipliers = default_varname_mappings.vname_to_multiplier_CRCM5.copy()

    # Read and calculate observed seasonal means
    store_config = {
            "base_folder": obs_data_path.parent if not obs_data_path.is_dir() else obs_data_path,
            "data_source_type": data_source_types.ALL_VARS_IN_A_FOLDER_IN_NETCDF_FILES_OPEN_EACH_FILE_SEPARATELY,
            "varname_mapping": vname_map,
            "level_mapping": vname_to_level,
            "offset_mapping": default_varname_mappings.vname_to_offset_CRCM5,
            "multiplier_mapping": obs_multipliers,
    }

    obs_dm = DataManager(store_config=store_config)
    obs_data = {}


    # need to save it for ttesting
    obs_vname_to_season_to_std = {}
    obs_vname_to_season_to_nobs = {}

    interp_indices = None
    for vname in vars_of_interest:
        # --
        end_year_for_current_var = end_year
        if vname == SWE:
            end_year_for_current_var = min(1996, end_year)

        # --
        seas_to_year_to_mean = obs_dm.get_seasonal_means(varname_internal=vname,
                                                     start_year=start_year,
                                                     end_year=end_year_for_current_var,
                                                     season_to_months=vname_to_seasonmonths_map[vname])





        seas_to_clim = {seas: np.array(list(y_to_means.values())).mean(axis=0) for seas, y_to_means in seas_to_year_to_mean.items()}

        # convert precip from mm/month (CRU) to mm/day
        if vname in [TOTAL_PREC] and CRU_PRECIP:
            for seas in seas_to_clim:
                seas_to_clim[seas] *= 1. / (365.25 / 12)
                seas_to_clim[seas] = np.ma.masked_where(np.isnan(seas_to_clim[seas]), seas_to_clim[seas])


                print("{}: min={}, max={}".format(seas, seas_to_clim[seas].min(), seas_to_clim[seas].max()))


        obs_data[vname] = seas_to_clim

        if interp_indices is None:
            _, interp_indices = obs_dm.get_kdtree().query(list(zip(xt, yt, zt)))




        # need for ttests
        season_to_std = {}
        obs_vname_to_season_to_std[vname] = season_to_std

        season_to_nobs = {}
        obs_vname_to_season_to_nobs[vname] = season_to_nobs

        for season in seas_to_clim:
            seas_to_clim[season] = seas_to_clim[season].flatten()[interp_indices].reshape(lons_t.shape)



            # standard deviation across the yearly means, needed for the ttest
            season_to_std[season] = np.asarray([field.flatten()[interp_indices].reshape(lons_t.shape)
                                                         for field in seas_to_year_to_mean[season].values()]).std(axis=0)


            season_to_nobs[season] = np.ones_like(lons_t) * len(seas_to_year_to_mean[season])


        plt.show()



    # Read and calculate simulated seasonal mean biases
    mod_label_to_vname_to_season_to_std = {}
    mod_label_to_vname_to_season_to_nobs = {}

    model_data_multipliers = defaultdict(lambda: 1)
    model_data_multipliers[TOTAL_PREC] = 1000 * 24 * 3600

    sim_data = defaultdict(dict)
    for label, r_config in sim_configs.items():

        store_config = {
                "base_folder": r_config.data_path,
                "data_source_type": data_source_types.SAMPLES_FOLDER_FROM_CRCM_OUTPUT,
                "varname_mapping": default_varname_mappings.vname_map_CRCM5,
                "level_mapping": vname_to_level,
                "offset_mapping": default_varname_mappings.vname_to_offset_CRCM5,
                "multiplier_mapping": model_data_multipliers,
                "filename_prefix_mapping": filename_prefix_mapping
        }


        dm = DataManager(store_config=store_config)

        mod_label_to_vname_to_season_to_std[label] = {}
        mod_label_to_vname_to_season_to_nobs[label] = {}


        interp_indices = None
        for vname in vars_of_interest:

            # --
            end_year_for_current_var = end_year
            if vname == SWE:
                end_year_for_current_var = min(1996, end_year)

            # --
            seas_to_year_to_mean = dm.get_seasonal_means(varname_internal=vname,
                                                         start_year=start_year,
                                                         end_year=end_year_for_current_var,
                                                         season_to_months=vname_to_seasonmonths_map[vname])


            # get the climatology
            seas_to_clim = {seas: np.array(list(y_to_means.values())).mean(axis=0) for seas, y_to_means in seas_to_year_to_mean.items()}

            sim_data[label][vname] = seas_to_clim



            if interp_indices is None:
                _, interp_indices = dm.get_kdtree().query(list(zip(xt, yt, zt)))


            season_to_std = {}
            mod_label_to_vname_to_season_to_std[label][vname] = season_to_std

            season_to_nobs = {}
            mod_label_to_vname_to_season_to_nobs[label][vname] = season_to_nobs

            for season in seas_to_clim:
                interpolated_field = seas_to_clim[season].flatten()[interp_indices].reshape(lons_t.shape)
                seas_to_clim[season] = interpolated_field - obs_data[vname][season]

                # calculate standard deviations of the interpolated fields
                season_to_std[season] = np.asarray([field.flatten()[interp_indices].reshape(lons_t.shape) for field in seas_to_year_to_mean[season].values()]).std(axis=0)

                # calculate numobs for the ttest
                season_to_nobs[season] = np.ones_like(lons_t) * len(seas_to_year_to_mean[season])






    xx, yy = bsmap(lons_t, lats_t)
    lons_t[lons_t > 180] -= 360

    ocean_mask = maskoceans(lons_t, lats_t, np.zeros_like(lons_t)).mask


    for vname in vars_of_interest:

        # mask oceans only for SWE; recompute per variable so that a previous
        # iteration does not carry its mask over to the next one
        field_mask = ocean_mask if vname in [SWE] else np.zeros_like(ocean_mask, dtype=bool)


        # Plotting: interpolate to the same grid and plot obs and biases
        plot_utils.apply_plot_params(width_cm=32 / 4 * (len(vname_to_seasonmonths_map[vname])),
                                     height_cm=25 / 3.0 * (len(sim_configs) + 1), font_size=8 * len(vname_to_seasonmonths_map[vname]))

        fig = plt.figure()

        # fig.suptitle(internal_name_to_title[vname] + "\n")

        nrows = len(sim_configs) + 2
        ncols = len(vname_to_seasonmonths_map[vname])
        gs = GridSpec(nrows=nrows, ncols=ncols)



        # Plot the obs fields
        current_row = 0
        for col, season in enumerate(vname_to_seasonmonths_map[vname]):
            field = obs_data[vname][season]
            ax = fig.add_subplot(gs[current_row, col])
            ax.set_title(season)

            to_plot = np.ma.masked_where(field_mask, field) * internal_name_to_multiplier[vname]
            clevs = get_clevs(vname)

            to_plot = np.ma.masked_where(~reg_of_interest_mask, to_plot)

            if clevs is not None:
                bnorm = BoundaryNorm(clevs, len(clevs) - 1)
                cmap = cm.get_cmap("Blues", len(clevs) - 1)
            else:
                cmap = "jet"
                bnorm = None

            bsmap.drawmapboundary(fill_color="0.75")

            # cs = bsmap.contourf(xx, yy, to_plot, ax=ax, levels=get_clevs(vname), norm=bnorm, cmap=cmap)
            cs = bsmap.pcolormesh(xx, yy, to_plot, ax=ax, norm=bnorm, cmap=internal_name_to_cmap[vname])

            bsmap.drawcoastlines(linewidth=coastlines_width)
            # bsmap.drawstates(linewidth=0.1)
            # bsmap.drawcountries(linewidth=0.2)
            bsmap.colorbar(cs, ax=ax)

            i = 0
            bsmap.readshapefile(str(add_shp_files[i])[:-4], "field_{}".format(i), linewidth=0.5, color="m")


            if col == 0:
                ax.set_ylabel("Obs")



        # plot the biases
        for sim_label in sim_labels:
            current_row += 1
            for col, season in enumerate(vname_to_seasonmonths_map[vname]):

                field = sim_data[sim_label][vname][season]

                ax = fig.add_subplot(gs[current_row, col])

                clevs = get_clevs(vname + "bias")
                if clevs is not None:
                    bnorm = BoundaryNorm(clevs, len(clevs) - 1)
                    cmap = cm.get_cmap("bwr", len(clevs) - 1)
                else:
                    cmap = "bwr"
                    bnorm = None

                to_plot = np.ma.masked_where(field_mask, field) * internal_name_to_multiplier[vname]


                # ttest
                a = sim_data[sim_label][vname][season] + obs_data[vname][season]  # Calculate the simulation data back from biases
                std_a = mod_label_to_vname_to_season_to_std[sim_label][vname][season]
                nobs_a = mod_label_to_vname_to_season_to_nobs[sim_label][vname][season]

                b = obs_data[vname][season]
                std_b =  obs_vname_to_season_to_std[vname][season]
                nobs_b = obs_vname_to_season_to_nobs[vname][season]



                t, p = ttest_ind_from_stats(mean1=a, std1=std_a, nobs1=nobs_a,
                                            mean2=b, std2=std_b, nobs2=nobs_b, equal_var=False)

                # Mask non-significant differences as given by the ttest
                to_plot = np.ma.masked_where(p > p_crit, to_plot)

                # only focus on the basins of interest
                to_plot = np.ma.masked_where(~reg_of_interest_mask, to_plot)


                # cs = bsmap.contourf(xx, yy, to_plot, ax=ax, extend="both", levels=get_clevs(vname + "bias"), cmap=cmap, norm=bnorm)

                bsmap.drawmapboundary(fill_color="0.75")


                cs = bsmap.pcolormesh(xx, yy, to_plot, ax=ax, cmap=cmap, norm=bnorm)
                bsmap.drawcoastlines(linewidth=coastlines_width)
                bsmap.colorbar(cs, ax=ax, extend="both")





                for i, shp in enumerate(add_shp_files[1:], start=1):
                    bsmap.readshapefile(str(shp)[:-4], "field_{}".format(i), linewidth=0.5, color="k")

                if col == 0:
                    ax.set_ylabel("{}\n-\nObs.".format(sim_label))




        fig.tight_layout()



        # save a figure per variable
        img_file = "seasonal_biases_{}_{}_{}-{}.png".format(vname,
                                                            "-".join([s for s in vname_to_seasonmonths_map[vname]]),
                                                            start_year, end_year)


        if not img_folder.exists():
            img_folder.mkdir(parents=True)

        img_file = img_folder / img_file
        fig.savefig(str(img_file), bbox_inches="tight", dpi=300)

        plt.close(fig)
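
The regridding in these scripts is a nearest-neighbour lookup: the source grid points are converted to 3D Cartesian coordinates, a KD-tree is built on them, and the target grid is filled by flat-indexing the source field with the nearest-neighbour indices. A minimal sketch of that pattern with scipy's cKDTree follows; the inlined lon_lat_to_cartesian is a simplified stand-in for the helper used above, and both grids are synthetic.

# Sketch: nearest-neighbour regridding via a KD-tree in 3D Cartesian space.
import numpy as np
from scipy.spatial import cKDTree


def lon_lat_to_cartesian(lon, lat, r=6371.0):
    # convert lon/lat in degrees to 3D coordinates on a sphere of radius r (km)
    lon_r, lat_r = np.radians(lon), np.radians(lat)
    x = r * np.cos(lat_r) * np.cos(lon_r)
    y = r * np.cos(lat_r) * np.sin(lon_r)
    z = r * np.sin(lat_r)
    return x, y, z


# synthetic source (obs) and target (model) grids
lon_s, lat_s = np.meshgrid(np.linspace(-100, -60, 80), np.linspace(40, 60, 40))
lon_t, lat_t = np.meshgrid(np.linspace(-95, -65, 50), np.linspace(42, 58, 30))

field_s = np.cos(np.radians(lat_s)) * np.sin(np.radians(lon_s))  # source field

tree = cKDTree(np.column_stack(lon_lat_to_cartesian(lon_s.flatten(), lat_s.flatten())))
_, interp_indices = tree.query(np.column_stack(lon_lat_to_cartesian(lon_t.flatten(), lat_t.flatten())))

# same flatten/index/reshape step as in the loops above
field_on_target = field_s.flatten()[interp_indices].reshape(lon_t.shape)
print(field_on_target.shape)
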
def main():

    obs_data_path = Path("/RESCUE/skynet3_rech1/huziy/obs_data_for_HLES/interploated_to_the_same_grid/GL_0.1_452x260/anusplin+_interpolated_tt_pr.nc")

    start_year = 1980
    end_year = 2010

    HL_LABEL = "CRCM5_HL"
    NEMO_LABEL = "CRCM5_NEMO"

    # critical p-value for the ttest aka significance level
    p_crit = 0.1

    vars_of_interest = [
 #       T_AIR_2M,
 #       TOTAL_PREC,
 #       SWE,
        LAKE_ICE_FRACTION
    ]

    coastline_width = 0.3


    vname_to_seasonmonths_map = {
        SWE: OrderedDict([("November", [11]),
                          ("December", [12]),
                          ("January", [1,])]),
        LAKE_ICE_FRACTION: OrderedDict([
                         ("February", [2,]),
                          ("March", [3, ]),]),
        T_AIR_2M: season_to_months,
        TOTAL_PREC:  OrderedDict([
            ("Winter", [12, 1, 2]),
            ("Summer", [6, 7, 8]),
        ])
    }

    sim_configs = {

        HL_LABEL: RunConfig(data_path="/RECH2/huziy/coupling/GL_440x260_0.1deg_GL_with_Hostetler/Samples_selected",
                  start_year=start_year, end_year=end_year, label=HL_LABEL),

        NEMO_LABEL: RunConfig(data_path="/RECH2/huziy/coupling/coupled-GL-NEMO1h_30min/selected_fields",
                  start_year=start_year, end_year=end_year, label=NEMO_LABEL),
    }

    sim_labels = [HL_LABEL, NEMO_LABEL]

    vname_to_level = {
        T_AIR_2M: VerticalLevel(1, level_kinds.HYBRID),
        U_WE: VerticalLevel(1, level_kinds.HYBRID),
        V_SN: VerticalLevel(1, level_kinds.HYBRID),
    }


    # Try to get the land_fraction for masking if necessary
    land_fraction = None
    try:
        first_ts_file = Path(sim_configs[HL_LABEL].data_path).parent / "pm1979010100_00000000p"

        land_fraction = get_land_fraction(first_timestep_file=first_ts_file)
    except Exception as err:
        # note: re-raising here makes a missing land fraction fatal
        raise err



    # Calculations

    # prepare params for interpolation
    lons_t, lats_t, bsmap = get_target_lons_lats_basemap(sim_configs[HL_LABEL])

    # get a subdomain of the simulation domain
    nx, ny = lons_t.shape
    iss = IndexSubspace(i_start=20, j_start=20, i_end=nx // 2, j_end=ny // 2)
    # just to change basemap limits
    lons_t, lats_t, bsmap = get_target_lons_lats_basemap(sim_configs[HL_LABEL], sub_space=iss, resolution="i", area_thresh=2000)


    xt, yt, zt = lat_lon.lon_lat_to_cartesian(lons_t.flatten(), lats_t.flatten())


    vname_map = {}
    vname_map.update(default_varname_mappings.vname_map_CRCM5)



    # Read and calculate observed seasonal means
    store_config = {
            "base_folder": obs_data_path.parent,
            "data_source_type": data_source_types.ALL_VARS_IN_A_FOLDER_IN_NETCDF_FILES_OPEN_EACH_FILE_SEPARATELY,
            "varname_mapping": vname_map,
            "level_mapping": vname_to_level,
            "offset_mapping": default_varname_mappings.vname_to_offset_CRCM5,
            "multiplier_mapping": default_varname_mappings.vname_to_multiplier_CRCM5,
    }

    obs_dm = DataManager(store_config=store_config)
    obs_data = {}


    # need to save it for ttesting
    obs_vname_to_season_to_std = {}
    obs_vname_to_season_to_nobs = {}

    interp_indices = None
    for vname in vars_of_interest:
        # --
        end_year_for_current_var = end_year
        if vname == SWE:
            end_year_for_current_var = min(1996, end_year)

        # --
        seas_to_year_to_mean = obs_dm.get_seasonal_means(varname_internal=vname,
                                                     start_year=start_year,
                                                     end_year=end_year_for_current_var,
                                                     season_to_months=vname_to_seasonmonths_map[vname])



        seas_to_clim = {seas: np.array(list(y_to_means.values())).mean(axis=0) for seas, y_to_means in seas_to_year_to_mean.items()}
        obs_data[vname] = seas_to_clim

        if interp_indices is None:
            _, interp_indices = obs_dm.get_kdtree().query(list(zip(xt, yt, zt)))




        # need for ttests
        season_to_std = {}
        obs_vname_to_season_to_std[vname] = season_to_std

        season_to_nobs = {}
        obs_vname_to_season_to_nobs[vname] = season_to_nobs

        for season in seas_to_clim:
            seas_to_clim[season] = seas_to_clim[season].flatten()[interp_indices].reshape(lons_t.shape)


            # standard deviation across the yearly means, needed for the ttest
            season_to_std[season] = np.asarray([field.flatten()[interp_indices].reshape(lons_t.shape)
                                                         for field in seas_to_year_to_mean[season].values()]).std(axis=0)


            season_to_nobs[season] = np.ones_like(lons_t) * len(seas_to_year_to_mean[season])




    # Read and calculate simulated seasonal mean biases
    mod_label_to_vname_to_season_to_std = {}
    mod_label_to_vname_to_season_to_nobs = {}

    sim_data = defaultdict(dict)
    for label, r_config in sim_configs.items():

        store_config = {
                "base_folder": r_config.data_path,
                "data_source_type": data_source_types.SAMPLES_FOLDER_FROM_CRCM_OUTPUT_VNAME_IN_FNAME,
                "varname_mapping": vname_map,
                "level_mapping": vname_to_level,
                "offset_mapping": default_varname_mappings.vname_to_offset_CRCM5,
                "multiplier_mapping": default_varname_mappings.vname_to_multiplier_CRCM5,
        }


        dm = DataManager(store_config=store_config)

        mod_label_to_vname_to_season_to_std[label] = {}
        mod_label_to_vname_to_season_to_nobs[label] = {}


        interp_indices = None
        for vname in vars_of_interest:

            # --
            end_year_for_current_var = end_year
            if vname == SWE:
                end_year_for_current_var = min(1996, end_year)

            # --
            seas_to_year_to_mean = dm.get_seasonal_means(varname_internal=vname,
                                                         start_year=start_year,
                                                         end_year=end_year_for_current_var,
                                                         season_to_months=vname_to_seasonmonths_map[vname])


            # get the climatology
            seas_to_clim = {seas: np.array(list(y_to_means.values())).mean(axis=0) for seas, y_to_means in seas_to_year_to_mean.items()}

            sim_data[label][vname] = seas_to_clim

            if interp_indices is None:
                _, interp_indices = dm.get_kdtree().query(list(zip(xt, yt, zt)))


            season_to_std = {}
            mod_label_to_vname_to_season_to_std[label][vname] = season_to_std

            season_to_nobs = {}
            mod_label_to_vname_to_season_to_nobs[label][vname] = season_to_nobs

            for season in seas_to_clim:
                interpolated_field = seas_to_clim[season].flatten()[interp_indices].reshape(lons_t.shape)
                seas_to_clim[season] = interpolated_field - obs_data[vname][season]

                # calculate standard deviations of the interpolated fields
                season_to_std[season] = np.asarray([field.flatten()[interp_indices].reshape(lons_t.shape) for field in seas_to_year_to_mean[season].values()]).std(axis=0)

                # calculate numobs for the ttest
                season_to_nobs[season] = np.ones_like(lons_t) * len(seas_to_year_to_mean[season])



    # Plotting: interpolate to the same grid and plot obs and biases



    xx, yy = bsmap(lons_t, lats_t)
    lons_t[lons_t > 180] -= 360


    draw_only_first_sim_biases = True
    for vname in vars_of_interest:

        field_mask = maskoceans(lons_t, lats_t, np.zeros_like(lons_t), inlands=vname in [SWE]).mask
        field_mask_lakes = maskoceans(lons_t, lats_t, np.zeros_like(lons_t), inlands=True).mask

        nrows = len(sim_configs) + 2 - 1 * int(draw_only_first_sim_biases)
        ncols = len(vname_to_seasonmonths_map[vname])

        plot_utils.apply_plot_params(width_cm=8 * len(vname_to_seasonmonths_map[vname]), height_cm=4.5 * nrows, font_size=8)
        fig = plt.figure()



        gs = GridSpec(nrows=nrows, ncols=ncols, hspace=0.2, wspace=0.02)

        extend = "both" if vname not in [LAKE_ICE_FRACTION] else "neither"

        # Plot the obs fields
        current_row = 0
        for col, season in enumerate(vname_to_seasonmonths_map[vname]):
            field = obs_data[vname][season]
            ax = fig.add_subplot(gs[current_row, col])
            # ax.set_title(season)


            the_mask = field_mask_lakes if vname in [T_AIR_2M, TOTAL_PREC, SWE] else field_mask
            to_plot = np.ma.masked_where(the_mask, field) * internal_name_to_multiplier[vname]
            clevs = get_clevs(vname)

            if clevs is not None:
                bnorm = BoundaryNorm(clevs, len(clevs) - 1)
                cmap = cm.get_cmap("viridis", len(clevs) - 1)
            else:
                cmap = "viridis"
                bnorm = None

            cs = bsmap.contourf(xx, yy, to_plot, ax=ax, levels=clevs, norm=bnorm, cmap=cmap)
            bsmap.drawcoastlines(linewidth=coastline_width)
            cb = bsmap.colorbar(cs, ax=ax, location="bottom")

            ax.set_frame_on(vname not in [LAKE_ICE_FRACTION, ])

            cb.ax.set_visible(col == 0)

            if col == 0:
                ax.set_ylabel("Obs")



        # plot the biases
        for sim_label in sim_labels:
            current_row += 1
            for col, season in enumerate(vname_to_seasonmonths_map[vname]):

                field = sim_data[sim_label][vname][season]

                ax = fig.add_subplot(gs[current_row, col])

                clevs = get_clevs(vname + "bias")
                if clevs is not None:
                    bnorm = BoundaryNorm(clevs, len(clevs) - 1)
                    cmap = cm.get_cmap("bwr", len(clevs) - 1)
                else:
                    cmap = "bwr"
                    bnorm = None

                the_mask = field_mask_lakes if vname in [T_AIR_2M, TOTAL_PREC, SWE] else field_mask
                to_plot = np.ma.masked_where(the_mask, field) * internal_name_to_multiplier[vname]


                # ttest
                a = sim_data[sim_label][vname][season] + obs_data[vname][season]  # Calculate the simulation data back from biases
                std_a = mod_label_to_vname_to_season_to_std[sim_label][vname][season]
                nobs_a = mod_label_to_vname_to_season_to_nobs[sim_label][vname][season]

                b = obs_data[vname][season]
                std_b =  obs_vname_to_season_to_std[vname][season]
                nobs_b = obs_vname_to_season_to_nobs[vname][season]



                t, p = ttest_ind_from_stats(mean1=a, std1=std_a, nobs1=nobs_a,
                                            mean2=b, std2=std_b, nobs2=nobs_b, equal_var=False)

                # Mask non-significant differences as given by the ttest
                to_plot = np.ma.masked_where(p > p_crit, to_plot)


                # plot the significance-masked biases

                cs = bsmap.contourf(xx, yy, to_plot, ax=ax, extend=extend, levels=get_clevs(vname + "bias"), cmap=cmap, norm=bnorm)
                bsmap.drawcoastlines(linewidth=coastline_width)
                cb = bsmap.colorbar(cs, ax=ax, location="bottom")

                ax.set_frame_on(vname not in [LAKE_ICE_FRACTION, ])
                cb.ax.set_visible(False)

                if col == 0:
                    ax.set_ylabel("{}\n-\nObs.".format(sim_label))

            # draw biases only for the first simulation
            if draw_only_first_sim_biases:
                break


        # plot differences between the biases
        current_row += 1
        for col, season in enumerate(vname_to_seasonmonths_map[vname]):

            field = sim_data[NEMO_LABEL][vname][season] - sim_data[HL_LABEL][vname][season]

            ax = fig.add_subplot(gs[current_row, col])

            clevs = get_clevs(vname + "bias")
            if clevs is not None:
                bnorm = BoundaryNorm(clevs, len(clevs) - 1)
                cmap = cm.get_cmap("bwr", len(clevs) - 1)
            else:
                cmap = "bwr"
                bnorm = None


            to_plot = field * internal_name_to_multiplier[vname]
            # to_plot = np.ma.masked_where(field_mask, field) * internal_name_to_multiplier[vname]



            # ttest
            a = sim_data[NEMO_LABEL][vname][season] + obs_data[vname][season]  # Calculate the simulation data back from biases
            std_a = mod_label_to_vname_to_season_to_std[NEMO_LABEL][vname][season]
            nobs_a = mod_label_to_vname_to_season_to_nobs[NEMO_LABEL][vname][season]

            b = sim_data[HL_LABEL][vname][season] + obs_data[vname][season]  # Calculate the simulation data back from biases
            std_b = mod_label_to_vname_to_season_to_std[HL_LABEL][vname][season]
            nobs_b = mod_label_to_vname_to_season_to_nobs[HL_LABEL][vname][season]


            t, p = ttest_ind_from_stats(mean1=a, std1=std_a, nobs1=nobs_a,
                                        mean2=b, std2=std_b, nobs2=nobs_b, equal_var=False)

            # Mask non-significant differences as given by the ttest
            to_plot = np.ma.masked_where(p > p_crit, to_plot)


            # mask the points with not sufficient land fraction
            if land_fraction is not None and vname in [SWE, ]:
                to_plot = np.ma.masked_where(land_fraction < 0.1, to_plot)


            # print("land fractions for large differences ", land_fraction[to_plot > 30])


            cs = bsmap.contourf(xx, yy, to_plot, ax=ax, extend=extend, levels=clevs, cmap=cmap, norm=bnorm)
            bsmap.drawcoastlines(linewidth=coastline_width)
            cb = bsmap.colorbar(cs, ax=ax, location="bottom")

            ax.text(0.99, 1.1, season, va="top", ha="right", fontsize=16, transform=ax.transAxes)

            cb.ax.set_visible(col == 0)

            assert isinstance(ax, Axes)
            ax.set_frame_on(False)

            if col == 0:
                ax.set_ylabel("{}\n-\n{}".format(NEMO_LABEL, HL_LABEL))


        # fig.tight_layout()

        # save a figure per variable
        img_file = "seasonal_biases_{}_{}_{}-{}.png".format(vname,
                                                            "-".join([s for s in vname_to_seasonmonths_map[vname]]),
                                                            start_year, end_year)
        img_file = img_folder.joinpath(img_file)

        fig.savefig(str(img_file), dpi=300, bbox_inches="tight")

        plt.close(fig)
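
All of the bias panels above share the same discrete-colour setup: a list of contour levels, a colormap resampled to len(clevs) - 1 colours, and a BoundaryNorm mapping values into those bins. A minimal sketch of that setup on random data follows; it assumes matplotlib >= 3.6 for matplotlib.colormaps, while the cm.get_cmap(name, n) call used above is the older spelling of the same resampling.

# Sketch: discrete bias colours with BoundaryNorm and a resampled colormap.
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import BoundaryNorm

clevs = [-3, -2, -1, -0.5, 0.5, 1, 2, 3]
cmap = matplotlib.colormaps["bwr"].resampled(len(clevs) - 1)
bnorm = BoundaryNorm(clevs, len(clevs) - 1)

bias = 2 * np.random.randn(40, 60)  # synthetic bias field

fig, ax = plt.subplots()
im = ax.pcolormesh(bias, cmap=cmap, norm=bnorm)
fig.colorbar(im, ax=ax)
fig.savefig("boundary_norm_sketch.png", dpi=150)
plt.close(fig)
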
Exemple #40
0
from scipy.stats import ttest_ind_from_stats
import pandas as pd

# fishmeans, birdmeans, birddropmeans and the corresponding *stds lists are
# assumed to be defined earlier (per-condition means and stds over 12 samples each)
df = pd.DataFrame([fishmeans, birdmeans, birddropmeans])
dfstd = pd.DataFrame([fishstds, birdstds, birddropstds])
df.columns = ['TT', 'TF', 'FT', 'FF']
df.index = ['fish', 'bird', '!bird']
dfstd.columns = ['TT', 'TF', 'FT', 'FF']
dfstd.index = ['fish', 'bird', '!bird']
print('12 samples')
print('eigen-neighbors')

# pairwise t-tests (from means/stds, n=12) between all column pairs for each word
ttest = []
for i in range(len(df)):
    for col in df.columns:
        for k in df.columns:
            p = ttest_ind_from_stats(df[col].iloc[i], dfstd[col].iloc[i], 12,
                                     df[k].iloc[i], dfstd[k].iloc[i], 12)[1]
            ttest.append([df.index[i], col, k, p])
            # print('{0}, {1}, {2}, {3}'.format(*ttest[-1]))
ttest = pd.DataFrame(ttest)
ttest.columns = ['word', 'eigen-neighbor 1', 'eigen-neighbor 2', 'p']
ttest.sort_values('p')
#ttest_ind_from_stats(df['TT']['fish'], dfstd['TT']['fish'], 12, df['TF']['fish'], dfstd['TF']['fish'],12)




    def print_def(self, metric, rank=0, col='def'):
        '''
        Would need to pass in a dataframe; self.conx is deprecated.
        '''
#         #print a