Example #1
def run(esetSC2, anoSC2, Control_exp, sPTD_exp_sample, PPROM_exp_sample,
        maxGeneCount):
    sPTD_log2FC = [
        math.log2(np.mean(Control_exp.iloc[c])) -
        math.log2(np.mean(sPTD_exp_sample.iloc[c]))
        for c in range(Control_exp.shape[0])
    ]
    PPROM_log2FC = [
        math.log2(np.mean(Control_exp.iloc[c])) -
        math.log2(np.mean(PPROM_exp_sample.iloc[c]))
        for c in range(Control_exp.shape[0])
    ]

    sPTD_neglog10pval = [
        -math.log10(x)
        for x in ttest(Control_exp, sPTD_exp_sample, axis=1).pvalue
    ]
    PPROM_neglog10pval = [
        -math.log10(x)
        for x in ttest(Control_exp, PPROM_exp_sample, axis=1).pvalue
    ]

    sPTDDiffGenes = runDiffGenes(esetSC2, maxGeneCount, 'Control_vs_sPTD',
                                 sPTD_log2FC, sPTD_neglog10pval)
    PPROMDiffGenes = runDiffGenes(esetSC2, maxGeneCount, 'Control_vs_PPROM',
                                  PPROM_log2FC, PPROM_neglog10pval)
    return sPTDDiffGenes, PPROMDiffGenes
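A note on the pattern above: the per-row list comprehensions can be vectorized. A minimal self-contained sketch, assuming `ttest` is `scipy.stats.ttest_ind` and that rows are genes and columns are samples (the data here is synthetic, for illustration only):

import numpy as np
import pandas as pd
from scipy.stats import ttest_ind as ttest

rng = np.random.default_rng(0)
control = pd.DataFrame(rng.lognormal(size=(100, 8)))   # hypothetical expression matrix
treated = pd.DataFrame(rng.lognormal(size=(100, 6)))   # hypothetical expression matrix

# One log2 fold change and one -log10(p) per gene, no Python-level loop.
log2fc = np.log2(control.mean(axis=1)) - np.log2(treated.mean(axis=1))
neglog10p = -np.log10(ttest(control, treated, axis=1).pvalue)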
Example #2
 def run(self, num_iters: int):
     """
     :param num_iters: Number of iterations we want to run the experiment
     :return: A dictionary with t-test results on various metrics between the random graphs with and without sex-ed
     """
     without_taus, without_ginis, without_freemans, without_mgs = [], [], [], []
     with_taus, with_ginis, with_freemans, with_mgs = [], [], [], []
     for _ in range(num_iters):
         result = self.__run_one_iteration()
         without_taus.append(result['without']['tau'])
         without_ginis.append(result['without']['gini'])
         without_freemans.append(result['without']['freeman'])
         without_mgs.append(result['without']['mg'])
         with_taus.append(result['with']['tau'])
         with_ginis.append(result['with']['gini'])
         with_freemans.append(result['with']['freeman'])
         with_mgs.append(result['with']['mg'])
     ttest_results_tau = ttest(without_taus, with_taus)
     ttest_results_gini = ttest(without_ginis, with_ginis)
     ttest_results_freeman = ttest(without_freemans, with_freemans)
     ttest_results_mgs = ttest(without_mgs, with_mgs)
     return {
         'tau': ttest_results_tau,
         'gini': ttest_results_gini,
         'freeman': ttest_results_freeman,
         'mgs': ttest_results_mgs
     }
Example #3
def tt(A, B):
    f_p = f(A, B).pvalue
    if f_p <= 0.05:
        # Variances differ significantly: fall back to Welch's t-test.
        t_p = ttest(A, B, equal_var=False).pvalue
    else:
        t_p = ttest(A, B, equal_var=True).pvalue
    return t_p
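The `f` helper above is not shown; it appears to be a variance-equality pre-test. A minimal sketch of the same decide-then-test pattern, with `scipy.stats.levene` standing in for `f` (an assumption, not the original helper):

from scipy.stats import levene, ttest_ind as ttest

def tt(A, B):
    # Welch's t-test if the variance test rejects equality, Student's otherwise.
    equal_var = levene(A, B).pvalue > 0.05
    return ttest(A, B, equal_var=equal_var).pvalue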
Example #4
def surface_glm_data(df, marker='coef_', output='t_stat'):
    '''
    Input: beta-weights averaged per label.

    1. Average across sessions
    2. Average / Subtract for mean response, lateral response, etc...
    3. ttest across subjects

    - Arguments:
        a) concatenated beta-weight data in a pd.DataFrame with the columns
            subject, session, parameter, hemisphere, cortical label, value
        b) parameters that shall be lateralized (e.g. response)
        c) columns name of beta-weight values (normally 'coef_')
        d) output t-statistic or p-value
    '''
    if output == 't_stat':
        p_or_t = 0
    elif output == 'p_val':
        p_or_t = 1
    ses_mean = df.groupby(['subject', 'parameter', 'names', 'hemisphere']).\
        mean().reset_index()                                                   # !! average across sessions per subject
    mean_response = ses_mean.loc[ses_mean.parameter.
                                 isin(['response_left_rule_resp_A',
                                       'response_left_rule_resp_B',
                                       'response_right_rule_resp_A',
                                       'response_right_rule_resp_B'])].\
        groupby(['subject', 'names', 'hemisphere']).mean().reset_index()        # response average
    mean_response['parameter'] = 'response_average'

    response_left_minus_right = ses_mean.loc[ses_mean.parameter.isin
                                             (['response_left_rule_resp_A',
                                               'response_left_rule_resp_B'])].\
        groupby(['subject', 'names', 'hemisphere']).mean() -\
        ses_mean.loc[ses_mean.parameter.isin(['response_right_rule_resp_A',
                                              'response_right_rule_resp_B'])].\
        groupby(['subject', 'names', 'hemisphere']).mean()
    response_left_minus_right = response_left_minus_right.reset_index()
    response_right_minus_left = response_left_minus_right.copy()
    response_right_minus_left.coef_ = -response_right_minus_left.coef_
    response_left_minus_right['parameter'] = 'response_left-right'
    response_right_minus_left['parameter'] = 'response_right-left'              # response subtractions
    ses_mean = pd.concat([ses_mean, mean_response,
                          response_left_minus_right,
                          response_right_minus_left], sort=False)
    ses_mean.to_hdf('/Users/kenohagena/Desktop/test.hdf', key='test')
    difference = (ses_mean.loc[ses_mean.hemisphere == 'R'].set_index(['parameter', 'names', 'subject']).drop('hemisphere', axis=1) -
            ses_mean.loc[ses_mean.hemisphere == 'L'].set_index(['parameter', 'names', 'subject']).drop('hemisphere', axis=1)).reset_index()
    difference = difference.groupby(['parameter', 'names']).\
        agg(lambda x: ttest(x, 0)[p_or_t]).reset_index()
    mag = ses_mean.groupby(['parameter', 'names', 'hemisphere']).\
        agg(lambda x: ttest(x, 0)[p_or_t]).reset_index()                        # !! t-test across subjects
    average = mag.groupby(['parameter', 'names']).mean().reset_index()          # !! average across hemispheres
    average = average.pivot(columns='parameter', index='names',
                            values=marker)
    left_H = mag.loc[mag.hemisphere == 'L'].\
        pivot(columns='parameter', index='names', values=marker)
    right_H = mag.loc[mag.hemisphere == 'R'].\
        pivot(columns='parameter', index='names', values=marker)
    difference = difference.pivot(columns='parameter', index='names', values=marker)
    return average, left_H, right_H, difference
Example #5
def surface_plot_data(grouped_df, lateral_params, marker='coef_'):
    '''
    1. Average across sessions
    2a. Ttest per parameter, roi & hemisphere
    2b. Average across hemispheres
    --> magnitude values of coef_ per ROI
    3a. difference between hemispheres (kontra - ipsi)
    3b. average across conditions (response left & right)
    3c. ttest across subjects.
    --> lateralization values of coef_ for response / rresp
    '''
    df = grouped_df
    ses_mean = df.groupby(['subject', 'parameter', 'names',
                           'hemisphere']).mean().reset_index()
    mag = ses_mean.groupby(['parameter', 'names', 'hemisphere'
                            ]).agg(lambda x: ttest(x, 0)[1]).reset_index()
    mag = mag.groupby(['parameter', 'names']).mean().reset_index()
    mag = mag.pivot(columns='parameter', index='names', values=marker)

    for lateral_param in lateral_params:
        lat = ses_mean.loc[ses_mean.parameter.isin([
            '{}_left'.format(lateral_param), '{}_right'.format(lateral_param)
        ])]
        lat.set_index(['subject', 'parameter', 'names', 'hemisphere', 'labs'],
                      inplace=True)
        lat = lat.groupby(['parameter'],
                          group_keys=False).apply(lateralize).reset_index()
        lat = lat.groupby(['names', 'subject']).mean().reset_index()
        lat = lat.groupby(['names'
                           ]).agg(lambda x: ttest(x, 0)[1]).reset_index()
        mag['{}_lat'.format(lateral_param)] = lat[marker].values
    return mag
Example #6
 def kmeans(self, unscaled_data, k, random_state=42):
     """
     Get feature mean and feature description for each cluster
     Params:
         unscaled_data: pandas dataframe of shape (n_samples, n_features) for data with outliers removed and not scaled
         k: number of clusters
         random_state: random_state of kmeans
     Returns: num_out, sum2
     """
     kmean = KMeans(n_clusters=k, random_state=random_state)
     k_cluster = kmean.fit_predict(self.data)

     unscaled_data['Cluster'] = k_cluster
     num_out = unscaled_data.groupby('Cluster').mean()

     # Label each cluster/feature cell High, Low, or Average by comparing the
     # cluster's values to the full column with a Welch t-test.
     sum2 = pd.DataFrame(np.zeros(num_out.shape),
                         columns=num_out.columns, index=num_out.index)
     for i in range(num_out.shape[0]):
         for j in range(num_out.shape[1]):
             var = unscaled_data.columns[j]
             cluster_vals = unscaled_data.loc[unscaled_data['Cluster'] == i, var]
             all_vals = unscaled_data.loc[:, var]
             p = ttest(cluster_vals, all_vals, equal_var=False)[1]
             if p < 0.001 and cluster_vals.mean() > all_vals.mean():
                 sum2.iloc[i, j] = 'High'
             elif p < 0.001 and cluster_vals.mean() < all_vals.mean():
                 sum2.iloc[i, j] = 'Low'
             else:
                 sum2.iloc[i, j] = 'Average'

     return num_out, sum2
Example #7
def run_ttest(calls, prty):
    call_list = _filter_priority(calls, prty)

    return pd.Series([
        ttest(count_it(call_list, 2015, prty), count_it(call_list, 2016,
                                                        prty))[1],
        ttest(count_it(call_list, 2016, prty), count_it(call_list, 2017,
                                                        prty))[1]
    ],
                     index=[2016, 2017],
                     name=prty)
Example #8
def experiment(true_exp_mean: float, true_control_mean: float, inter_day_SD: float,
               intra_day_SD: float, N_clusters: int, N_per_cluster: int,
               data_method: str = 'pool', ttest_method: bool = True,
               show_figure: bool = False):
    """ This module generates data and does the processing.
        There are several types of processing.
        By default it uses a simple t-test on pooled data (ignoring clustering).

    INPUT:  1) the parameters for data generating
            2) data_method = {'pool', 'cluster_means'}, optional
                   choose the type of data to process further
                   ( if 'pool', use the pooled data
                     elif 'cluster_means' use the means of clusters )
            3) ttest_method: bool, optional
                   choose what type of ttest to apply
                   ( if True, use simple t-test
                     else use the adjusted t-test )
            4) show_figure: bool, optional
                   decide if you want to see the figure of your data
                   by default it's off

    OUTPUT: hypothesis and p-value of experiment result

    EXAMPLE_OF_USE: experiment(1, 1, 0.1, 0.2, 3, 5) """

    # generate a matrix of data
    data_exp = generate_data(true_exp_mean, inter_day_SD, intra_day_SD, N_clusters, N_per_cluster)
    data_control = generate_data(true_control_mean, inter_day_SD, intra_day_SD, N_clusters, N_per_cluster)
    # do the processing
    #process_data()      FIXME
    #ipdb.set_trace()
    if data_method == 'pool': # use pooled data for processing
        data_exp_pooled = data_exp.reshape(-1).tolist() # pool the data into a list
        data_control_pooled = data_control.reshape(-1).tolist()
        #print(data_exp, data_control)
        if ttest_method:
            t, p_value = ttest(data_exp_pooled, data_control_pooled) # use simple t-test
        else: # use adjusted t-test
            t, p_value = adj_ttest(N_per_cluster, N_clusters, inter_day_SD, intra_day_SD, data_exp_pooled, data_control_pooled)
    elif data_method == 'cluster_means':# use means of clusters for processing
        data_exp_mean = data_exp.mean(axis=0)
        data_control_mean = data_control.mean(axis=0)
        if ttest_method:
            t, p_value = ttest(data_exp_mean, data_control_mean) # calculate t-test and check a hypothesis
        else:
            print('can\'t do adjusted t-test on means of clusters. Need pooled data')
            return
    # display data
    if show_figure:
        display_data(data_exp, data_control, N_clusters, N_per_cluster)

    return t, p_value
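`generate_data` is not shown above. A minimal sketch of what it plausibly does, given that the result is pooled with `reshape(-1)` and reduced to cluster means with `mean(axis=0)` (hypothetical, for illustration only):

import numpy as np

def generate_data(true_mean, inter_day_SD, intra_day_SD, N_clusters, N_per_cluster):
    # One column per cluster (day): cluster means vary by inter_day_SD,
    # individual observations vary around them by intra_day_SD.
    cluster_means = np.random.normal(true_mean, inter_day_SD, N_clusters)
    return np.random.normal(cluster_means, intra_day_SD,
                            size=(N_per_cluster, N_clusters))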
Example #9
def make_comparisons(df, fi, cond_di, out_path):
    '''
    Open comparison file, where each line is one comparison to do. For example,
    to compare treatmentA to control, the comparison file line would be:

    treatmentA,control

    Comparison divides the average treatmentA value by the average control value
    '''
    with open(fi, 'r') as f:
        li = [l for l in f.read().split('\n') if l]
    tdf = pd.DataFrame()
    for comp in li:
        cdf = pd.DataFrame()
        n, d = comp.split(',')
        name = 'vs'.join([n, d])
        # Calculate fold change
        cdf['foldChange'] = df['AVG_{}'.format(n)] / df['AVG_{}'.format(d)]
        # Run T-Test to get p-value
        #  - will spit errors if all zeros for expression, but can ignore
        cdf['pValue'] = ttest(df[cond_di[n]],
                              df[cond_di[d]],
                              axis=1,
                              equal_var=True)[1]
        cdf = cdf.replace([np.inf, np.nan], 'NA')
        tdf = pd.concat([tdf, cdf], axis=1)
        cdf['AVG_{}'.format(n)] = df['AVG_{}'.format(n)]
        cdf['AVG_{}'.format(d)] = df['AVG_{}'.format(d)]
        cdf = cdf.sort_values(by=['pValue'], ascending=True)
        cdf.to_csv(path_or_buf='{}/{}.csv'.format(out_path, name), sep=',')

    df = pd.concat([tdf, df], axis=1)
    return df
Example #10
def get_pval(scores1, scores2):
    scores_joined = scores1.join(scores2, how='inner', lsuffix='_1', rsuffix='_2')
    ttest_results = ttest(scores_joined['score_1'], scores_joined['score_2'])
    pval = ttest_results[1]
    if np.isnan(pval):
        pval = 0
    return pval
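A usage sketch for `get_pval`, assuming `ttest` is a SciPy two-sample test and each argument is a one-column DataFrame of scores indexed by a shared key (names and values here are hypothetical):

import pandas as pd

s1 = pd.DataFrame({'score': [0.71, 0.64, 0.80]}, index=['a', 'b', 'c'])
s2 = pd.DataFrame({'score': [0.69, 0.60, 0.75]}, index=['b', 'c', 'd'])
print(get_pval(s1, s2))  # the inner join keeps only the shared keys 'b' and 'c'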
Example #11
    def post(self):
        """
        Analyses the results from a data dissemination quiz and gives the result.
        The result can be either that stereotypes were found, or that they weren't found.
        """
        validators = {
            "data": valid.validate_dissemination_answers,
        }

        data = valid.validate(valid.read_form_data(request), validators)
        if not data:
            return ANSWERS[400], 400

        data = valid.read_form_data(request)

        question3, block_3_answers = self.get_block_information(2, data)
        question5, block_5_answers = self.get_block_information(4, data)

        t_statistic, p_value = ttest(block_3_answers, block_5_answers, equal_var=False)

        response = DISSEMINATION_NO_ASSOCIATION

        if p_value <= 0.1:
            if t_statistic < 0:
                response = DISSEMINATION_RESULT_MALE
            else:
                response = DISSEMINATION_RESULT_FEMALE

        if 'email' in data and valid.validate_email(data['email']):
            self.send_email(res=response, email=data['email'])

        return response, 200
Example #12
 def summarize(self, question_scores=None, summary_stats=('mean', )):
     if not question_scores:
         question_scores = self.question_scores
     summary = defaultdict(lambda: defaultdict(lambda: defaultdict()))
     for summary_stat in summary_stats:
         print '=========== %s ===========' % summary_stat
         for question, agent_scores in question_scores.iteritems():
             if self.question_type(
                     question) == 'str' or question not in self.questions:
                 continue
             for agent, scores in agent_scores.iteritems():
                 print agent, np.histogram([x[2] for x in scores],
                                           bins=5)[0]
             results = [(agent, self.summarize_scores(scores, summary_stat),
                         self.get_total(scores))
                        for agent, scores in agent_scores.iteritems()]
             results = sorted(results, key=lambda x: x[1][0], reverse=True)
             agent_ratings = {}
             for i, (agent, stat, total) in enumerate(results):
                 agent_ratings[agent] = stat[1]
                 summary[question][agent]['score'] = stat[0]
                 summary[question][agent]['sem'] = sem(
                     stat[1]) if len(stat[1]) > 1 else 0
                 summary[question][agent]['total'] = total
                 summary[question][agent]['ttest'] = ''
             # T-test
             agents = self.agents
             for i in range(len(agents)):
                 for j in range(i + 1, len(agents)):
                     try:
                         result = ttest(agent_ratings[agents[i]],
                                        agent_ratings[agents[j]])
                     except KeyError:
                         continue
                     #print agents[i], agents[j], result
                     t, p = result
                     if p < 0.05:
                         if t > 0:
                             win_agent, lose_agent = agents[i], agents[j]
                         else:
                             win_agent, lose_agent = agents[j], agents[i]
                         summary[question][win_agent][
                             'ttest'] += lose_agent[0]
         # Print
         for question, agent_stats in summary.iteritems():
             print '============= %s ===============' % self.question_labels[
                 question]
             print '{:<12s} {:<10s} {:<10s} {:<10s} {:<10s}'.format(
                 'agent', 'avg_score', 'error', '#score', 'win')
             print '---------------------------------------'
             for i, agent in enumerate(agents):
                 stats = agent_stats[agent]
                 try:
                     print '{:<12s} {:<10.1f} {:<10.2f} {:<10d} {:<10s}'.format(
                         self.agent_labels[agent], stats['score'],
                         stats['sem'], stats['total'], stats['ttest'])
                 except KeyError:
                     continue
     return summary
Example #13
def second_level(first_level_tests):
    mu = np.zeros(shape=(len(list(first_level_tests))))
    t, p = ttest(first_level_tests.values, popmean=mu, axis=0)
    res_df = pd.DataFrame(index=['t', 'p'], columns=list(first_level_tests))
    res_df.loc['t'] = t
    res_df.loc['p'] = p

    return res_df
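The `popmean` keyword above matches `scipy.stats.ttest_1samp`. A self-contained sketch under that assumption, with synthetic first-level statistics:

import numpy as np
import pandas as pd
from scipy.stats import ttest_1samp as ttest

# Each column holds one first-level statistic across 20 hypothetical subjects;
# second_level tests every column against a population mean of zero.
first_level = pd.DataFrame(np.random.randn(20, 3), columns=['c1', 'c2', 'c3'])
print(second_level(first_level))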
Example #14
def test_ttest(nrep,nqs):
    pvec = []
    g1 = np.round(5*np.random.random([nrep,nqs]))
    g2 = np.round(5*np.random.random([nrep,nqs]))
    for i in range(nqs):
        t,p = ttest(g1[:,i],g2[:,i])
        pvec.append(p)
    return pvec
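A quick sanity check on the example above, assuming `ttest` is `scipy.stats.ttest_ind`: both groups are drawn identically, so under the null roughly 5% of the per-question p-values should land below 0.05 by chance.

import numpy as np

pvec = test_ttest(nrep=50, nqs=200)
print(np.mean(np.array(pvec) < 0.05))  # expect a value near 0.05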
Example #15
def eval_scores_vs(baseline_scores, model_scores, badness_threshold):
    ''' Print and display a comparison of two sets of scores against each other '''
    diffs = np.array(model_scores) - np.array(baseline_scores)
    print(np.mean(baseline_scores), np.mean(model_scores))
    print("t-test", ttest(baseline_scores, model_scores))
    print("z-score", np.mean(diffs) / np.std(diffs))
    maxx = np.max(np.abs(diffs))
    print("baseline below thresh", (np.array(baseline_scores) < badness_threshold).mean())
    print("model below thresh", (np.array(model_scores) < badness_threshold).mean())
Example #16
def eval_scores_vs(baseline_scores, model_scores, badness_threshold):
    diffs = np.array(model_scores) - np.array(baseline_scores)
    print(np.mean(baseline_scores), np.mean(model_scores))
    print("t-test", ttest(baseline_scores, model_scores))
    print("z-score", np.mean(diffs) / np.std(diffs))
    maxx = np.max(np.abs(diffs))
    print("baseline below thresh",
          (np.array(baseline_scores) < badness_threshold).mean())
    print("model below thresh",
          (np.array(model_scores) < badness_threshold).mean())
Example #17
 def pulsevarUpdate(self, acc_vec):
     # --- Test if higher than accthr
     try:
         r = ttest(acc_vec, self.accthr)
     except ZeroDivisionError:
         if all(a <= 0 for a in acc_vec):
             return self.pulse_decrease
         elif all(a >= 0 for a in acc_vec):
             return self.pulse_increase
         else:
             return self.pulse_nochange
     if r[0] > 0 and r[1] < self.halfa:
         return self.pulse_increase
     # --- Test if lower than accthr
     r = ttest(acc_vec, max(0.01, self.accthr - 0.15))
     if r[0] < 0 and r[1] < self.halfa:
         return self.pulse_decrease
     # --- Return default
     return self.pulse_nochange
Example #18
def process_data(data_exp, data_control, N_per_cluster, N_clusters, \
                 inter_cluster_SD, intra_cluster_SD, data_method, ttest_method):
    """ This is the function to process data.
    There are several types of processing.
    By default it uses a simple t-test on pooled data (ignoring clustering).
    INPUT: 1) the parameters for data generating
            2) data_method = {'pool', 'cluster'}, optional
               choose the type of data to process further
               ( if 'pool', use the pooled data
                 elif 'cluster' use the means of clusters )
            3) ttest_method = {'simple', 'adjusted'}, optional
               choose what type of ttest to apply. For more information read methods.md
     """

    if data_method == 'pool':  # use pooled data for processing
        # pool the data into a list:
        data_exp_pooled = data_exp.reshape(-1)
        data_control_pooled = data_control.reshape(-1)
        #print(data_exp, data_control)
        if ttest_method == 'simple':
            # use simple t-test
            t, p_value = ttest(data_exp_pooled, data_control_pooled)
        elif ttest_method == 'adjusted':  # use adjusted t-test
            t, p_value = adj_ttest(N_per_cluster, N_clusters, inter_cluster_SD, \
            intra_cluster_SD, data_exp_pooled, data_control_pooled)
        else:
            print('insert correct t-test method')
    elif data_method == 'cluster':  # use means of clusters for processing
        data_exp_mean = data_exp.mean(axis=0)
        data_control_mean = data_control.mean(axis=0)
        if ttest_method == 'simple':
            t, p_value = ttest(data_exp_mean, data_control_mean)
        elif ttest_method == 'adjusted':
            print('can\'t do adjusted t-test. Need pooled data')
            return
        else:
            print('insert correct t-test method')
    return p_value
Example #19
def run_second_level(group_df):
    mus = np.zeros(len(list(group_df)))
    tvals, pvals = ttest(proj_utils.clean_df_to_numpy(group_df),
                         popmean=mus,
                         axis=0)

    res_df = pd.DataFrame(index=['t_values', 'p_values'],
                          columns=list(group_df))
    res_df.iloc[0] = tvals
    del tvals
    res_df.iloc[1] = pvals
    del pvals

    return res_df
Example #20
def main():
	cv_scores=pd.read_csv('./aty_best_arch_cv.csv',index_col=0)
	cv_scores.columns=['model_name','cv_loss','cv_std']


	best_assay=cv_scores[cv_scores['cv_loss']==cv_scores['cv_loss'].min()]

	equal_combinations=[]
	for index,combin in cv_scores.iterrows():
		# print(combin)
		t,p=ttest(best_assay['cv_loss'],best_assay['cv_std'],10,combin['cv_loss'],combin['cv_std'],10)
		if p>=0.05:
			equal_combinations.append(combin)
	print(best_assay)
	print(pd.DataFrame(equal_combinations).sort_values(['cv_loss']))
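The six-argument call above (mean, std, and n for each group) does not fit `scipy.stats.ttest_ind`; it is consistent with `scipy.stats.ttest_ind_from_stats`, which tests from summary statistics rather than raw samples. A minimal sketch:

from scipy.stats import ttest_ind_from_stats as ttest

# Compare two models from their CV summary statistics alone
# (hypothetical numbers; nobs=10 mirrors the hard-coded 10 in the call above).
t, p = ttest(mean1=0.52, std1=0.04, nobs1=10,
             mean2=0.55, std2=0.05, nobs2=10)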
Example #21
 def run_test(self, num_timesteps_back, alpha=0.05):
     results = []
     for fn, data in self.stats.items():
         try:
             adf_res = adf(data[-num_timesteps_back:])[1] < alpha
         except ValueError as e:
             adf_res = None
         try:
             ttest_res = ttest(
                 data[int(-num_timesteps_back):int(-num_timesteps_back /
                                                   2)],
                 data[int(-num_timesteps_back / 2):])[1] > alpha
         except ValueError as e:
             ttest_res = None
         results.append(adf_res and ttest_res)
     return np.all(results)
Example #22
 def pulsevarUpdate(self, acc_vec):
     # --- Test if mean(acc_vec) is equal to accthr
     try:
         r = ttest(acc_vec, self.accthr)
     except ZeroDivisionError:
         if all(a <= 0 for a in acc_vec) and self.allow_pdecr:
             return self.pulse_decrease
         elif all(a >= 0 for a in acc_vec) and self.allow_pincr:
             return self.pulse_increase
         else:
             return self.pulse_nochange
     # ---
     if r[1] >= self.halfa: return self.pulse_nochange
     if r[0] > 0 and self.allow_pincr: return self.pulse_increase
     if r[0] < 0 and self.allow_pdecr: return self.pulse_decrease
     # --- Return default
     return self.pulse_nochange
Example #23
def H4_testing(exp_names, trials, load_dir):
    write_file = 'hLogs/H4.txt'
    mean_sims = []
    std_sims = []
    for name in exp_names:
        sims = []
        for i in range(trials):
            print(i)
            load_file = load_dir + name + str(i) + '.pkl'
            with open(load_file, 'rb') as f:
                if 'ST' in name:
                    pop, _, _, _, _, _, _, _ = pickle.load(f)
                else:
                    pop, _, _, _, _, _, _ = pickle.load(f)
            sim = get_trainset_similarity(pop)
            sims.append(sim)
        mean_sim = np.mean(sims)
        mean_sims.append(mean_sim)
        std_sim = np.std(sims)
        std_sims.append(std_sim)

        if 'DIF_SF' in name:
            mean_sim_naive = mean_sim
            std_sim_naive = std_sim

    for i in range(len(exp_names)):
        name = exp_names[i]
        mean_sim = mean_sims[i]
        std_sim = std_sims[i]
        if 'DIF' in name:
            with open(write_file, 'a+') as f:
                f.write(name + '\n')
                f.write('Mean Hausdorffs: {} \n'.format(mean_sim))
                f.write('STD Hausdorffs: {} \n'.format(std_sim))
                f.write('\n')
        if 'DIT' in name:
            pct_improve = -(mean_sim - mean_sim_naive) / mean_sim_naive
            pval_naive = ttest(mean_sim, std_sim, trials, mean_sim_naive,
                               std_sim_naive, trials)
            with open(write_file, 'a+') as f:
                f.write(name + '\n')
                f.write('Mean Hausdorffs: {} \n'.format(mean_sim))
                f.write('STD Hausdorffs: {} \n'.format(std_sim))
                f.write('Mean Percent Improvement: {} \n'.format(pct_improve))
                f.write('Pval Naive: {} \n'.format(pval_naive))
                f.write('\n')
Example #24
def update_stats(x_val, y_val, n):
    confidence = [.9, .95, .97, .99]
    a = round(1 - confidence[n], 2)
    df = pd.read_csv("../data/style_plus_beer.csv")
    df_style_abv = df[['Super Style', 'abv']]

    df_x = df_style_abv[df_style_abv['Super Style'] == x_val]
    df_y = df_style_abv[df_style_abv['Super Style'] == y_val]

    results = ttest(np.array(df_x['abv'].dropna()),
                    np.array(df_y['abv'].dropna()),
                    equal_var=False)
    p_value = round(results[1], 4)
    if p_value > a:
        output_p = f"p = {p_value}, which is greater than our alpha value of {a}, therefore we fail to reject the null hypothesis. Then we are {round(confidence[n]*100,0)}% confident that there is no difference between the averge values of the 2 populations."
    else:
        output_p = f"p = {p_value}, which is less than our alpha value of {a}, therefore we reject the null hypothesis. Then we are {round(confidence[n]*100,0)}% confident that there is a statistical difference between the average values of the 2 populations."
    return output_p
Example #25
def summarize(question_scores):
    summary = defaultdict(lambda: defaultdict(lambda: defaultdict()))
    for summary_stat in ('mean', ):
        #print '=========== %s ===========' % summary_stat
        for question, agent_scores in question_scores.iteritems():
            if question == 'comments' or question.endswith('text'):
                continue
            results = [(agent, summarize_scores(scores, summary_stat),
                        get_total(scores))
                       for agent, scores in agent_scores.iteritems()]
            results = sorted(results, key=lambda x: x[1][0], reverse=True)
            agent_ratings = {}
            for i, (agent, stat, total) in enumerate(results):
                agent_ratings[agent] = stat[1]
                summary[question][agent]['score'] = stat[0]
                summary[question][agent]['total'] = total
                summary[question][agent]['ttest'] = ''
            # T-test
            agents = ('human', 'rulebased', 'static-neural', 'dynamic-neural')
            for i in range(len(agents)):
                for j in range(i + 1, len(agents)):
                    result = ttest(agent_ratings[agents[i]],
                                   agent_ratings[agents[j]])
                    #print agents[i], agents[j], result
                    t, p = result
                    if p < 0.05:
                        if t > 0:
                            win_agent, lose_agent = agents[i], agents[j]
                        else:
                            win_agent, lose_agent = agents[j], agents[i]
                        summary[question][win_agent]['ttest'] += lose_agent[0]
        # Print
        agent_labels = ('Human', 'Rule-based', 'StanoNet', 'DynoNet')
        for question, agent_stats in summary.iteritems():
            print '============= %s ===============' % question.upper()
            print '{:<12s} {:<10s} {:<10s} {:<10s}'.format(
                'agent', 'avg_score', '#score', 'win')
            print '---------------------------------------'
            for i, agent in enumerate(agents):
                stats = agent_stats[agent]
                print '{:<12s} {:<10.1f} {:<10d} {:<10s}'.format(
                    agent_labels[i], stats['score'], stats['total'],
                    stats['ttest'])
    return summary
Example #26
def main():
    cv_scores=pd.read_csv('./aty_best_arch_cv.csv',index_col=0)
    cv_scores.columns=['model_name','cv_loss','cv_std']
    ## This accesses the aty_best_arch_cv file, labels the three columns 'model_name', 'cv_loss' and 'cv_std',
    ## and assigns the result to the variable cv_scores
    best_assay=cv_scores[cv_scores['cv_loss']==cv_scores['cv_loss'].min()]
    ## best_assay is a data frame object which takes the model with the least cv_loss
    ## equal_combinations is an empty list variable.
    equal_combinations=[]
    for index,combin in cv_scores.iterrows():
        ## The above iterates through every row of the data frame and accesses each row's data
        t,p=ttest(best_assay['cv_loss'],best_assay['cv_std'],10,combin['cv_loss'],combin['cv_std'],10)
        ## A t-test is performed for each data row against the row with the smallest cv_loss value
        ## The t-test assumes that both data sets have equal variance, i.e. they passed the null hypothesis test (F-test)
        if p>=0.05:
            equal_combinations.append(combin)
        ## The row is added to the list if it is not significantly different from best_assay according to the t-test
    print(best_assay)
    print(pd.DataFrame(equal_combinations).sort_values(['cv_loss']))
Example #27
def differential_analysis(test_norm, control_norm):
    '''
    Calculates fold change on log-transformed data.
    ========
    Parameters:
    test_norm: pandas.DataFrame, a dataframe of normalized test data, rows as cells and columns as features.
    control_norm: pandas.DataFrame, a dataframe of normalized control data, rows as cells and columns as features.
    '''
    report = pd.DataFrame(
        columns=['logFC', 'T_pValue', 'KS_pValue', 'adj_T_pVal', 'adj_KS_pVal'])
    for feature in control_norm.columns:
        fc = test_norm[feature].mean() - control_norm[feature].mean()
        pval = ttest(control_norm[feature], test_norm[feature])
        ks_pval = ks_2samp(control_norm[feature], test_norm[feature])[1]
        report.loc[feature, 'logFC'] = np.round(fc, 2)
        report.loc[feature, 'T_pValue'] = pval[1]
        report.loc[feature, 'KS_pValue'] = ks_pval
    report['adj_T_pVal'] = fdr(report.T_pValue)[1]
    report['adj_KS_pVal'] = fdr(report.KS_pValue)[1]
    return report
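The `fdr(...)[1]` call shape above matches `statsmodels.stats.multitest.fdrcorrection`, which returns a (reject flags, adjusted p-values) pair; whether that is the actual helper here is an assumption. A minimal sketch:

import numpy as np
from statsmodels.stats.multitest import fdrcorrection as fdr

pvals = np.array([0.001, 0.02, 0.04, 0.30])
adj_pvals = fdr(pvals)[1]  # Benjamini-Hochberg adjusted p-values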
Example #28
def H1_testing(exp_names, trials, load_dir):
    write_file = 'hLogs/H1.txt'
    for name in exp_names:
        init_scores = []
        final_scores = []
        pct_improves = []
        for i in range(trials):
            load_file = load_dir + name + str(i) + '.pkl'
            with open(load_file, 'rb') as f:
                if 'ST' in name:
                    _, _, learn_curve, _, _, _, _, _ = pickle.load(f)
                else:
                    _, learn_curve, _, _, _, _, _ = pickle.load(f)
            init_score = learn_curve[0]
            final_score = learn_curve[-1]
            diff_score = final_score - init_score
            pct_improve = diff_score / init_score

            init_scores.append(init_score)
            final_scores.append(final_score)
            pct_improves.append(pct_improve)

        mean_init = np.mean(init_scores)
        std_init = np.std(init_scores)
        mean_final = np.mean(final_scores)
        std_final = np.std(final_scores)
        mean_pct_improve = -np.mean(pct_improves)
        std_pct_improve = np.std(pct_improves)
        p_diff = ttest(mean_final, std_final, trials, mean_init, std_init,
                       trials)

        with open(write_file, 'a+') as f:
            f.write(name + '\n')
            f.write('Mean Initial Score: {} \n'.format(mean_init))
            f.write('STD Initial Score: {} \n'.format(std_init))
            f.write('Mean Final Score: {} \n'.format(mean_final))
            f.write('STD Final Score: {} \n'.format(std_final))
            f.write('Mean Percent Improvement: {} \n'.format(mean_pct_improve))
            f.write('STD Percent Improvement: {} \n'.format(std_pct_improve))
            f.write('Pval Initial-Final Difference: {} \n'.format(p_diff))
            f.write('\n')
Example #29
    def postPreTTest(self, df, isTreatmentGroup=1, equal_var=True, groupbyidentifier=True):
        """
        Run a T-test on the pre group versus the post group, grouped by the identifier and taking the mean.

        We are comparing the means of each identifier before and after the test.
        """
        postTreatmentDataFrame, preTreatmentDataFrame = self._ttest_with_identifier_aggregation_setup(df, isTreatmentGroup)

        if groupbyidentifier:
            postTreatmentDataFrame = postTreatmentDataFrame.groupby(["identifier"]).mean()
            preTreatmentDataFrame = preTreatmentDataFrame.groupby(["identifier"]).mean()
        testStatistic, pValue = ttest(
            np.array(postTreatmentDataFrame["kpi"]),
            np.array(preTreatmentDataFrame["kpi"]),
            equal_var=equal_var
        )
        
        estimate = np.mean(np.array(postTreatmentDataFrame["kpi"])) - np.mean(np.array(preTreatmentDataFrame["kpi"]))
        return {"test statistic": testStatistic,
                "p-value": pValue,
                "estimate": estimate}
Example #30
def sampleDiff(sa, sb, fcmax, fcmin, size=1000):
    """
    Sampling cell numbers to a specific size and getting the differential test results.
    @param sa: np.array
    @param sb: np.array
    @param fcmax: float, upper cap on log2 fold change
    @param fcmin: float, lower cap on log2 fold change
    @param size: int
    """
    #sampling
    csa = list(range(len(sa)))
    csb = list(range(len(sb)))
    random.shuffle(csa)
    random.shuffle(csb)
    csa = csa[:size]
    csb = csb[:size]
    #estimating
    a = sa[csa]
    b = sb[csb]
    ea = a[a > 0]
    eb = b[b > 0]
    ea = float(len(ea)) / len(a)
    eb = float(len(eb)) / len(b)
    t, p = ttest(a, b)
    ma = np.mean(a)
    mb = np.mean(b)
    fc = np.log2(ma) - np.log2(mb)
    if fc > fcmax:
        fc = fcmax
    if fc < fcmin:
        fc = fcmin
    rs = {
        "fc": fc,
        "meanExp": ma,
        "expRatio": ea,
        "meanExpOthers": mb,
        "expRatioOthers": eb,
        "p-value": p,
        "expRatioDiff": ea - eb
    }
    rs = pd.Series(rs)
    return rs
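A usage sketch for `sampleDiff` with synthetic counts, assuming `ttest` is `scipy.stats.ttest_ind` and that the module-level imports include `random`, `numpy as np`, and `pandas as pd` (all used inside the function):

import numpy as np

sa = np.random.poisson(2.0, 5000).astype(float)  # hypothetical cluster expression
sb = np.random.poisson(1.5, 8000).astype(float)  # hypothetical background expression
print(sampleDiff(sa, sb, fcmax=5, fcmin=-5, size=1000))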
Example #31
def checkDiff(mata,matb,fout,fccut=1,pcut=1e-5,expr=0.2):
    """
    @param mata: pd.DataFrame, matrix for cluster a
    @param matb: pd.DataFrame, matrix for other clusters
    @param fout: str, output file prefix
    @param fccut: float, log2 fold change cutoff
    @param pcut: float, t-test p-value cutoff
    @param expr: float, minimum fraction of expressed cells
    """
    s = mata.sum(axis=1)
    s = s[s>=s.median()]
    ns = s.index
    rs = {}
    ts = []
    for n in tqdm(mata.index):
        a = mata.loc[n,]
        b = matb.loc[n,]
        ea = a[a>0]
        eb = b[b>0]
        ea = float(len(ea)) / len(a)
        eb = float(len(eb)) / len(b)
        t,p = ttest(a,b)
        ma = np.mean(a)
        mb = np.mean(b)
        fc = np.log2(ma)-np.log2(mb)
        rs[n] = {"fc":fc,"meanExp":ma,"expRatio":ea,"meanExpOthers":mb,"expRatioOthers":eb,"p-value":p,"expRatioDiff": ea-eb}
        if fc > fccut and p < pcut and ea >= expr and n in ns:
            rs[n]["sig"] = 1
            ts.append( n  )
        else:
            rs[n]["sig"] = -1
    rs = pd.DataFrame(rs).T
    rs = rs.fillna(0)
    rs.to_csv(fout+".txt",sep="\t")
    s = rs.loc[ts,"fc"]
    s = s.sort_values(ascending=False)
    print(s)
    with open(fout+".list","w") as fo:
        ns = [t.split("|")[0] for t in s.index]
        fo.write("\n".join(ns))
    return s.index
Example #32
def p_table(data, interest, reference, intcap='Conditions of interest', refcap='',
            caption='', label='', width=0.9, mode='rel', means_only=True):  # make table with t-test p-values
    if mode == 'rel':
        from scipy.stats import ttest_rel as ttest
    #~ if mode == 'ind':
    #~     from scipy.stats import ttest_ind as ttest

    len_compare = len(reference) - 1
    len_interest = len(interest) - 1
    table_form = '{r|' + 'Y|' * len_compare + '}'
    first_line = intcap + '& \\multicolumn{' + str(len_compare) + '}{c}{' + refcap + '}\\\\\n'
    second_line = '&' + '&'.join([str(i) for i in reference[1:]]) + '\\\\\n' + '\\cline{2-' + str(len(reference)) + '}\n'

    end_tabular = '\\end{tabularx}\n'
    caption = '\\caption{' + caption + '}\n'
    label = '\\label{' + label + '}\n'
    footer = '\\end{center}\n \\end{table}'

    latex = '\\begin{table}[!htbp]\n \\begin{center}\n \\begin{tabularx}{' + str(width) + '\\textwidth}' + table_form
    latex += first_line
    latex += second_line

    for i in interest[1:]:
        line = i
        for r in reference[1:]:
            # Paired t-test on per-subject (ID) mean RTs for the two conditions.
            cell = '&' + tex_nr(ttest(data[(data[interest[0]] == i)].groupby('ID')['RT'].mean(),
                                      data[(data[reference[0]] == r)].groupby('ID')['RT'].mean())[1])
            line += cell
        line += '\\\\\n'
        latex += line

    latex += end_tabular
    latex += caption
    latex += label
    latex += footer

    return latex
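The cell computation above pairs observations by taking per-subject mean RTs before calling `ttest_rel`. A self-contained sketch of just that comparison (synthetic data; `cond`, `ID`, and `RT` are hypothetical column names standing in for the condition columns used above):

import numpy as np
import pandas as pd
from scipy.stats import ttest_rel as ttest

rng = np.random.default_rng(1)
data = pd.DataFrame({'ID': np.repeat(np.arange(10), 2),
                     'cond': ['A', 'B'] * 10,
                     'RT': rng.normal(500, 50, 20)})
a = data[data['cond'] == 'A'].groupby('ID')['RT'].mean()
b = data[data['cond'] == 'B'].groupby('ID')['RT'].mean()
p = ttest(a, b)[1]  # paired by subject via the shared ID index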
Example #33
#using model_comparison without genre as a feature
recall, precision, f1_score = model_comparison(features, label_to_numbers(labels, dataset_class_histogram(dataset)), models, parameters_to_optimize, False)

#printing results without genre as a parameter
print ("without genre as a parameter \n Score \n")
print np.average(f1_score, axis=0)

if len(models) > 2:
#    print [f1_score[:,i].T for i in range(len(models))]
    print "Anova: ", f_oneway( *f1_score.T  )[1]
    
    
else:
    
    print "T-test:", ttest( f1_score[:,0].T,  f1_score[:,1].T)[1]


# #using model_comparison with genre as a feature
# recall, precision, f1_score = model_comparison(features, label_to_numbers(labels, dataset_class_histogram(dataset)), models, parameters_to_optimize, True)

# #printing results with genre as a parameter
# print ("genre is a feature \n")
# print np.average(f1_score, axis=0)

# if len(models) > 2:
# #    print [f1_score[:,i].T for i in range(len(models))]
#     print "Anova: ", f_oneway( *f1_score.T  )[1]
    
    
# else:
Example #34
burns_slice = burns_slice[['KEY', 'AGE', 'age_bins', 'DIED', 'DX_BURN']]
burns_agerangegrouping = burns_slice[['AGE','age_bins','DIED']].dropna().sort('AGE').groupby('age_bins')
burns_deathrates_proto1 = burns_agerangegrouping.aggregate([np.sum, 'count'])
burns_deathrates_proto1['DIED']

#create death rate variable
burns_deathrates_proto1['death_rate'] = burns_deathrates_proto1['DIED']['sum'] / burns_deathrates_proto1['DIED']['count']

#plot
f, ax = plt.subplots(figsize=(6, 5))
agerange_mortality_plot = sns.barplot(x=burns_deathrates_proto1.index, y=burns_deathrates_proto1['death_rate'], color='maroon')
agerange_mortality_plot.axes.set(title="Mortality rate by age range (CA only, burn victims only)", xlabel='Age ranges', ylabel='mortality rate')

#do ttest on deathrate by age range
stats.ttest(burns_deathrates_proto1[['death_rate']])


#or instead
burns_deathrates_proto2 = pd.crosstab(burns_slice.age_bins, burns_slice.DIED)
burns_deathrates_proto2['Total'] = burns_deathrates_proto2[0.0] + burns_deathrates_proto2[1.0]


#plot
sns.set(style="whitegrid")
f, ax = plt.subplots(figsize=(6, 5))
sns.set_color_codes("pastel")
sns.barplot(x=burns_deathrates_proto2['Total'], y=burns_deathrates_proto2.index, 
            data=burns_deathrates_proto2, label="Total", color="b")
sns.set_color_codes("muted")
sns.barplot(x=burns_deathrates_proto2[1.0], y=burns_deathrates_proto2.index, 
Example #35
def main():

	with open('/mnt/scratch/noa/pclproj/results/args_lab_data.json') as data_file:
		args = json.load(data_file)

	with open('/mnt/scratch/noa/pclproj/results/labels_objects_dict.json') as data_file:
		labels = json.load(data_file)

	#tot_acc = np.array([0.0, 0.0, 0.0])
	tot_acc = [[], [], []]
	dates_list = ['1601', '1801', '2101', '2701', '0302', '0702', '1202']
	for date_folder in dates_list:
		print date_folder
		true_labels_rgb, predictions_rgb = get_truth_predictions('/home/noa/pcl_proj/experiments/cifar10/every_fifth_view/'+
													  date_folder+'/rgb/deploy.prototxt',
													'/home/noa/pcl_proj/experiments/cifar10/every_fifth_view/' 
													+date_folder+'/rgb/snapshots/_iter_200000.caffemodel',
													args["views_files"], args["coors_files"], 
													'/home/noa/pcl_proj/experiments/mean_image_files/'
													+date_folder+'/mean_image_training_fifth_rgb.binaryproto',
													labels)
		true_labels_hist, predictions_hist = get_truth_predictions('/home/noa/pcl_proj/experiments/cifar10/every_fifth_view/'+
											  date_folder+'/hist/deploy.prototxt',
											'/home/noa/pcl_proj/experiments/cifar10/every_fifth_view/' 
											+date_folder+'/hist/snapshots/_iter_200000.caffemodel',
											args["views_files"], args["coors_files"], 
											'/home/noa/pcl_proj/experiments/mean_image_files/'
											+date_folder+'/mean_image_training_fifth_hist.binaryproto',
											labels)
		
		true_labels_rgb_hist, predictions_rgb_hist = get_truth_predictions('/home/noa/pcl_proj/experiments/cifar10/every_fifth_view/'+
											  date_folder+'/rgb_hist/deploy.prototxt',
											'/home/noa/pcl_proj/experiments/cifar10/every_fifth_view/' 
											+date_folder+'/rgb_hist/snapshots/_iter_200000.caffemodel',
											args["views_files"], args["coors_files"], 
											['/home/noa/pcl_proj/experiments/mean_image_files/'
											+date_folder+'/mean_image_training_fifth_hist.binaryproto', '/home/noa/pcl_proj/experiments/mean_image_files/'
											+date_folder+'/mean_image_training_fifth_rgb.binaryproto'],
											labels)
		acc = [0.0, 0.0, 0.0]
		for i in range(len(predictions_rgb)):
			if true_labels_rgb[i] == predictions_rgb[i]:
				acc[0] += 1.0
			if true_labels_hist[i] == predictions_hist[i]:
				acc[1] += 1.0
			if true_labels_rgb_hist[i] == predictions_rgb_hist[i]:
				acc[2] += 1.0
		tot_acc[0].append(acc[0] / len(predictions_rgb))
		tot_acc[1].append(acc[1] / len(predictions_hist))
		tot_acc[2].append(acc[2] / len(predictions_rgb_hist))		
		print acc
	print tot_acc
	print 'ttests: '
	statistics_0, p_0 = ttest(np.array(tot_acc[0]), np.array(tot_acc[1]))
	statistics_1, p_1 = ttest(np.array(tot_acc[1]), np.array(tot_acc[2]))
	statistics_2, p_2 = ttest(np.array(tot_acc[0]), np.array(tot_acc[2]))


	print "final: "
	print 'rgb:'
	print np.array(tot_acc[0]).mean()
	print np.array(tot_acc[0]).std()
	print 'hist:'
	print np.array(tot_acc[1]).mean()
	print np.array(tot_acc[1]).std()
	print 'rgb_hist:'
	print np.array(tot_acc[2]).mean()
	print np.array(tot_acc[2]).std()

	print 'rgb vs. hist'
	print p_0
	print 'hist vs. rgb_hist'
	print p_1
	print 'rgb vs. rgb_hist'
	print p_2
Example #36
def composite(hour):
    pool = multiprocessing.Pool(processes=5)

    file = cnst.MCS_POINTS_DOM #MCS_TMIN #
    path = cnst.network_data + '/figs/LSTA-bullshit/AGU' #corrected_LSTA/wavelet/large_scale

    hour = hour

    msg = xr.open_dataarray(file)
    msg = msg[(msg['time.hour'] == hour) & (msg['time.minute'] == 0) & (
        msg['time.year'] >= 2006) & (msg['time.year'] <= 2010) & (msg['time.month'] >= 6) ]

    msg = msg.sel(lat=slice(10.2,19.3), lon=slice(-9.7, 9.7))
    res = pool.map(file_loop, msg)
    pool.close()
    print('return parallel')
    # res = []
    # for m in msg[0:30]:
    #     out = file_loop(m)
    #     res.append(out)

    res = [x for x in res if x is not None]

    snpos_list = []
    wepos_list = []

    rsnpos_list = []
    rwepos_list = []

    vkernel_list = []
    rkernel_list = []

    vkernel_cnt = []
    rkernel_cnt = []

    lsta_list = []

    for r in res:
        snpos_list.append(np.squeeze(r[0]))
        wepos_list.append(np.squeeze(r[1]))

        rsnpos_list.append(np.squeeze(r[2]))
        rwepos_list.append(np.squeeze(r[3]))

        vkernel_list.append(r[4])
        rkernel_list.append(r[5])

        scales = r[6]

        vkernel_cnt.append(r[7])
        rkernel_cnt.append(r[8])

        lsta_list.append(r[9])

    dic = collections.OrderedDict([('SN-pos' , [snpos_list, rsnpos_list]),
                                   ('WE-pos' , [wepos_list, rwepos_list]),
                                   ('kernel' , [vkernel_list, rkernel_list]),
                                   ('lsta' , [lsta_list]),
                                   ('cnt' , [vkernel_cnt, rkernel_cnt]),
                                   ('scales' , scales)])

    keys = list(dic.keys())

    for l in keys:
        if l == 'scales':
            continue
        (dic[l])[0] = np.squeeze(np.vstack((dic[l])[0]))
        try:
            (dic[l])[1] = np.squeeze(np.vstack((dic[l])[1]))
        except IndexError:
            continue

    dic['nbcores'] = dic['SN-pos'][0].shape[0]
    dic['nbrcores'] = dic['SN-pos'][1].shape[0]

    for l in keys:
        if (l == 'scales') | (l == 'lsta'):
            continue

        a =  dic[l][0]
        b =  dic[l][1]
        sa = a.shape
        sb = b.shape
        #if 'pos' in l:
        #    a = (a.swapaxes(0,1).reshape(sa[1], sa[0]*sa[2]).T/np.nanstd(dic[l][0], axis=(0,2))).T.reshape(sa[1], sa[0],sa[2]).swapaxes(0,1)
        #    b = (b.swapaxes(0,1).reshape(sb[1], sb[0]*sb[2]).T/np.nanstd(dic[l][1], axis=(0,2))).T.reshape(sb[1], sb[0], sb[2]).swapaxes(0,1)

        nsstat, nspvalue = ttest(a, b, axis=0, equal_var=False, nan_policy='omit')
        mask = nspvalue < 0.05
        dic[l].append(mask)

        if 'pos' in l:
            dic[l].append(np.nanstd(dic[l][0], axis=(0,2)))
            dic[l].append(np.nanstd(dic[l][1], axis=(0,2)))

    for l in keys:
        if l == 'scales':
            continue
        if 'pos' in l:
            (dic[l])[0] = np.nanmean((dic[l])[0], axis=0)
            (dic[l])[1] = np.nanmean((dic[l])[1], axis=0)
        else:
            (dic[l])[0] = np.nansum((dic[l])[0], axis=0)
            try:
                (dic[l])[1] = np.nansum((dic[l])[1], axis=0)
            except IndexError:
                continue


    pkl.dump(dic, open(path+"/coeffs_test_nans_stdkernel"+str(hour)+"UTC.p", "wb"))
    print('Save file written!')
Example #37
import sys
from scipy.stats import ttest_ind as ttest
a = [float(x) for x in open(sys.argv[1])]
b = [float(x) for x in open(sys.argv[2])]

print ttest(a, b)
Example #38
def composite(hour):
    pool = multiprocessing.Pool(processes=5)

    file = constants.MCS_POINTS_DOM
    path = '/users/global/cornkle/figs/LSTA-bullshit/AGU' #corrected_LSTA/wavelet/large_scale

    hour = hour

    msg = xr.open_dataarray(file)
    msg = msg[(msg['time.hour'] == hour) & (msg['time.minute'] == 0) & (
        msg['time.year'] >= 2006) & (msg['time.year'] <= 2010) & (msg['time.month'] >= 6) ]

    msg = msg.sel(lat=slice(10.2,18.5), lon=slice(-9.7, 9.7))

    res = pool.map(file_loop, msg)
    pool.close()

    # for m in msg[2:5]:
    #     file_loop(m)
    # return
    res = [x for x in res if x is not None]

    snpos_list_dry = []
    wepos_list_dry = []

    rsnpos_list_dry = []
    rwepos_list_dry = []

    snpos_list_wet = []
    wepos_list_wet = []

    rsnpos_list_wet = []
    rwepos_list_wet = []

    for r in res:
        snpos_list_dry.append(np.squeeze(r[0]))
        wepos_list_dry.append(np.squeeze(r[1]))

        rsnpos_list_dry.append(np.squeeze(r[2]))
        rwepos_list_dry.append(np.squeeze(r[3]))


        scales = r[4]
        snpos_list_wet.append(np.squeeze(r[5]))
        wepos_list_wet.append(np.squeeze(r[6]))

        rsnpos_list_wet.append(np.squeeze(r[7]))
        rwepos_list_wet.append(np.squeeze(r[8]))


    dic = collections.OrderedDict([('SN-pos' , [snpos_list_dry, rsnpos_list_dry]),
                                   ('WE-pos' , [wepos_list_dry, rwepos_list_dry]),
                                   ('SN-pos_wet', [snpos_list_wet, rsnpos_list_wet]),
                                   ('WE-pos_wet', [wepos_list_wet, rwepos_list_wet]),
                                   ('scales' , scales)])

    keys = list(dic.keys())

    for l in keys:
        if l == 'scales':
            continue
        (dic[l])[0] = np.squeeze(np.vstack((dic[l])[0]))
        (dic[l])[1] = np.squeeze(np.vstack((dic[l])[1]))

    for l in keys:
        if l == 'scales':
            continue
        nsstat, nspvalue = ttest(dic[l][0], dic[l][1], axis=0, equal_var=False, nan_policy='omit')
        mask = nspvalue < 0.05
        dic[l].append(mask)

    nsstat, nspvalue = ttest(dic['SN-pos'][0], dic['SN-pos_wet'][0], axis=0, equal_var=False, nan_policy='omit')
    mask = nspvalue < 0.05
    dic['SN-dw_mask']   = mask

    nsstat, nspvalue = ttest(dic['WE-pos'][0], dic['WE-pos_wet'][0], axis=0, equal_var=False, nan_policy='omit')
    mask = nspvalue < 0.05
    dic['WE-dw_mask']   = mask

    for l in keys:
        if l == 'scales':
            continue

        (dic[l])[0] = np.nanmean((dic[l])[0], axis=0)
        (dic[l])[1] = np.nanmean((dic[l])[1], axis=0)


    pkl.dump(dic, open(path+"/test_wet_dry_withzero"+str(hour)+"UTC.p", "wb"))
    print('Save file written!')
Example #39
def print_stats():
    from scipy.stats import ttest_ind as ttest
    
    mask=get_mask()
    sar=np.ma.array(myio.read_nc(stats_location+onemm_loc+"SAR-ncep-pr-BC12km_full_res_annual_dryspell.nc").data,mask=mask)
    sdd=np.ma.array(myio.read_nc(stats_location+onemm_loc+"SD-ncep-pr-BC12km_full_res_annual_dryspell.nc").data,mask=mask)
    sdm=np.ma.array(myio.read_nc(stats_location+onemm_loc+"SDmon_c-ncep-pr-BC12km_full_res_annual_dryspell.nc").data,mask=mask)
    ca=np.ma.array(myio.read_nc(stats_location+onemm_loc+"CA-ncep-pr-BC12km_full_res_annual_dryspell.nc").data,mask=mask)
    obs=np.ma.array(myio.read_nc(stats_location+onemm_loc+"obs-maurer.125-pr_full_res_annual_dryspell.nc").data,mask=mask)
    print("AR    dryspell= "+str(sar.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(sar[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("BCSDd dryspell= "+str(sdd.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(sdd[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("BCSDm dryspell= "+str(sdm.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(sdm[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("BCCA  dryspell= "+str( ca.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(ca[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("Obs   dryspell= "+str(obs.mean())[:5])
    print("")
    
    sar=np.ma.array(myio.read_nc(stats_location+onemm_loc+"SAR-ncep-pr-BC12km_full_res_annual_wetspell.nc").data,mask=mask)
    sdd=np.ma.array(myio.read_nc(stats_location+onemm_loc+"SD-ncep-pr-BC12km_full_res_annual_wetspell.nc").data,mask=mask)
    sdm=np.ma.array(myio.read_nc(stats_location+onemm_loc+"SDmon_c-ncep-pr-BC12km_full_res_annual_wetspell.nc").data,mask=mask)
    ca=np.ma.array(myio.read_nc(stats_location+onemm_loc+"CA-ncep-pr-BC12km_full_res_annual_wetspell.nc").data,mask=mask)
    obs=np.ma.array(myio.read_nc(stats_location+onemm_loc+"obs-maurer.125-pr_full_res_annual_wetspell.nc").data,mask=mask)
    print("AR    wetspell= "+str(sar.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(sar[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("BCSDd wetspell= "+str(sdd.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(sdd[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("BCSDm wetspell= "+str(sdm.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(sdm[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("BCCA  wetspell= "+str( ca.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(ca[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("Obs   wetspell= "+str(obs.mean())[:5])
    print("")
    
    sar=np.ma.array(myio.read_nc(stats_location+onemm_loc+"SAR-ncep-pr-BC12km_full_res_annual_wetfrac.nc").data,mask=mask)
    sdd=np.ma.array(myio.read_nc(stats_location+onemm_loc+"SD-ncep-pr-BC12km_full_res_annual_wetfrac.nc").data,mask=mask)
    sdm=np.ma.array(myio.read_nc(stats_location+onemm_loc+"SDmon_c-ncep-pr-BC12km_full_res_annual_wetfrac.nc").data,mask=mask)
    ca=np.ma.array(myio.read_nc(stats_location+onemm_loc+"CA-ncep-pr-BC12km_full_res_annual_wetfrac.nc").data,mask=mask)
    obs=np.ma.array(myio.read_nc(stats_location+onemm_loc+"obs-maurer.125-pr_full_res_annual_wetfrac.nc").data,mask=mask)
    print("AR    wetfrac= "+str(sar.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(sar[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("BCSDd wetfrac= "+str(sdd.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(sdd[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("BCSDm wetfrac= "+str(sdm.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(sdm[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("BCCA  wetfrac= "+str( ca.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(ca[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("Obs   wetfrac= "+str(obs.mean())[:5])
    print("")

    sar=np.ma.array(myio.read_nc(stats_location+onemm_loc+"SAR-ncep-pr-BC12km_full_res_annual_MAP.nc").data,mask=mask)
    sdd=np.ma.array(myio.read_nc(stats_location+onemm_loc+"SD-ncep-pr-BC12km_full_res_annual_MAP.nc").data,mask=mask)
    sdd.mask[~np.isfinite(sdd)]=True
    sdd.mask[sdd>1e5]=True
    sdm=np.ma.array(myio.read_nc(stats_location+onemm_loc+"SDmon_c-ncep-pr-BC12km_full_res_annual_MAP.nc").data,mask=mask)
    sdm.mask[~np.isfinite(sdm)]=True
    sdm.mask[sdm>1e5]=True
    ca=np.ma.array(myio.read_nc(stats_location+onemm_loc+"CA-ncep-pr-BC12km_full_res_annual_MAP.nc").data,mask=mask)
    obs=np.ma.array(myio.read_nc(stats_location+onemm_loc+"obs-maurer.125-pr_full_res_annual_MAP.nc").data,mask=mask)
    print("AR    MAP= "+str(sar.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(sar[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("BCSDd MAP= "+str(sdd.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(sdd[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("BCSDm MAP= "+str(sdm.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(sdm[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("BCCA  MAP= "+str( ca.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(ca[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("Obs   MAP= "+str(obs.mean())[:5])
    print("")

    sar=np.ma.array(myio.read_nc(stats_location+zeromm_loc+"SAR-ncep-pr-BC12km_full_res_annual_extremes_nday1.nc").data[2,...],mask=mask)
    sdd=np.ma.array(myio.read_nc(stats_location+zeromm_loc+"SD-ncep-pr-BC12km_full_res_annual_extremes_nday1.nc").data[2,...],mask=mask)
    sdd.mask[~np.isfinite(sdd)]=True
    sdd.mask[sdd>1e5]=True
    sdm=np.ma.array(myio.read_nc(stats_location+zeromm_loc+"SDmon_c-ncep-pr-BC12km_full_res_annual_extremes_nday1.nc").data[2,...],mask=mask)
    sdm.mask[~np.isfinite(sdm)]=True
    sdm.mask[sdm>1e5]=True
    ca=np.ma.array(myio.read_nc(stats_location+zeromm_loc+"CA-ncep-pr-BC12km_full_res_annual_extremes_nday1.nc").data[2,...],mask=mask)
    obs=np.ma.array(myio.read_nc(stats_location+zeromm_loc+"obs-maurer.125-pr_full_res_annual_extremes_nday1.nc").data[2,...],mask=mask)
    print("AR    extremes_nday1= "+str(sar.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(sar[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("BCSDd extremes_nday1= "+str(sdd.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(sdd[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("BCSDm extremes_nday1= "+str(sdm.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(sdm[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("BCCA  extremes_nday1= "+str( ca.mean())[:5])
    print('  pvalue = {0}'.format(round(ttest(ca[mask==False], obs[mask==False])[1]*1000.0)/1000.0))
    print("Obs   extremes_nday1= "+str(obs.mean())[:5])
    print("")
Example #40
	def ttest(self, keys, dv, paired=True, collapse=None):

		"""
		desc:
			Performs t-tests between groups defined by a list of keys.

		arguments:
			keys:
				desc:	A list of keys to define the groups.
				type:	list
			dv:
				desc:	The dependent variable.
				type:	[str, unicode]

		keywords:
			paired:
				desc:	Determines whether a paired-samples t-test or an
						independent samples t-test should be conducted.
				type:	bool
			collapse:
				desc:	A key to collapse the data on, so that you can do
						t-tests on (subject) means.
				type:	[str, unicode, NoneType]

		returns:
			desc:	A list of (desc, t, p) tuples.
			type:	list
		"""

		from itertools import combinations
		if paired:
			from scipy.stats import ttest_rel as ttest
		else:
			from scipy.stats import ttest_ind as ttest

		if collapse is not None:
			dm = self.collapse([collapse] + keys, dv)
			dv = 'mean'
		else:
			dm = self

		_l = [['group', 'N', 'M / t', 'SE / p']]

		lDm = dm.group(keys)
		for l in combinations(lDm, 2):

			group0 = ''
			for key in keys:
				group0 += str(l[0][key][0]) + '_'
			group0 = group0[:-1]

			group1 = ''
			for key in keys:
				group1 += str(l[1][key][0]) + '_'
			group1 = group1[:-1]

			N0 = len(l[0])
			M0 = l[0][dv].mean()
			SE0 = l[0][dv].std() / np.sqrt(len(l[0]))
			_l.append( [group0, N0, M0, SE0] )

			N1 = len(l[1])
			M1 = l[1][dv].mean()
			SE1 = l[1][dv].std() / np.sqrt(len(l[1]))
			_l.append( [group1, N1, M1, SE1] )

			t, p = ttest(l[0][dv], l[1][dv])
			_l.append( [group0, group1, t, p] )

		return DataMatrix(np.array(_l))
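The paired/independent switch at the top of this method comes down to choosing which SciPy function the local name `ttest` binds to; the same trick works standalone:

from scipy.stats import ttest_ind, ttest_rel

def pick_ttest(paired=True):
    # Bind the name to the appropriate SciPy test once, then call it uniformly.
    return ttest_rel if paired else ttest_ind

t, p = pick_ttest(paired=False)([1.0, 2.0, 3.0], [2.0, 2.5, 3.5])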