def randomisation_test4bigrmas(
    df_dict,
    Dtint,
    obsStat,
    Nsh,
    condsLi,
    sampsLi,
    label="call",
    time_param="ici",
    testStat=teStat_proportions_diff,
):
    """One-sided (right-tail) randomisation test for bigram statistics.

    For each of the ``Nsh`` shuffles, a conditional frequency distribution is
    accumulated over all tapes with ``shuffled_cfd``, converted into a
    conditional probability matrix and passed to ``testStat``. The p-value of
    each matrix entry is the fraction of shuffles whose statistic is greater
    than or equal to the observed one.

    Parameters
    ----------
    df_dict : dict
        dictionary of dataframes (tapes); only the values are used
    Dtint : tuple
        forwarded unchanged to ``shuffled_cfd``
        (originally documented as ``(None, Dt)`` -- TODO confirm)
    obsStat : ndarray
        observed statistic for each bigram, 2-D with shape (nr, nc)
    Nsh : int
        number of shuffles; expected to be positive, since the exceedance
        counts are divided by it
    condsLi, sampsLi : list
        lists of conditions and samples, forwarded to
        ``ngr.condFreqDict2condProbMatrix``
    label : str
        forwarded to ``shuffled_cfd``
    time_param : str
        forwarded to ``shuffled_cfd``
    testStat : callable
        maps the shuffled probability matrix to a statistic that can be
        stored in, and compared against, an array shaped like ``obsStat``

    Returns
    -------
    p_values : ndarray
        shape (nr, nc), right-tail p-values
    shuffle_tests : ndarray
        shape (Nsh, nr, nc), shuffled test distributions
    """
    nr, nc = np.shape(obsStat)
    shuffle_tests = np.zeros((Nsh, nr, nc))
    # Explicit integer counter: np.zeros_like(obsStat) would inherit the
    # dtype of the observed statistic (e.g. bool or a narrow int), which is
    # unrelated to how many exceedances have to be counted.
    N_values_r = np.zeros((nr, nc), dtype=int)

    for i in range(Nsh):
        # accumulate the shuffled bigram counts over every tape
        cfd_sh = nltk.ConditionalFreqDist()
        for thisdf in df_dict.values():
            cfd_sh += shuffled_cfd(
                thisdf, Dtint, label=label, time_param=time_param
            )

        # only the first returned item (the normalised matrix) is needed
        Mp_sh = ngr.condFreqDict2condProbMatrix(cfd_sh, condsLi, sampsLi)[0]
        shTest_i = testStat(Mp_sh)
        shuffle_tests[i] = shTest_i  # keep the whole null distribution
        N_values_r[shTest_i >= obsStat] += 1  # right-tail exceedances

    p_r = 1.0 * N_values_r / Nsh
    return p_r, shuffle_tests
def randomisation_test4bigrmas_inSequences(
    seqOfSeqs,
    df_obsStat,
    Nsh,
    condsLi,
    sampsLi,
    testStat=teStat_proportions_diff,
):
    """DEPRECATED: use randtest4bigrmas_inSequences

    One-sided (right-tail) randomisation test for bigram statistics in
    sequences. All sequences are flattened into one super sequence, which is
    shuffled ``Nsh`` times; after each shuffle it is sliced back into
    sequences, the bigrams are counted, and ``testStat`` is evaluated on the
    resulting conditional probability matrix. The p-value of each entry is
    the fraction of shuffles whose statistic is greater than or equal to the
    observed one.

    Parameters
    ----------
    seqOfSeqs : sequence of sequences
        the sequences whose elements are shuffled
    df_obsStat : 2-D array-like
        observed statistic for each bigram, shape (nr, nc)
    Nsh : int
        number of shuffles; expected to be positive, since the exceedance
        counts are divided by it
    condsLi, sampsLi : list
        lists of conditions and samples, forwarded to
        ``ngr.condFreqDict2condProbMatrix``; their union is used as the key
        set for ``ngr.fill2KyDict``
    testStat : callable
        maps the shuffled probability matrix to a statistic that can be
        stored in, and compared against, an array shaped like ``df_obsStat``

    Returns
    -------
    p_values : ndarray
        shape (nr, nc), right-tail p-values
    shuffle_tests : ndarray
        shape (Nsh, nr, nc), shuffled test distributions
    """
    # slicer used to cut the super sequence back into the original sequences
    # (presumably the cumulative sequence lengths -- confirm in
    # superSequenceSlicer)
    seq_slicer = superSequenceSlicer(seqOfSeqs)
    # super sequence: a fresh array, so the in-place shuffle below does not
    # touch seqOfSeqs itself
    superSequence = np.array(flattenList(seqOfSeqs))

    nr, nc = np.shape(df_obsStat)
    shuffle_tests = np.zeros((Nsh, nr, nc))
    # Explicit integer counter: np.zeros_like(df_obsStat) would inherit the
    # dtype of the observed statistic, which is unrelated to how many
    # exceedances have to be counted.
    N_values_r = np.zeros((nr, nc), dtype=int)

    for i in range(Nsh):
        # randomise the super sequence (in place)
        np.random.shuffle(superSequence)
        # slice back into sequences and put them in the format used for nltk
        sequences_str = aa.seqsLi2iniEndSeq(
            sliceBackSuperSequence(superSequence, seq_slicer)
        )
        # split into bigrams and count them
        my_bigrams = list(nltk.bigrams(sequences_str))
        cfd_sh0 = ngr.bigrams2cfd(my_bigrams)
        # add empty-valued keys for the conditions/samples that are missing
        cfd_sh = ngr.fill2KyDict(cfd_sh0, kySet=set(sampsLi) | set(condsLi))
        # only the first returned item (the normalised matrix) is needed
        Mp_sh = ngr.condFreqDict2condProbMatrix(cfd_sh, condsLi, sampsLi)[0]

        shTest_i = testStat(Mp_sh)
        shuffle_tests[i] = shTest_i  # keep the whole null distribution
        N_values_r[shTest_i >= df_obsStat] += 1  # right-tail exceedances

    # p-value: fraction of shuffles at least as large as the observation
    p_r = 1.0 * N_values_r / Nsh
    return p_r, shuffle_tests