def test_df2listOfSeqs():
    """Check ``aa.df2listOfSeqs`` at the two extreme time thresholds.

    The same pair of checks is run on ``te_df`` (labelled "NAN ending") and
    on ``te_df`` without its last row (labelled "float ending"):

    * with the upper bound set to the largest ``'t'`` of ``te_df`` the first
      returned sequence must be as long as the frame (one big sequence);
    * with the upper bound set to 0 the number of returned sequences must
      equal the number of rows (each element by its own).
    """

    def check_extremes(frame):
        # one big sequence -- the bound is always taken from the full te_df
        widest = (None, np.max(te_df['t']))
        merged = aa.df2listOfSeqs(frame, widest, time_param='t')
        assert len(merged[0]) == len(frame)

        # each element by its own
        singles = aa.df2listOfSeqs(frame, (None, 0), time_param='t')
        assert len(singles) == len(frame)

    ## NAN ending
    check_extremes(te_df)
    ## float ending
    check_extremes(te_df[:-1])
def dfDict_to_bigram_matrix(
    df_dict,
    Dtint,
    timeLabel="ici",
    callLabel="call",
    startTag="_ini",
    endTag="_end",
    return_values="probs",
    minCalls=1,
):
    """Bigram counts/probs as a matrix from a collection of DataFrames

    The bigrams of every DataFrame in ``df_dict`` are accumulated into a
    single conditional frequency distribution, which is then converted into
    a matrix by ``kykyCountsDict2matrix``.

    Parameters
    ----------
    df_dict: dict-like of DataFrame
        one DataFrame per key (one per tape, according to the original
        comments); each must have the ``callLabel`` column
    Dtint: tuple
        passed as ``Dt`` to ``aa.df2listOfSeqs`` to define the sequences
    timeLabel: str
        passed as ``time_param`` to ``aa.df2listOfSeqs``
    callLabel: str
        name of the column holding the calls
    startTag: str
        passed as ``ini`` to ``aa.seqsLi2iniEndSeq``; appended to the
        conditions list
    endTag: str
        passed as ``end`` to ``aa.seqsLi2iniEndSeq``; appended to the
        samples list
    return_values: str {probs, counts}
        whether the matrix is built from the raw counts or from the output
        of ``condFreqDictC2condProbDict``
    minCalls: int
        calls occurring fewer than ``minCalls`` times over all the
        DataFrames are left out of the conditions and samples lists

    Returns
    -------
    whatever ``kykyCountsDict2matrix`` returns -- originally documented as
    (matrix, sampsLi, condsLi); confirm against that helper

    Raises
    ------
    ValueError
        if ``return_values`` is neither "counts" nor "probs"
    """
    # Fail fast: previously an unknown option ran the whole computation and
    # then silently returned None.
    if return_values not in ("counts", "probs"):
        raise ValueError(
            "return_values must be 'counts' or 'probs', got {!r}".format(
                return_values))

    cfd = nltk.ConditionalFreqDist()  # initialise cond freq dist
    call_counts = Counter()  # occurrences of each call over all DataFrames
    for tape in df_dict:  # for each tape
        thisdf = df_dict[tape]
        # define the sequences
        sequences = aa.seqsLi2iniEndSeq(
            aa.df2listOfSeqs(thisdf, Dt=Dtint, l=callLabel,
                             time_param=timeLabel),
            ini=startTag,
            end=endTag,
        )
        my_bigrams = nltk.bigrams(sequences)  # tag bigrams
        cfd += bigrams2cfd(my_bigrams)  # count bigrams
        call_counts.update(thisdf[callLabel].values)

    # Most frequent call first; ties keep the order of first appearance.
    calls = [
        call for call, count in call_counts.most_common()
        if count >= minCalls
    ]
    samplesLi = calls + [endTag]
    condsLi = calls + [startTag]

    if return_values == "counts":
        return kykyCountsDict2matrix(cfd, condsLi, samplesLi)
    cpd = condFreqDictC2condProbDict(cfd)
    return kykyCountsDict2matrix(cpd, condsLi, samplesLi)
def shuffled_cfd(df, Dtint, label='call', time_param='ici'):
    """Count the bigrams of a df once its <label> column has been shuffled

    Parameters
    ----------
    df : Pandas.DataFrame
    Dtint : size two list-like
        passed as ``Dt`` to ``aa.df2listOfSeqs``
    label : string
        name of the label to randomise
    time_param : string
        name of the time param (Dtint) to define the sequences

    Returns
    -------
    cfd_nsh : output of ``ngr.bigrams2Dict``
        counts of the bigrams in the randomised sequences (originally
        documented as an nltk conditional frequency distribution --
        confirm against ``ngr.bigrams2Dict``)
    """
    # shuffle the calls
    randomised = shuffleSeries(df, shuffleCol=label)
    # cut the shuffled frame into sequences
    seq_list = aa.df2listOfSeqs(randomised, Dt=Dtint, l=label,
                                time_param=time_param)
    # wrap the sequences with the helper's default ini/end arguments
    tagged = aa.seqsLi2iniEndSeq(seq_list)
    # detect the bigrams and count them
    return ngr.bigrams2Dict(nltk.bigrams(tagged))
# In[31]:

# Sequence-length statistics: cut every tape into sequences, group the
# sequences by length and count how often each note occurs in the sequences
# of each length.

Dt_chunks = 0.1
Dtint_chunks = (None, Dt_chunks)  # passed as Dt to aa.df2listOfSeqs

## useful numbers
noteFreqs = Counter(df[call_label].values)
# notes = daT.returnSortingKeys(noteFreqs)
notes = list(set(df[call_label].values))

## define the sequences
sequences = []
for t in tape_df.keys():  # for each tape
    this_df = tape_df[t]
    sequences += aa.df2listOfSeqs(this_df, Dt=Dtint_chunks, l=call_label,
                                  time_param=time_param)  # define the sequences

## sequence statistics
ngram_seqs = defaultdict(list)  # sequence length -> sequences of that length
for s in sequences:
    ngram_seqs[len(s)].append(tuple(s))

## count notes in each sequence size
# One row per note (in the order of `notes`), one column per sequence length
# from 0 up to the longest sequence.
# The builtin max() is used because np.max() cannot reduce a Python 3
# dict-keys view; iterating the dict yields the lengths directly.
note_chunks_arr = np.zeros((len(noteFreqs), max(ngram_seqs) + 1))
for j in ngram_seqs.keys():
    s = ngram_seqs[j]
    for i, n in enumerate(notes):
        # occurrences of note n summed over every sequence of length j
        note_chunks_arr[i, j] = sum(x.count(n) for x in s)

# 1 .. number of columns of note_chunks_arr (inclusive)
seqSizes = np.arange(1, np.shape(note_chunks_arr)[1] + 1)

## sequence stats
# number of sequences found for each length, in the order the lengths were
# first encountered
Ngrams_dist = np.array([len(ngram_seqs[k]) for k in ngram_seqs.keys()])