def test_df2listOfSeqs():
    """Check ``aa.df2listOfSeqs`` at the two extreme interval settings.

    For both the full ``te_df`` fixture and the same frame with its last
    row dropped, the test asserts that:

    * an upper interval bound equal to the largest ``t`` value of ``te_df``
      yields a first sequence as long as the frame, and
    * an upper interval bound of 0 yields as many sequences as there are
      rows in the frame.
    """
    # First pass: full fixture (original note: "NAN ending").
    # Second pass: last row dropped (original note: "float ending").
    for drop_last_row in (False, True):
        frame = te_df[:-1] if drop_last_row else te_df

        # one big sequence; the bound is always taken from the full fixture
        whole_span = (None, np.max(te_df['t']))
        first_seq = aa.df2listOfSeqs(frame, whole_span, time_param='t')[0]
        assert len(first_seq) == len(frame)

        # each element by its own
        no_span = (None, 0)
        all_seqs = aa.df2listOfSeqs(frame, no_span, time_param='t')
        assert len(all_seqs) == len(frame)
Example #2
0
def dfDict_to_bigram_matrix(
    df_dict,
    Dtint,
    timeLabel="ici",
    callLabel="call",
    startTag="_ini",
    endTag="_end",
    return_values="probs",
    minCalls=1,
):
    """Bigram counts/probs as a matrix from a dictionary of DataFrames.

    Parameters
    ----------
    df_dict: dict
        mapping whose values are DataFrames (one per tape); each one is
        split into sequences independently and the bigram counts are
        accumulated over all of them
    Dtint: tuple
        interval forwarded as ``Dt`` to ``aa.df2listOfSeqs``
    timeLabel: str
        column name forwarded as ``time_param`` to ``aa.df2listOfSeqs``
    callLabel: str
        column holding the call labels
    startTag: str
        tag forwarded as ``ini`` to ``aa.seqsLi2iniEndSeq``; appended to
        the list of conditions
    endTag: str
        tag forwarded as ``end`` to ``aa.seqsLi2iniEndSeq``; appended to
        the list of samples
    return_values: str
        {probs, counts}
    minCalls: int
        calls occurring fewer than ``minCalls`` times over all DataFrames
        are left out of the conditions and samples

    Returns
    -------
    (matrix, sampsLi, condsLi)
        the result of ``kykyCountsDict2matrix`` applied to the counts or
        to the conditional probabilities

    Raises
    ------
    ValueError
        if ``return_values`` is neither ``"probs"`` nor ``"counts"``
    """
    # Fail fast: an unsupported mode used to fall through both branches at
    # the end and silently return None after doing all the counting work.
    if return_values not in ("probs", "counts"):
        raise ValueError(
            "return_values must be 'probs' or 'counts', got {!r}".format(
                return_values))

    cfd = nltk.ConditionalFreqDist()  # initialise cond freq dist
    call_counts = Counter()
    for thisdf in df_dict.values():  # for each tape
        # define the sequences
        sequences = aa.seqsLi2iniEndSeq(
            aa.df2listOfSeqs(thisdf,
                             Dt=Dtint,
                             l=callLabel,
                             time_param=timeLabel),
            ini=startTag,
            end=endTag,
        )
        cfd += bigrams2cfd(nltk.bigrams(sequences))  # count bigrams
        call_counts.update(thisdf[callLabel].values)

    # Calls ordered from most to least frequent (ties keep first-seen order),
    # dropping the ones that occur fewer than minCalls times.
    calls = [
        call for call, count in call_counts.most_common()
        if count >= minCalls
    ]
    samplesLi = calls + [endTag]
    condsLi = calls + [startTag]

    if return_values == "counts":
        return kykyCountsDict2matrix(cfd, condsLi, samplesLi)

    cpd = condFreqDictC2condProbDict(cfd)
    return kykyCountsDict2matrix(cpd, condsLi, samplesLi)
Example #3
0
def shuffled_cfd(df, Dtint, label='call', time_param='ici'):
    """Count the bigrams of a df after shuffling its <label> column

    Parameters
    ----------
    df : Pandas.DataFrame
    Dtint : size two list-like
        interval passed as ``Dt`` to ``aa.df2listOfSeqs``
    label : string
        name of the column to randomise
    time_param : string
        name of the time param (Dtint) to define the sequences

    Returns
    -------
    cfd_nsh :
        output of ``ngr.bigrams2Dict`` for the randomised sequences
        (described upstream as conditional bigram frequencies)
    """
    # randomise the labels
    randomised = shuffleSeries(df, shuffleCol=label)
    # cut the randomised frame into sequences
    seq_list = aa.df2listOfSeqs(randomised,
                                Dt=Dtint,
                                l=label,
                                time_param=time_param)
    # add the start/end tags using the helper's default tags
    tagged = aa.seqsLi2iniEndSeq(seq_list)
    # pair up consecutive items and count the pairs
    return ngr.bigrams2Dict(nltk.bigrams(tagged))
# In[31]:

# Upper bound of the interval used to chunk the calls into sequences.
Dt_chunks = 0.1
Dtint_chunks = (None, Dt_chunks)

## useful numbers
noteFreqs = Counter(
    df[call_label].values)  # notes = daT.returnSortingKeys(noteFreqs)
notes = list(set(df[call_label].values))
## define the sequences
sequences = []
for t, this_df in tape_df.items():  # for each tape
    sequences += aa.df2listOfSeqs(this_df,
                                  Dt=Dtint_chunks,
                                  l=call_label,
                                  time_param=time_param)  # define the sequences
## sequence statistics: group the sequences by their length
ngram_seqs = defaultdict(list)
for s in sequences:
    ngram_seqs[len(s)].append(tuple(s))
## count notes in each sequence size
# One row per note and one column per sequence length (column j holds the
# sequences of length j). The builtin max() is used on the dict keys because
# np.max() on a Python 3 dict view does not reduce it to a number, so the
# following "+ 1" fails.
longest_seq = max(ngram_seqs)
note_chunks_arr = np.zeros((len(noteFreqs), longest_seq + 1))
for j, s in ngram_seqs.items():
    for i, n in enumerate(notes):
        note_chunks_arr[i, j] = sum(x.count(n) for x in s)

# NOTE(review): this runs from 1 to longest_seq + 1, whereas the columns of
# note_chunks_arr are indexed 0 to longest_seq -- confirm the offset is
# intended by whatever consumes seqSizes.
seqSizes = np.arange(1, np.shape(note_chunks_arr)[1] + 1)
## sequence stats
# Number of sequences of each length, in the order the lengths were first seen.
Ngrams_dist = np.array([len(ngram_seqs[k]) for k in ngram_seqs])