Code example #1
import pylotwhale.utils.dataTools as daT


def df2groupedCorpus(df, groupingKey='tape', call='note', sep=r'\s'):
    """Return the feature list from a dataframe, grouping by
    the values of the grouping key.

    Parameters
    ----------
    df : pandas DataFrame
    groupingKey : str
        column whose values define the groups (and the labels)
    call : str
        column with the call labels
    sep : str
        separator used to join the calls of each group

    Returns
    -------
    X_str : list of strings (n_instances)
        feature matrix to use as input of CountVectorizer
    y : list (n_instances)
        labels of the instances from the grouping key
    """
    ## group dataframe
    df_dict = daT.dictOfGroupedDataFrames(df, groupingKey=groupingKey)
    X_str = []  # feature list
    y = []  # group label
    for ky in df_dict.keys():
        y.append(ky)  # group
        X_str.append(sep.join(df_dict[ky][call].values))
    return X_str, y
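
# A minimal usage sketch (not from the original source): the grouped corpus
# feeds scikit-learn's CountVectorizer, one document per group. The toy
# dataframe and the explicit sep=' ' are illustrative assumptions.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

toy_df = pd.DataFrame({'tape': ['t1', 't1', 't2', 't2', 't2'],
                       'note': ['c1', 'c2', 'c1', 'c1', 'c3']})

X_str, y = df2groupedCorpus(toy_df, groupingKey='tape', call='note', sep=' ')
# e.g. X_str == ['c1 c2', 'c1 c1 c3'] and y == ['t1', 't2'] (key order may vary)

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X_str)  # call-count matrix, one row per group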
Code example #2
import os

import pandas as pd

import pylotwhale.utils.dataTools as daT
import pylotwhale.NLP.myStatistics_beta as mysts

# # Load csv with sequences

# In[9]:

pDir = os.path.dirname(os.path.abspath(__file__))
cfile = os.path.join(pDir, 'data/sequenceFiles_df.txt')
#cfile = '/home/florencia/profesjonell/bioacoustics/Kurt/mice/data/sequenceFiles_df.txt'

df0 = pd.read_csv(cfile)

# In[13]:

name_df = daT.dictOfGroupedDataFrames(df0, groupingKey='name')
print('TEST', len(name_df))


def test_data():
    assert (len(df0) == 25282)
    assert (len(name_df) == 67)
    assert ({1, 2, 3} == set(df0['genecode']))


# In[14]:

len(name_df)
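
# For readers without pylotwhale at hand: dictOfGroupedDataFrames is
# presumably equivalent to collecting a pandas groupby into a dict.
# This is a sketch/assumption, not the helper's actual implementation.
name_df_sketch = {name: group for name, group in df0.groupby('name')}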

# # Calls from all mice
Code example #3
            df.loc[i, "segment"] = "{}_{}".format(t, j)
            break
        elif col["Dtag"] > 1:  # elements missing -->  new segment
            j += 1
            df.loc[i, "segment"] = "{}_{}".format(t, j)
            continue
        else:
            print("else! ", i, col["Dtag"])
            break

# ### Separate data frames by segment and drop NaNs
# The NaNs come from all the non-labelled items in the previous step (segment assignation); they correspond to missing calls and new tapes.

# In[11]:

tape_df0 = daT.dictOfGroupedDataFrames(df, groupingKey="segment")
# keep segments with more than one element and drop NaN segment keys
# (note: `k != np.nan` is always True; pd.isnull catches the NaN keys)
tape_df = {k: v for k, v in tape_df0.items() if len(v) > 1 and not pd.isnull(k)}

# In[12]:


def test_segment_tape_df0():
    assert len(tape_df0) == 232
    assert len(tape_df) == 175


test_segment_tape_df0()

# ## Distribution of N-grams as a function of $\tau$
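
# A minimal sketch of the idea behind the heading above (an assumption about
# the analysis, not pylotwhale's API): two consecutive calls form a bigram
# only if their inter-call interval is below tau, so the bigram counts vary
# with tau. The inputs `calls` and `ict` (inter-call times) are hypothetical
# aligned sequences, e.g. lists extracted from the dataframe.
from collections import Counter


def bigram_counts_below_tau(calls, ict, tau):
    """Count bigrams (c_i, c_{i+1}) whose inter-call time is below tau."""
    return Counter((c1, c2)
                   for c1, c2, dt in zip(calls[:-1], calls[1:], ict[1:])
                   if dt < tau)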
Code example #4
import os

import pandas as pd

import pylotwhale.utils.dataTools as daT
import pylotwhale.NLP.myStatistics_beta as mysts
import pylotwhale.NLP.tempoTools as tT
import pylotwhale.utils.netTools as nT

# # Load df

# In[3]:

pDir = os.path.dirname(os.path.abspath(__file__))

df_file = os.path.join(pDir, 'data/groupB_annotations_df.csv')
#df_file = '/home/florencia/profesjonell/bioacoustics/noriega2018sequences/data/groupB_annotations_df.csv'
# load
df = pd.read_csv(df_file)
# tape separation
tapedf = daT.dictOfGroupedDataFrames(df)


def test_data():
    # N_calls
    assert (len(df) == 425)
    # N_call types
    assert (len(set(df['call'])) == 22)
    # tapes set
    assert (set(df['tape'].values) == set([113, 114, 115, 111]))
    assert (set(tapedf.keys()) == set([113, 114, 115, 111]))


# # Bigrams and randomisation tests
#
# Define the **sequences**
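
# A minimal sketch of a bigram randomisation (shuffling) test, stated as an
# assumption about the analysis rather than pylotwhale's implementation:
# shuffle the order of the calls within a sequence many times and compare
# the observed count of a bigram with its null distribution.
from collections import Counter

import numpy as np


def bigram_shuffling_pvalue(seq, bigram, n_shuffles=1000, rng=None):
    """Fraction of shuffled sequences with at least as many
    occurrences of `bigram` as observed in `seq`."""
    rng = np.random.default_rng() if rng is None else rng

    def count(s):
        return Counter(zip(s[:-1], s[1:]))[bigram]

    observed = count(list(seq))
    null_counts = [count(list(rng.permutation(seq))) for _ in range(n_shuffles)]
    return float(np.mean([c >= observed for c in null_counts]))


# e.g. bigram_shuffling_pvalue(['A', 'B', 'A', 'B', 'C'], ('A', 'B'))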