def test_hc_param():
    """get_optimal_hc_params must return a known method and distance name."""
    prepared = clustering.prep_data(data.load_all_features())
    linkage_method, metric = clustering.get_optimal_hc_params(prepared)
    # Both returned names must come from the fixed candidate lists below.
    assert linkage_method in ('ward', 'average', 'complete')
    assert metric in ('cityblock', 'euclidean', 'chebychev')
Example #2
0
def total_amount(strain, mouse, day, feature):
    """Sum one feature column over the rows of a single mouse-day.

    The full feature table is loaded, narrowed to the requested
    strain/mouse/day, and the column matching `feature` is summed.

    Parameters
    ----------
    strain : int
    mouse : int
    day : int
    feature : {'F', 'W', 'L'}
        'F' sums the 'Food' column, 'W' the 'Water' column and 'L' the
        'Distance' column.

    Returns
    -------
    The sum of the selected column over the rows of the mouse-day.

    Raises
    ------
    ValueError
        If the selection contains no rows, or if `feature` is not one of
        'F', 'W' or 'L'. The empty-selection check is performed first.
    """
    column_for = {'W': 'Water', 'F': 'Food', 'L': 'Distance'}

    mouse_day = _select_strain_mouse_day_in_data_frame(
        data.load_all_features(), strain, mouse, day)
    if mouse_day.shape[0] == 0:
        raise ValueError(
            "No consumption data available for specified mouse-day.")

    if feature not in column_for:
        raise ValueError("Feature must be one of :" + str(set(column_for)))

    column = column_for[feature]
    return mouse_day[column].sum()
Example #3
0
def total_amount(strain, mouse, day, feature):
    """Return the summed food, water or distance value for one mouse-day.

    Parameters
    ----------
    strain : int
    mouse : int
    day : int
    feature : {'F', 'W', 'L'}
        Selects the column to sum: 'F' -> 'Food', 'W' -> 'Water',
        'L' -> 'Distance'.

    Returns
    -------
    The sum of the selected column over the rows matching the given
    strain, mouse and day.

    Raises
    ------
    ValueError
        First, if no rows match the strain/mouse/day; otherwise, if
        `feature` is not one of the three supported codes.
    """
    selected = _select_strain_mouse_day_in_data_frame(
        data.load_all_features(), strain, mouse, day)

    if selected.shape[0] == 0:
        raise ValueError(
            "No consumption data available for specified mouse-day.")

    # Map the one-letter feature code to its column in the feature table.
    columns = {'W': 'Water', 'F': 'Food', 'L': 'Distance'}
    if feature not in columns:
        allowed = str(set(columns))
        raise ValueError("Feature must be one of :" + allowed)

    return selected[columns[feature]].sum()
def test_hc_param():
    """Check that get_optimal_hc_params picks from the allowed options."""
    allowed_methods = ['ward', 'average', 'complete']
    allowed_dists = ['cityblock', 'euclidean', 'chebychev']

    features = clustering.prep_data(data.load_all_features())
    chosen = clustering.get_optimal_hc_params(features)
    # The result unpacks into exactly two values: (method, dist).
    chosen_method, chosen_dist = chosen
    assert chosen_method in allowed_methods
    assert chosen_dist in allowed_dists
def test_fit_kmeans():
    """Check the shape and score range of get_optimal_fit_kmeans output."""
    prepared = clustering.prep_data(data.load_all_features())
    # Only the columns from index 2 onward are handed to the clusterer.
    feature_matrix = prepared[:, 2:]
    result = clustering.get_optimal_fit_kmeans(
        feature_matrix, num_clusters=range(2, 17), raw=False)

    assert len(result) == 2
    scores, labelings = result[0], result[1]
    # range(2, 17) holds 15 candidate cluster counts.
    assert len(scores) == 15
    assert len(labelings[0]) == 170
    assert len(set(labelings[14])) <= 16
    # silhouette scores must lie strictly between -1 and 1
    assert all(score < 1 for score in scores)
    assert all(score > -1 for score in scores)
def test_fit_hc():
    """Check the shape and score range of fit_hc output."""
    prepared = clustering.prep_data(data.load_all_features())
    # Only the columns from index 2 onward are handed to the clusterer.
    feature_matrix = prepared[:, 2:]
    result = clustering.fit_hc(
        feature_matrix, "average", "chebychev", num_clusters=range(2, 17))

    assert len(result) == 2
    scores, labelings = result[0], result[1]
    # range(2, 17) holds 15 candidate cluster counts.
    assert len(scores) == 15
    assert len(labelings[0]) == 170
    assert len(set(labelings[14])) <= 16
    # silhouette scores must lie strictly between -1 and 1
    assert all(score < 1 for score in scores)
    assert all(score > -1 for score in scores)
def test_fit_kmeans():
    """get_optimal_fit_kmeans: two outputs, 15 entries, bounded scores."""
    cluster_counts = range(2, 17)
    table = data.load_all_features()
    matrix = clustering.prep_data(table)[:, 2:]

    fitted = clustering.get_optimal_fit_kmeans(matrix,
                                               num_clusters=cluster_counts,
                                               raw=False)

    assert len(fitted) == 2
    assert len(fitted[0]) == 15
    assert len(fitted[1][0]) == 170
    # The last labeling may use at most 16 distinct labels.
    assert len(set(fitted[1][14])) <= 16
    # Each silhouette score is expected to be strictly inside (-1, 1).
    assert all(-1 < score < 1 for score in fitted[0])
def test_fit_hc():
    """fit_hc: two outputs, 15 entries, bounded scores."""
    cluster_counts = range(2, 17)
    table = data.load_all_features()
    matrix = clustering.prep_data(table)[:, 2:]

    fitted = clustering.fit_hc(matrix,
                               "average",
                               "chebychev",
                               num_clusters=cluster_counts)

    assert len(fitted) == 2
    assert len(fitted[0]) == 15
    assert len(fitted[1][0]) == 170
    # The last labeling may use at most 16 distinct labels.
    assert len(set(fitted[1][14])) <= 16
    # Each silhouette score is expected to be strictly inside (-1, 1).
    assert all(-1 < score < 1 for score in fitted[0])
def test_prep_data():
    """prep_data on the full feature table must have shape (170, 20)."""
    expected_shape = (170, 20)
    prepared = clustering.prep_data(data.load_all_features())
    assert prepared.shape == expected_shape
Example #10
0
def test_all_features_loader():
    """load_all_features must return an object of shape (21131, 13)."""
    n_rows, n_cols = 21131, 13
    loaded = data.load_all_features()
    assert loaded.shape == (n_rows, n_cols)
Example #11
0
def test_intervals():
    """Placeholder check that Intervals(...).measure() equals 11.0."""
    # NOTE: the expected value was flagged by the original author as
    # unverified; treat this as a smoke test rather than a specification.
    measured = Intervals(data.load_all_features()).measure()
    assert measured == 11.0
from mousestyles import data
from mousestyles.classification import clustering
from mousestyles.visualization import plot_clustering


# Load the full feature table.
mouse_data = data.load_all_features()

# Prepare the clustering input: un-melted, with std, rescaled.
mouse_dayavgstd_rsl = clustering.prep_data(
    mouse_data,
    melted=False,
    std=True,
    rescale=True,
)

# Pick the method and distance used for hierarchical clustering.
method, dist = clustering.get_optimal_hc_params(
    mouse_day=mouse_dayavgstd_rsl,
)

# Fit hierarchical clustering for each cluster count in 2..16, using the
# columns from index 2 onward.
sils_hc, labels_hc = clustering.fit_hc(
    mouse_day_X=mouse_dayavgstd_rsl[:, 2:],
    method=method,
    dist=dist,
    num_clusters=range(2, 17),
)

# Plot the dendrogram and keep the value the plotting helper returns
# (described by the original author as the distance matrix).
Z = plot_clustering.plot_dendrogram(
    mouse_day=mouse_dayavgstd_rsl,
    method=method,
    dist=dist,
)
Example #13
0
def test_prep_data():
    """Verify the dimensions of the array produced by prep_data."""
    raw_features = data.load_all_features()
    n_rows, n_cols = 170, 20
    assert clustering.prep_data(raw_features).shape == (n_rows, n_cols)
Example #14
0
def test_all_features_loader():
    """load_all_features must return an object of shape (21131, 13)."""
    loaded_shape = data.load_all_features().shape
    expected_shape = (21131, 13)
    assert_equal(loaded_shape, expected_shape)
Example #15
0
from mousestyles import data
from mousestyles.classification import clustering
from mousestyles.visualization import plot_clustering

# Load the full feature table.
mouse_data = data.load_all_features()

# Prepare the clustering input: un-melted, with std, rescaled.
mouse_dayavgstd_rsl = clustering.prep_data(
    mouse_data, melted=False, std=True, rescale=True)

# Pick the method and distance used for hierarchical clustering.
method, dist = clustering.get_optimal_hc_params(
    mouse_day=mouse_dayavgstd_rsl)

# Fit hierarchical clustering for each cluster count in 2..16, using the
# columns from index 2 onward.
sils_hc, labels_hc = clustering.fit_hc(
    mouse_day_X=mouse_dayavgstd_rsl[:, 2:],
    method=method,
    dist=dist,
    num_clusters=range(2, 17))

# Combine the first column of the prepared data with the labels at
# index 4 of the fit (the fifth entry of range(2, 17), i.e. 6 clusters).
result = clustering.cluster_in_strain(
    mouse_dayavgstd_rsl[:, 0], labels_hc[4])

# Plot the resulting counts, grouped by cluster.
plot_clustering.plot_strain_cluster(
    count_data=result, groupby_cluster=True)
Example #16
0
def aggregate_interval(strain, mouse, feature, bin_width):
    """
    Aggregate the interval data of one mouse into fixed-width time bins
    and return the result as a time series.

    For every interval, the time it overlaps each bin is accumulated.
    The accumulated values are then post-processed per feature:

    - "AS": divided by the bin length, giving the fraction of each bin
      covered by intervals.
    - "F" / "W": rescaled day by day so that each day's bins sum to that
      day's total 'Food' / 'Water' value from `data.load_all_features()`.
    - "M_AS" / "M_IS": left as accumulated time.

    Parameters
    ----------
    strain: int
        nonnegative integer indicating the strain number
    mouse: int
        nonnegative integer indicating the mouse number
    feature: {"AS", "F", "M_AS", "M_IS", "W"}
        "AS": Active state probability
        "F": Food consumed (g)
        "M_AS": Movement outside homebase
        "M_IS": Movement inside homebase
        "W": Water consumed (g)
    bin_width: int
        number of minutes per bin; must satisfy 1 <= bin_width <= 1440

    Returns
    -------
    ts: pandas.Series
        one value per bin, with int(24 * 60 / bin_width) bins for every
        day found in the interval data of the mouse. The index starts at
        2014-01-01 and advances in steps of `bin_width` minutes.

    Raises
    ------
    ValueError
        If `strain` or `mouse` is not a non-negative integer, if
        `feature` is not in INTERVAL_FEATURES, or if `bin_width` is not
        an integer in [1, 1440].
    """
    # Input Check
    if (not isinstance(strain, int)) or (strain < 0):
        raise ValueError('Strain must be a non-negative integer')
    if (not isinstance(mouse, int)) or (mouse < 0):
        raise ValueError('Mouse value must be a non-negative integer')
    if feature not in INTERVAL_FEATURES:
        raise ValueError(
            'Input value must in {"AS", "F", "M_AS", "M_IS", "W"}')
    # Zero must be rejected here: it would otherwise divide by zero when
    # the number of bins per day is computed below.
    if (not isinstance(bin_width, int)) or not 0 < bin_width <= 1440:
        raise ValueError(
            'Bin width (minutes) must be a positive integer '
            'no greater than 1440')

    # load data
    intervals = data.load_intervals(feature)
    mouse_data = intervals.loc[(intervals['strain'] == strain)
                               & (intervals['mouse'] == mouse)]

    # build the flat array holding bin_count bins for every day
    days = sorted(np.unique(mouse_data['day']))
    bin_count = int(24 * 60 / bin_width)
    # bin width converted from minutes; assumes 'start'/'stop' are
    # expressed in seconds -- TODO confirm against the interval data.
    bin_length = bin_width * 60
    time_behaviour = np.repeat(0.0, bin_count * len(days))

    for j in days:
        df = mouse_data.loc[mouse_data['day'] == j]
        start_end = data.load_start_time_end_time(strain, mouse, j)
        # Express interval bounds relative to the start of the day.
        start = np.asarray(df['start']) - start_end[0]
        end = np.asarray(df['stop']) - start_end[0]
        # NOTE(review): the offset uses the day *value*, not its position
        # in `days`, so this assumes days are numbered 0..len(days)-1
        # without gaps -- confirm against the data.
        offset = j * bin_count

        for start_time, end_time in zip(start, end):
            start_index = int(start_time / bin_length)
            end_index = int(end_time / bin_length)
            if start_index == end_index:
                # Interval lies entirely inside one bin.
                time_behaviour[start_index + offset] += end_time - start_time
            else:
                # Tail of the first bin, head of the last bin, and every
                # bin strictly in between is fully covered (the slice is
                # empty when the two bins are adjacent).
                time_behaviour[start_index + offset] += (
                    bin_length * (start_index + 1) - start_time)
                time_behaviour[end_index + offset] += end_time % bin_length
                time_behaviour[start_index + offset + 1:
                               end_index + offset] += bin_length

    if feature in ('F', 'W'):
        # Distribute each day's total consumption over the bins in
        # proportion to the time accumulated in each bin.
        column = 'Food' if feature == 'F' else 'Water'
        all_feature = data.load_all_features()
        group = all_feature[["strain", "mouse", "day", column]].groupby(
            ["strain", "mouse", "day"]).sum().reset_index()
        daily_totals = group.loc[(group['strain'] == strain)
                                 & (group['mouse'] == mouse), column]
        # Rows come out of the groupby ordered by day, so the position in
        # `daily_totals` is used as the day index into the flat array.
        for i, amount in enumerate(daily_totals):
            day_bins = slice(bin_count * i, bin_count * (i + 1))
            time_behaviour[day_bins] /= sum(time_behaviour[day_bins])
            time_behaviour[day_bins] *= amount

    if feature == 'AS':
        # Convert accumulated time into the fraction of each bin covered.
        time_behaviour /= bin_length

    ts = pd.Series(time_behaviour,
                   index=pd.date_range('01/01/2014',
                                       periods=len(time_behaviour),
                                       freq=str(bin_width) + 'min'))

    return ts
Example #17
0
def aggregate_interval(strain, mouse, feature, bin_width):
    """
    Aggregate the interval data of one mouse into fixed-width time bins
    and return the result as a time series.

    For every interval, the time it overlaps each bin is accumulated.
    The accumulated values are then post-processed per feature:

    - "AS": divided by the bin length, giving the fraction of each bin
      covered by intervals.
    - "F" / "W": rescaled day by day so that each day's bins sum to that
      day's total 'Food' / 'Water' value from `data.load_all_features()`.
    - "M_AS" / "M_IS": left as accumulated time.

    Parameters
    ----------
    strain: int
        nonnegative integer indicating the strain number
    mouse: int
        nonnegative integer indicating the mouse number
    feature: {"AS", "F", "M_AS", "M_IS", "W"}
        "AS": Active state probability
        "F": Food consumed (g)
        "M_AS": Movement outside homebase
        "M_IS": Movement inside homebase
        "W": Water consumed (g)
    bin_width: int
        number of minutes per bin; must satisfy 1 <= bin_width <= 1440

    Returns
    -------
    ts: pandas.Series
        one value per bin, with int(24 * 60 / bin_width) bins for every
        day found in the interval data of the mouse. The index starts at
        2014-01-01 and advances in steps of `bin_width` minutes.

    Raises
    ------
    ValueError
        If `strain` or `mouse` is not a non-negative integer, if
        `feature` is not in INTERVAL_FEATURES, or if `bin_width` is not
        an integer in [1, 1440].
    """
    # Input Check
    if (not isinstance(strain, int)) or (strain < 0):
        raise ValueError(
            'Strain must be a non-negative integer')
    if (not isinstance(mouse, int)) or (mouse < 0):
        raise ValueError(
            'Mouse value must be a non-negative integer')
    if feature not in INTERVAL_FEATURES:
        raise ValueError(
            'Input value must in {"AS", "F", "M_AS", "M_IS", "W"}')
    # Zero must be rejected here: it would otherwise divide by zero when
    # the number of bins per day is computed below.
    if (not isinstance(bin_width, int)) or not 0 < bin_width <= 1440:
        raise ValueError(
            'Bin width (minutes) must be a positive integer '
            'no greater than 1440')

    # load data
    intervals = data.load_intervals(feature)
    mouse_data = intervals.loc[
        (intervals['strain'] == strain) & (intervals['mouse'] == mouse)]

    # build the flat array holding bin_count bins for every day
    days = sorted(np.unique(mouse_data['day']))
    bin_count = int(24 * 60 / bin_width)
    # bin width converted from minutes; assumes 'start'/'stop' are
    # expressed in seconds -- TODO confirm against the interval data.
    bin_length = bin_width * 60
    time_behaviour = np.repeat(0.0, bin_count * len(days))

    for j in days:
        df = mouse_data.loc[mouse_data['day'] == j]
        start_end = data.load_start_time_end_time(strain, mouse, j)
        # Express interval bounds relative to the start of the day.
        start = np.asarray(df['start']) - start_end[0]
        end = np.asarray(df['stop']) - start_end[0]
        # NOTE(review): the offset uses the day *value*, not its position
        # in `days`, so this assumes days are numbered 0..len(days)-1
        # without gaps -- confirm against the data.
        offset = j * bin_count

        for start_time, end_time in zip(start, end):
            start_index = int(start_time / bin_length)
            end_index = int(end_time / bin_length)
            if start_index == end_index:
                # Interval lies entirely inside one bin.
                time_behaviour[start_index + offset] += end_time - start_time
            else:
                # Tail of the first bin, head of the last bin, and every
                # bin strictly in between is fully covered (the slice is
                # empty when the two bins are adjacent).
                time_behaviour[start_index + offset] += (
                    bin_length * (start_index + 1) - start_time)
                time_behaviour[end_index + offset] += end_time % bin_length
                time_behaviour[start_index + offset + 1:
                               end_index + offset] += bin_length

    if feature in ('F', 'W'):
        # Distribute each day's total consumption over the bins in
        # proportion to the time accumulated in each bin.
        column = 'Food' if feature == 'F' else 'Water'
        all_feature = data.load_all_features()
        group = all_feature[["strain", "mouse", "day", column]].groupby(
            ["strain", "mouse", "day"]).sum().reset_index()
        daily_totals = group.loc[(group['strain'] == strain) &
                                 (group['mouse'] == mouse), column]
        # Rows come out of the groupby ordered by day, so the position in
        # `daily_totals` is used as the day index into the flat array.
        for i, amount in enumerate(daily_totals):
            day_bins = slice(bin_count * i, bin_count * (i + 1))
            time_behaviour[day_bins] /= sum(time_behaviour[day_bins])
            time_behaviour[day_bins] *= amount

    if feature == 'AS':
        # Convert accumulated time into the fraction of each bin covered.
        time_behaviour /= bin_length

    ts = pd.Series(time_behaviour, index=pd.date_range(
        '01/01/2014', periods=len(time_behaviour),
        freq=str(bin_width) + 'min'))

    return ts