def test_hc_param():
    # get_optimal_hc_params must pick its method and distance from the
    # sets of values listed below.
    allowed_methods = ['ward', 'average', 'complete']
    allowed_dists = ['cityblock', 'euclidean', 'chebychev']

    prepared = clustering.prep_data(data.load_all_features())
    chosen_method, chosen_dist = clustering.get_optimal_hc_params(prepared)

    assert chosen_method in allowed_methods
    assert chosen_dist in allowed_dists
def total_amount(strain, mouse, day, feature):
    """
    Sum one feature column over the rows of a single mouse-day.

    Parameters
    ----------
    strain : int
    mouse : int
    day : int
    feature : {'F', 'W', 'L'}
        Code of the feature to total: 'F' for food, 'W' for water,
        'L' for locomotion.

    Returns
    -------
    The sum of the selected column ('Food', 'Water' or 'Distance') over
    the rows selected for the given strain, mouse and day.

    Raises
    ------
    ValueError
        If no rows are selected for the mouse-day, or if `feature` is not
        one of the accepted codes.
    """
    column_by_code = {'W': 'Water', 'F': 'Food', 'L': 'Distance'}

    all_rows = data.load_all_features()
    selected = _select_strain_mouse_day_in_data_frame(
        all_rows, strain, mouse, day)

    # Guard clauses, checked in this order: empty selection first, then
    # the feature code.
    if not selected.shape[0]:
        raise ValueError(
            "No consumption data available for specified mouse-day.")
    if feature not in column_by_code:
        raise ValueError("Feature must be one of :" +
                         str(set(column_by_code)))

    column = column_by_code[feature]
    return selected[column].sum()
def total_amount(strain, mouse, day, feature):
    """
    Return the total amount consumed/moved in one mouse-day.

    Parameters
    ----------
    strain : int
    mouse : int
    day : int
    feature : {'F', 'W', 'L'}
        The feature used. 'F' for food, 'W' for water, 'L' for locomotion.

    Returns
    -------
    The total of the matching feature column for the mouse-day.

    Raises
    ------
    ValueError
        When the mouse-day selection is empty or `feature` is not a
        recognised code.
    """
    frame = _select_strain_mouse_day_in_data_frame(
        data.load_all_features(), strain, mouse, day)
    if frame.shape[0] == 0:
        raise ValueError(
            "No consumption data available for specified mouse-day.")

    # Map the one-letter code to the column holding that quantity.
    columns = {'W': 'Water', 'F': 'Food', 'L': 'Distance'}
    target = columns.get(feature)
    if target is None:
        raise ValueError("Feature must be one of :" + str(set(columns)))

    return frame[target].sum()
def test_fit_kmeans():
    # get_optimal_fit_kmeans should return a pair: one score per candidate
    # cluster count, and one labelling per candidate cluster count.
    cluster_counts = range(2, 17)
    prepared = clustering.prep_data(data.load_all_features())
    features = prepared[:, 2:]

    result = clustering.get_optimal_fit_kmeans(
        features, num_clusters=cluster_counts, raw=False)

    assert len(result) == 2
    scores = result[0]
    labelings = result[1]
    assert len(scores) == 15
    assert len(labelings[0]) == 170
    assert len(set(labelings[14])) <= 16
    # silhouette scores must lie strictly between -1 and 1
    assert all(score < 1 for score in scores)
    assert all(score > -1 for score in scores)
def test_fit_hc():
    # fit_hc should return a pair: one score per candidate cluster count,
    # and one labelling per candidate cluster count.
    cluster_counts = range(2, 17)
    prepared = clustering.prep_data(data.load_all_features())
    features = prepared[:, 2:]

    result = clustering.fit_hc(
        features, "average", "chebychev", num_clusters=cluster_counts)

    assert len(result) == 2
    scores = result[0]
    labelings = result[1]
    assert len(scores) == 15
    assert len(labelings[0]) == 170
    assert len(set(labelings[14])) <= 16
    # silhouette scores must lie strictly between -1 and 1
    assert all(score < 1 for score in scores)
    assert all(score > -1 for score in scores)
def test_fit_kmeans():
    # Verify the shape and value range of what get_optimal_fit_kmeans
    # returns for cluster counts 2 through 16.
    all_features = data.load_all_features()
    mouse_day_matrix = clustering.prep_data(all_features)[:, 2:]
    fit = clustering.get_optimal_fit_kmeans(
        mouse_day_matrix,
        num_clusters=range(2, 17),
        raw=False,
    )

    assert len(fit) == 2
    assert len(fit[0]) == 15
    assert len(fit[1][0]) == 170
    distinct_labels = set(fit[1][14])
    assert len(distinct_labels) <= 16
    # silhouette score should be between -1 and 1
    assert all(sil < 1 for sil in fit[0])
    assert all(sil > -1 for sil in fit[0])
def test_prep_data():
    # prep_data on the full feature table must yield a 170 x 20 array.
    expected_shape = (170, 20)
    prepared = clustering.prep_data(data.load_all_features())
    assert prepared.shape == expected_shape
def test_all_features_loader():
    # load_all_features must return a table with 21131 rows and 13 columns.
    expected_shape = (21131, 13)
    loaded = data.load_all_features()
    assert loaded.shape == expected_shape
def test_intervals():
    # Placeholder test: the expected measure of 11.0 has not been
    # confirmed to be the correct value.
    feature_table = data.load_all_features()
    measured = Intervals(feature_table).measure()
    assert measured == 11.0
from mousestyles import data
from mousestyles.classification import clustering
from mousestyles.visualization import plot_clustering

# Load the full feature table.
mouse_data = data.load_all_features()

# Prepare per-mouse data (not melted, with the std and rescale options on).
mouse_dayavgstd_rsl = clustering.prep_data(
    mouse_data, melted=False, std=True, rescale=True)

# Pick the hierarchical-clustering method and distance.
method, dist = clustering.get_optimal_hc_params(
    mouse_day=mouse_dayavgstd_rsl)

# Fit hierarchical clustering on columns 2 onward, for 2 to 16 clusters.
sils_hc, labels_hc = clustering.fit_hc(
    mouse_day_X=mouse_dayavgstd_rsl[:, 2:],
    method=method,
    dist=dist,
    num_clusters=range(2, 17),
)

# Plot the dendrogram; Z keeps the value plot_dendrogram returns
# (presumably the distance matrix -- confirm against plot_clustering).
Z = plot_clustering.plot_dendrogram(
    mouse_day=mouse_dayavgstd_rsl,
    method=method,
    dist=dist,
)
def test_all_features_loader():
    # The loaded feature table must have 21131 rows and 13 columns.
    expected_shape = (21131, 13)
    loaded = data.load_all_features()
    assert_equal(loaded.shape, expected_shape)
from mousestyles import data
from mousestyles.classification import clustering
from mousestyles.visualization import plot_clustering

# Load the full feature table.
mouse_data = data.load_all_features()

# Prepare per-mouse data (not melted, with the std and rescale options on).
mouse_dayavgstd_rsl = clustering.prep_data(
    mouse_data, melted=False, std=True, rescale=True)

# Pick the hierarchical-clustering method and distance.
method, dist = clustering.get_optimal_hc_params(
    mouse_day=mouse_dayavgstd_rsl)

# Fit hierarchical clustering on columns 2 onward, for 2 to 16 clusters.
sils_hc, labels_hc = clustering.fit_hc(
    mouse_day_X=mouse_dayavgstd_rsl[:, 2:],
    method=method,
    dist=dist,
    num_clusters=range(2, 17),
)

# Cross column 0 of the prepared data with the fifth labelling
# (index 4 of labels_hc).
result = clustering.cluster_in_strain(
    mouse_dayavgstd_rsl[:, 0], labels_hc[4])

# Plot the resulting counts, grouped by cluster.
plot_clustering.plot_strain_cluster(
    count_data=result, groupby_cluster=True)
def aggregate_interval(strain, mouse, feature, bin_width):
    """
    Aggregate interval data into fixed-width time bins as a time series.

    Parameters
    ----------
    strain: int
        nonnegative integer indicating the strain number
    mouse: int
        nonnegative integer indicating the mouse number
    feature: {"AS", "F", "M_AS", "M_IS", "W"}
        "AS": Active state probability
        "F": Food consumed (g)
        "M_AS": Movement outside homebase
        "M_IS": Movement inside homebase
        "W": Water consumed (g)
    bin_width: int
        number of minutes of time interval for data aggregation;
        must be between 1 and 1440 inclusive

    Returns
    -------
    ts: pandas.Series
        a time series with ``int(24 * 60 / bin_width)`` values per day
        found for the mouse, indexed from '01/01/2014' in steps of
        `bin_width` minutes. For "F" and "W" each day's bins are rescaled
        to sum to that day's total Food/Water; for "AS" the values are
        divided by the bin length.

    Raises
    ------
    ValueError
        If any argument fails the input checks below.
    """
    # Input Check
    if (not isinstance(strain, int)) or (strain < 0):
        raise ValueError('Strain must be a non-negative integer')
    if (not isinstance(mouse, int)) or (mouse < 0):
        raise ValueError('Mouse value must be a non-negative integer')
    if feature not in INTERVAL_FEATURES:
        raise ValueError(
            'Input value must in {"AS", "F", "M_AS", "M_IS", "W"}')
    # A zero width is rejected too: it would otherwise reach the bin-count
    # division below and fail with ZeroDivisionError. The message text is
    # kept unchanged for callers that match on it.
    if (not isinstance(bin_width, int)) or bin_width <= 0 or bin_width > 1440:
        raise ValueError(
            'Bin width (minutes) must be a non-negative integer below 1440')

    # load data
    intervals = data.load_intervals(feature)
    mouse_data = intervals.loc[(intervals['strain'] == strain) &
                               (intervals['mouse'] == mouse)]

    # build data frame
    days = sorted(np.unique(mouse_data['day']))
    bin_count = int(24 * 60 / bin_width)
    bin_length = bin_width * 60
    time_behaviour = np.repeat(0.0, bin_count * len(days))

    for j in days:
        df = mouse_data.loc[mouse_data['day'] == j]
        start_end = data.load_start_time_end_time(strain, mouse, j)
        # Express interval bounds relative to the start of the day.
        start = np.asarray(df['start']) - start_end[0]
        end = np.asarray(df['stop']) - start_end[0]
        # NOTE(review): the day label itself is used as the day position,
        # which assumes labels run 0..len(days)-1 -- confirm with the data.
        offset = j * bin_count
        for start_time, end_time in zip(start, end):
            start_index = int(start_time / bin_length)
            end_index = int(end_time / bin_length)
            if start_index == end_index:
                # Interval lies entirely inside one bin.
                time_behaviour[start_index + offset] += end_time - start_time
            else:
                # Partial first bin, partial last bin, full bins between
                # (the slice is empty when the two bins are adjacent).
                time_behaviour[start_index + offset] += (
                    bin_length * (start_index + 1) - start_time)
                time_behaviour[end_index + offset] += end_time % bin_length
                time_behaviour[start_index + offset + 1:
                               end_index + offset] += bin_length

    if feature == 'F' or feature == 'W':
        all_feature = data.load_all_features()
        group = all_feature[[
            "strain", "mouse", "day", "hour", "Food", "Water"
        ]].groupby(["strain", "mouse", "day"]).sum()
        group = group.reset_index()
        mouse_data = group.loc[(group['strain'] == strain) &
                               (group['mouse'] == mouse)].copy()
        # Renumber days by position so they line up with the bin layout.
        mouse_data.loc[:, 'day'] = np.arange(len(mouse_data))
        amount_column = 'Food' if feature == 'F' else 'Water'
        for i in mouse_data['day'].astype('int'):
            # Take the single matching element explicitly; float() on a
            # one-element Series is deprecated in pandas.
            day_amount = float(
                mouse_data[amount_column][mouse_data['day'] == i].iloc[0])
            day_bins = slice(bin_count * i, bin_count * (i + 1))
            # Rescale the day's bins so they sum to the day's total amount.
            time_behaviour[day_bins] /= sum(time_behaviour[day_bins])
            time_behaviour[day_bins] *= day_amount

    if feature == 'AS':
        # Turn time per bin into a fraction of the bin length.
        time_behaviour /= bin_length

    ts = pd.Series(time_behaviour,
                   index=pd.date_range('01/01/2014',
                                       periods=len(time_behaviour),
                                       freq=str(bin_width) + 'min'))
    return ts
def aggregate_interval(strain, mouse, feature, bin_width):
    """
    Aggregate the interval data based on n-minute time intervals, return a
    time series.

    Parameters
    ----------
    strain: int
        nonnegative integer indicating the strain number
    mouse: int
        nonnegative integer indicating the mouse number
    feature: {"AS", "F", "M_AS", "M_IS", "W"}
        "AS": Active state probability
        "F": Food consumed (g)
        "M_AS": Movement outside homebase
        "M_IS": Movement inside homebase
        "W": Water consumed (g)
    bin_width: int
        number of minutes of time interval for data aggregation;
        must be between 1 and 1440 inclusive

    Returns
    -------
    ts: pandas.Series
        a pandas time series holding ``int(24 * 60 / bin_width)`` bins for
        each day found for the mouse, indexed from '01/01/2014' at a
        frequency of `bin_width` minutes

    Raises
    ------
    ValueError
        If strain or mouse is not a non-negative integer, feature is not
        in INTERVAL_FEATURES, or bin_width is not an integer in [1, 1440].
    """
    # Input Check
    if (not isinstance(strain, int)) or (strain < 0):
        raise ValueError(
            'Strain must be a non-negative integer')
    if (not isinstance(mouse, int)) or (mouse < 0):
        raise ValueError(
            'Mouse value must be a non-negative integer')
    if feature not in INTERVAL_FEATURES:
        raise ValueError(
            'Input value must in {"AS", "F", "M_AS", "M_IS", "W"}')
    # bin_width == 0 used to slip through and crash with ZeroDivisionError
    # when the bin count is computed; it now raises ValueError. The message
    # is left as-is for callers that compare against it.
    if (not isinstance(bin_width, int)) or bin_width <= 0 or bin_width > 1440:
        raise ValueError(
            'Bin width (minutes) must be a non-negative integer below 1440')

    # load data
    intervals = data.load_intervals(feature)
    mouse_data = intervals.loc[
        (intervals['strain'] == strain) & (intervals['mouse'] == mouse)]

    # build data frame
    days = sorted(np.unique(mouse_data['day']))
    bin_count = int(24 * 60 / bin_width)
    bin_length = bin_width * 60
    time_behaviour = np.repeat(0.0, bin_count * len(days))

    for j in days:
        df = mouse_data.loc[mouse_data['day'] == j]
        start_end = data.load_start_time_end_time(strain, mouse, j)
        # Shift the interval bounds so they are relative to the day start.
        start = np.asarray(df['start']) - start_end[0]
        end = np.asarray(df['stop']) - start_end[0]
        # NOTE(review): indexing by the day label assumes the labels are
        # 0..len(days)-1 -- confirm against the interval data.
        day_offset = j * bin_count
        for start_time, end_time in zip(start, end):
            first_bin = int(start_time / bin_length) + day_offset
            last_bin = int(end_time / bin_length) + day_offset
            if first_bin == last_bin:
                # The whole interval falls in a single bin.
                time_behaviour[first_bin] += end_time - start_time
                continue
            # Tail of the first bin, head of the last bin, and every bin
            # strictly in between (none when the bins are adjacent).
            first_bin_end = bin_length * (first_bin - day_offset + 1)
            time_behaviour[first_bin] += first_bin_end - start_time
            time_behaviour[last_bin] += end_time % bin_length
            time_behaviour[first_bin + 1:last_bin] += bin_length

    if feature == 'F' or feature == 'W':
        all_feature = data.load_all_features()
        group = all_feature[
            ["strain", "mouse", "day", "hour", "Food", "Water"]].groupby(
            ["strain", "mouse", "day"]).sum()
        group = group.reset_index()
        mouse_data = group.loc[(group['strain'] == strain) &
                               (group['mouse'] == mouse)].copy()
        # Replace day labels by their position to match the bin layout.
        mouse_data.loc[:, 'day'] = np.arange(len(mouse_data))
        amounts = mouse_data['Food' if feature == 'F' else 'Water']
        for i in mouse_data['day'].astype('int'):
            # Extract the scalar explicitly: float() on a single-element
            # Series is deprecated in pandas.
            day_amount = float(amounts[mouse_data['day'] == i].iloc[0])
            day_bins = slice(bin_count * i, bin_count * (i + 1))
            # Normalise the day's bins, then scale to the day's total.
            time_behaviour[day_bins] /= sum(time_behaviour[day_bins])
            time_behaviour[day_bins] *= day_amount

    if feature == 'AS':
        # Convert time per bin into a fraction of the bin length.
        time_behaviour /= bin_length

    ts = pd.Series(time_behaviour, index=pd.date_range(
        '01/01/2014', periods=len(time_behaviour),
        freq=str(bin_width) + 'min'))
    return ts