def test_feature_load_input():
    """
    Check that the data loaders reject bad arguments with the expected
    exception type and message.
    """
    want = 'Input value must be one of {"AS", "F", "IS", "M_AS", "M_IS", "W"}'

    # an unknown feature name is refused by load_intervals
    with pytest.raises(ValueError) as bad_feature:
        data.load_intervals('A')
    assert bad_feature.value.args[0] == want

    # a negative index is refused by load_movement
    with pytest.raises(ValueError) as negative_index:
        data.load_movement(-1, 0, 0)
    assert negative_index.value.args[0] == \
        "Input values need to be nonnegative"

    # a float index is refused by load_movement
    with pytest.raises(TypeError) as float_index:
        data.load_movement(0.0, 0, 0)
    assert float_index.value.args[0] == "Input values need to be integer"
def getdistance(strain, mouse, day):
    """
    Return the distances between consecutive recorded positions, keeping
    only those strictly larger than 1.

    Parameters
    ----------
    strain : int
        the strain number of the mouse
    mouse : int
        the mouse number in its strain
    day : int
        the day number

    Returns
    -------
    cut_dist : an array of number
        The Euclidean step lengths computed from the "x" and "y" columns
        of the movement data, restricted to values greater than 1.

    Examples
    --------
    >>> getdistance (0, 0, 0)
    array([ 1.00648944,  1.02094319,  1.0178885 , ...,  1.00099351,
            1.01191156,  1.00423354])
    """
    movement = data.load_movement(strain, mouse, day)
    # coordinate differences between each record and the next one
    dx = np.diff(movement["x"])
    dy = np.diff(movement["y"])
    step_lengths = np.sqrt(dx**2 + dy**2)
    return step_lengths[step_lengths > 1]
def get_travel_distances(strain=0, mouse=0, day=0):
    """
    Get the distances travelled between consecutive movement records for
    this strain, this mouse, on this day.

    Parameters
    ----------
    strain: int {0, 1, 2}
        The strain of mouse to test
    mouse: int {0, 1, 2, 3}
        The mouse twin id with in the strain
    day: int {0, 1, ..., 11}
        The day to calculate the distance

    Returns
    -------
    x: np.ndarray, one-dimensional
        The step lengths computed from the ``x`` and ``y`` columns of the
        movement data, keeping only the values that are at least 1.

    Examples:
    >>> get_travel_distances(0, 0, 0)[:3]
    array([ 1.00648944,  1.02094319,  1.0178885 ])
    """
    movement = load_movement(strain=strain, mouse=mouse, day=day)
    squared_steps = movement.x.diff()**2 + movement.y.diff()**2
    # the first diff has no predecessor (NaN), so it is dropped
    steps = np.array(np.sqrt(squared_steps))[1:]
    return steps[steps >= 1]
def get_travel_distances(strain=0, mouse=0, day=0):
    """
    Get the distances travelled between consecutive movement records for
    this strain, this mouse, on this day.

    Parameters
    ----------
    strain: int {0, 1, 2}
        The strain of mouse to test
    mouse: int {0, 1, 2, 3}
        The mouse twin id with in the strain
    day: int {0, 1, ..., 11}
        The day to calculate the distance

    Returns
    -------
    x: np.ndarray, one-dimensional
        The step lengths computed from the ``x`` and ``y`` columns of the
        movement data, keeping only the values that are at least 1.

    Examples:
    >>> get_travel_distances(0, 0, 0)[:3]
    array([ 1.00648944,  1.02094319,  1.0178885 ])
    """
    movement = load_movement(strain=strain, mouse=mouse, day=day)
    squared_steps = movement.x.diff()**2 + movement.y.diff()**2
    # the first diff has no predecessor (NaN), so it is dropped
    steps = np.array(np.sqrt(squared_steps))[1:]
    return steps[steps >= 1]
def test_load_movement_and_intervals():
    """
    Check load_movement_and_intervals against plain load_movement: an empty
    feature list changes nothing, and one feature adds exactly one column,
    whether it is given as a list or as a bare string.
    """
    plain = data.load_movement(1, 1, 1)

    # don't add any features
    no_features = data.load_movement_and_intervals(1, 1, 1, [])
    assert np.all(plain == no_features)

    from_list = data.load_movement_and_intervals(1, 1, 1, ['AS'])
    from_string = data.load_movement_and_intervals(1, 1, 1, 'AS')
    n_rows, n_cols = plain.shape[0], plain.shape[1]
    # adds one column
    assert from_list.shape[1] == n_cols + 1
    # same number of rows
    assert from_list.shape[0] == n_rows
    assert np.all(from_list == from_string)
def test_load_movement_and_intervals():
    """
    Check load_movement_and_intervals against plain load_movement: an empty
    feature list changes nothing, and one feature adds exactly one column,
    whether it is given as a list or as a bare string.
    """
    plain = data.load_movement(1, 1, 1)

    # don't add any features
    no_features = data.load_movement_and_intervals(1, 1, 1, [])
    assert np.all(plain == no_features)

    from_list = data.load_movement_and_intervals(1, 1, 1, ['AS'])
    from_string = data.load_movement_and_intervals(1, 1, 1, 'AS')
    n_rows, n_cols = plain.shape[0], plain.shape[1]
    # adds one column
    assert from_list.shape[1] == n_cols + 1
    # same number of rows
    assert from_list.shape[0] == n_rows
    assert np.all(from_list == from_string)
def test_filter_path_input():
    """
    Check that filter_path raises ValueError with the expected message when
    its third argument is negative or zero.
    """
    movement = data.load_movement(0, 0, 0)
    paths = path_diversity.path_index(movement, 1, 1)

    # first a negative number, then zero
    for bad_value in (-1, 0):
        with pytest.raises(ValueError) as excinfo:
            path_diversity.filter_path(movement, paths, bad_value)
        assert excinfo.value.args[0] == "Input values need to be positive"
def test_path_input():
    """
    Check that path_index raises the expected error type and message for
    negative, zero and non-integer arguments.
    """
    movement = data.load_movement(0, 0, 0)

    # negative numbers first, then zeros
    for bad_pair in ((-1, -1), (0, 0)):
        with pytest.raises(ValueError) as excinfo:
            path_index(movement, bad_pair[0], bad_pair[1])
        assert excinfo.value.args[0] == "Input values need to be positive"

    # min_path_length cannot be floating number
    with pytest.raises(TypeError) as float_length:
        path_index(movement, 1, 1.5)
    assert float_length.value.args[0] == "min_path_length needs to be integer"
def test_dist_speed_input(): movement = data.load_movement(0, 0, 0) # Check if function raises the correct type of errors. # Input negative numbers with pytest.raises(ValueError) as excinfo: path_diversity.get_dist_speed(movement, -1, -1) assert excinfo.value.args[0] == "Start and end indices must be positive" # Input non-integers with pytest.raises(TypeError) as excinfo: path_diversity.get_dist_speed(movement, 0.1, 0.1) assert excinfo.value.args[0] == "Start and end indices must be integers" # Input start index greater than end index with pytest.raises(ValueError) as excinfo: path_diversity.get_dist_speed(movement, 500, 2) assert excinfo.value.args[ 0] == "Start index must be smaller than end index" # Input indices that encompass data outside of true data length with pytest.raises(ValueError) as excinfo: path_diversity.get_dist_speed(movement, 0, len(movement)) assert excinfo.value.args[0] == "Number of observations must be less than \
def test_dist_speed_input(): movement = data.load_movement(0, 0, 0) # Check if function raises the correct type of errors. # Input negative numbers with pytest.raises(ValueError) as excinfo: get_dist_speed.get_dist_speed(movement, -1, -1) assert excinfo.value.args[0] == "Start and end indices must be positive" # Input non-integers with pytest.raises(TypeError) as excinfo: get_dist_speed.get_dist_speed(movement, 0.1, 0.1) assert excinfo.value.args[0] == "Start and end indices must be integers" # Input start index greater than end index with pytest.raises(ValueError) as excinfo: get_dist_speed.get_dist_speed(movement, 500, 2) assert excinfo.value.args[ 0] == "Start index must be smaller than end index" # Input indices that encompass data outside of true data length with pytest.raises(ValueError) as excinfo: get_dist_speed.get_dist_speed(movement, 0, len(movement)) assert excinfo.value.args[0] == "Number of observations must be less than \
def test_detect_noise_input():
    """
    Check that detect_noise raises ValueError with the expected message when
    either of its last two arguments is negative or zero.
    """
    movement = data.load_movement(0, 0, 0)
    paths = path_diversity.path_index(movement, 1, 1)

    # (angle_threshold, delta_t) pairs, in the order:
    # negative angle_threshold, negative delta_t,
    # zero angle_threshold, zero delta_t
    bad_pairs = [(-1, 1), (1, -1), (0, 1), (1, 0)]
    for angle_threshold, delta_t in bad_pairs:
        with pytest.raises(ValueError) as excinfo:
            path_diversity.detect_noise(movement, paths,
                                        angle_threshold, delta_t)
        assert excinfo.value.args[0] == "Input values need to be positive"
def test_path():
    """
    Check that path_index returns the expected first five paths for the
    movement data of (0, 0, 0).
    """
    expected_head = [[22, 53], [55, 59], [67, 89], [91, 95], [96, 114]]
    movement = data.load_movement(0, 0, 0)
    found = path_index(movement, 1, 1)
    assert found[:5] == expected_head
# Example script: draw every path found in one movement data set on a
# single figure.
import numpy as np
import matplotlib.pyplot as plt
from mousestyles.data import load_movement
from mousestyles.path_diversity import path_index

# Movement records loaded with arguments (0, 0, 0) and the paths that
# path_index finds in them with arguments (1, 1).
movement = load_movement(0, 0, 0)
paths = path_index(movement, 1, 1)

# Fixed axis limits applied to the figure.
xlim = [-16.25, 3.75]
ylim = [1.0, 43.0]

# Each entry of paths is used as a [first, last] pair of row positions;
# the "+ 1" makes the slice include the last row.
for sep in paths:
    path = movement[sep[0]:sep[1] + 1]
    # faint blue lines so that overlapping paths remain visible
    plt.plot(path['x'], path['y'], 'b', linewidth=1, alpha=.1)
# NOTE(review): the collapsed source does not show whether the axis calls
# below were inside the loop; the resulting figure is the same whenever
# paths is non-empty.
plt.xlabel('x-coordinate')
plt.xlim(xlim[0], xlim[1])
plt.ylabel('y-coordinate')
plt.ylim(ylim[0], ylim[1])
plt.title("Example of path plot")
plt.show()
def hypo_powerLaw_null(strain, mouse, day, law_est=0, seed=-1):
    """
    Return the outcome from GLRT with null hypothesis law distribution.

    Description
    -----------
    This function uses the Generalized Likelihood Ratio Test to test the
    goodness of fit: in other words, which distribution is more likely.
    The power law distribution is the null and the exponential distribution
    is the alternative. The test statistic is evaluated at the maximum
    likelihood estimates of the parameters.

    The p-value is obtained by simulation: 1000 samples are drawn with
    ``random_powerlaw`` and the same statistic is computed on each of them.
    Therefore, although the p-value should be a constant given the data, it
    varies between calls unless ``seed`` is set.

    If the p-value is too small we reject the null, and we say the power
    law is not a better fit compared to the exponential distribution.

    Parameters
    ----------
    strain : int
        the strain number of the mouse
    mouse : int
        the mouse number in its strain
    day : int
        the day number
    law_est : double (optional)
        the estimated parameter in law distribution; when left at 0 it is
        estimated from the data
    seed : int (optional)
        value passed to ``np.random.seed``; the default -1 leaves the
        random state untouched

    Returns
    -------
    p_value : float
        the fraction of simulated statistics that are larger than the
        statistic observed on the data.

    Examples
    --------
    The result depends on the random draws, so no fixed output is shown.

    >>> hypo_powerLaw_null(0, 0, 0)  # doctest: +SKIP
    """
    if seed != -1:
        np.random.seed(seed)

    df = data.load_movement(strain, mouse, day)
    distance_vector = np.sqrt(np.diff(df["x"])**2 + np.diff(df["y"])**2)
    # only steps longer than 1 are kept
    cut_dist = distance_vector[distance_vector > 1]
    n = len(cut_dist)

    if law_est == 0:
        law_est = 1 + n / np.sum(np.log(cut_dist / np.min(cut_dist)))

    def glrt_stat(values):
        # One formula for both the observed data and the simulated samples,
        # so that the observed statistic is comparable with the simulated
        # null distribution: n * (log(sum(x) - n) - log(sum(log x)))
        # - law_est * sum(log x).
        sum_log = np.sum(np.log(values))
        return (n * (np.log(np.sum(values) - n) - np.log(sum_log)) -
                law_est * sum_log)

    test_stat = glrt_stat(cut_dist)
    sample_stat = np.array([glrt_stat(random_powerlaw(n, law_est))
                            for _ in range(1000)])
    # np.mean gives a true fraction regardless of integer-division rules
    p_value = np.mean(sample_stat > test_stat)
    return p_value
def test_filter_path():
    """
    Check that filter_paths returns the expected paths for the movement
    data of (0, 0, 0) with a third argument of 20.
    """
    expected = [[3082, 3181], [30835, 30970], [31346, 31557]]
    movement = data.load_movement(0, 0, 0)
    all_paths = path_index(movement, 1, 1)
    kept = filter_path.filter_paths(movement, all_paths, 20)
    assert kept == expected
def hypo_powerLaw_null(strain, mouse, day, law_est=0, seed=-1):
    """
    Return the outcome from GLRT with null hypothesis law distribution.

    This function uses the Generalized Likelihood Ratio Test to test the
    goodness of fit: in other words, which distribution is more likely.
    The power law distribution is the null and the exponential distribution
    is the alternative. The test statistic is evaluated at the maximum
    likelihood estimates of the parameters.

    The p-value is obtained by simulation: 1000 samples are drawn with
    ``random_powerlaw`` and the same statistic is computed on each of them.
    Therefore, although the p-value should be a constant given the data, it
    varies between calls unless ``seed`` is set.

    If the p-value is too small we reject the null, and we say the power
    law is not a better fit compared to the exponential distribution.

    Parameters
    ----------
    strain : int
        the strain number of the mouse
    mouse : int
        the mouse number in its strain
    day : int
        the day number
    law_est : double (optional)
        the estimated parameter in law distribution; when left at 0 it is
        estimated from the data
    seed : int (optional)
        value passed to ``np.random.seed``; the default -1 leaves the
        random state untouched

    Returns
    -------
    p_value : float
        the fraction of simulated statistics that are larger than the
        statistic observed on the data.

    Examples
    --------
    The result depends on the random draws, so no fixed output is shown.

    >>> hypo_powerLaw_null(0, 0, 0)  # doctest: +SKIP
    """
    if seed != -1:
        np.random.seed(seed)

    df = data.load_movement(strain, mouse, day)
    distance_vector = np.sqrt(np.diff(df["x"])**2 + np.diff(df["y"])**2)
    # only steps longer than 1 are kept
    cut_dist = distance_vector[distance_vector > 1]
    n = len(cut_dist)

    if law_est == 0:
        law_est = 1 + n / np.sum(np.log(cut_dist / np.min(cut_dist)))

    def glrt_stat(values):
        # One formula for both the observed data and the simulated samples,
        # so that the observed statistic is comparable with the simulated
        # null distribution: n * (log(sum(x) - n) - log(sum(log x)))
        # - law_est * sum(log x).
        sum_log = np.sum(np.log(values))
        return (n * (np.log(np.sum(values) - n) - np.log(sum_log)) -
                law_est * sum_log)

    test_stat = glrt_stat(cut_dist)
    sample_stat = np.array([glrt_stat(random_powerlaw(n, law_est))
                            for _ in range(1000)])
    # np.mean gives a true fraction regardless of integer-division rules
    p_value = np.mean(sample_stat > test_stat)
    return p_value
def test_movement_loader():
    """
    Check that load_movement returns a data frame of the correct dimension.
    """
    expected_shape = (39181, 4)
    assert data.load_movement(0, 0, 0).shape == expected_shape
def aggregate_movement(strain, mouse, bin_width):
    """
    Aggregate the movement data based on n-minute time intervals, return a
    time series.

    The distance between each pair of consecutive movement records is added
    to the time bin it falls in. A step whose start and end fall in
    different bins is split: the share proportional to the time elapsed in
    the end bin goes to the end bin, the remainder to the start bin.

    Parameters
    ----------
    strain: int
        nonnegative integer indicating the strain number
    mouse: int
        nonnegative integer indicating the mouse number
    bin_width: int
        number of minutes of time interval for data aggregation, between
        1 and 1440 inclusive

    Returns
    -------
    ts: pandas.Series
        a pandas time series of length (#day) * int(24 * 60 / bin_width),
        indexed by a date range starting at 01/01/2014 with a frequency
        of ``bin_width`` minutes

    Raises
    ------
    ValueError
        if ``strain`` or ``mouse`` is not a non-negative integer, or if
        ``bin_width`` is not an integer between 1 and 1440.
    """
    # Input Check
    if (not isinstance(strain, int)) or (strain < 0):
        raise ValueError('Strain must be a non-negative integer')
    if (not isinstance(mouse, int)) or (mouse < 0):
        raise ValueError('Mouse value must be a non-negative integer')
    # A zero width is rejected as well: it would divide by zero below.
    # The message text is kept unchanged for existing callers.
    if (not isinstance(bin_width, int)) or bin_width <= 0 or bin_width > 1440:
        raise ValueError(
            'Bin width (minutes) must be a non-negative integer below 1440')

    # determine number of days
    intervals = data.load_intervals('IS')
    mouse_data = intervals.loc[(intervals['strain'] == strain) &
                               (intervals['mouse'] == mouse)]
    days = sorted(np.unique(mouse_data['day']))

    # one slot per bin per day; bin_length is the bin width in seconds
    bin_count = int(24 * 60 / bin_width)
    bin_length = bin_width * 60
    time_movements = np.repeat(0.0, bin_count * len(days))

    # The offset uses the position of the day in the sorted list, not the
    # day label, so it always stays inside the array sized by len(days).
    for day_position, j in enumerate(days):
        M = data.load_movement(strain, mouse, day=int(j))
        times = M["t"].values
        coords = M[["x", "y"]].values
        dist = np.linalg.norm(coords[1:] - coords[0:-1], axis=1)

        # times are measured from the first value of start_end
        start_end = data.load_start_time_end_time(strain, mouse, j)
        start = times[0:-1] - start_end[0]
        end = times[1:] - start_end[0]

        offset = day_position * bin_count
        for start_time, end_time, step in zip(start, end, dist):
            start_index = int(start_time / bin_length)
            end_index = int(end_time / bin_length)
            if start_index == end_index:
                time_movements[start_index + offset] += step
            else:
                # share of the step attributed to the end bin
                end_share = end_time % \
                    bin_length / (end_time - start_time) * step
                time_movements[end_index + offset] += end_share
                time_movements[start_index + offset] += step - end_share

    ts = pd.Series(time_movements,
                   index=pd.date_range('01/01/2014',
                                       periods=len(time_movements),
                                       freq=str(bin_width) + 'min'))
    return ts
def aggregate_movement(strain, mouse, bin_width):
    """
    Aggregate the movement data based on n-minute time intervals, return a
    time series.

    The distance between each pair of consecutive movement records is added
    to the time bin it falls in. A step whose start and end fall in
    different bins is split: the share proportional to the time elapsed in
    the end bin goes to the end bin, the remainder to the start bin.

    Parameters
    ----------
    strain: int
        nonnegative integer indicating the strain number
    mouse: int
        nonnegative integer indicating the mouse number
    bin_width: int
        number of minutes of time interval for data aggregation, between
        1 and 1440 inclusive

    Returns
    -------
    ts: pandas.Series
        a pandas time series of length (#day) * int(24 * 60 / bin_width),
        indexed by a date range starting at 01/01/2014 with a frequency
        of ``bin_width`` minutes

    Raises
    ------
    ValueError
        if ``strain`` or ``mouse`` is not a non-negative integer, or if
        ``bin_width`` is not an integer between 1 and 1440.
    """
    # Input Check
    if (not isinstance(strain, int)) or (strain < 0):
        raise ValueError(
            'Strain must be a non-negative integer')
    if (not isinstance(mouse, int)) or (mouse < 0):
        raise ValueError(
            'Mouse value must be a non-negative integer')
    # A zero width is rejected as well: it would divide by zero below.
    # The message text is kept unchanged for existing callers.
    if (not isinstance(bin_width, int)) or bin_width <= 0 or bin_width > 1440:
        raise ValueError(
            'Bin width (minutes) must be a non-negative integer below 1440')

    # determine number of days
    intervals = data.load_intervals('IS')
    mouse_data = intervals.loc[
        (intervals['strain'] == strain) & (intervals['mouse'] == mouse)]
    days = sorted(np.unique(mouse_data['day']))

    # one slot per bin per day; bin_length is the bin width in seconds
    bin_count = int(24 * 60 / bin_width)
    bin_length = bin_width * 60
    time_movements = np.repeat(0.0, bin_count * len(days))

    # The offset uses the position of the day in the sorted list, not the
    # day label, so it always stays inside the array sized by len(days).
    for day_position, j in enumerate(days):
        M = data.load_movement(strain, mouse, day=int(j))
        times = M["t"].values
        coords = M[["x", "y"]].values
        dist = np.linalg.norm(coords[1:] - coords[0:-1], axis=1)

        # times are measured from the first value of start_end
        start_end = data.load_start_time_end_time(strain, mouse, j)
        start = times[0:-1] - start_end[0]
        end = times[1:] - start_end[0]

        offset = day_position * bin_count
        for start_time, end_time, step in zip(start, end, dist):
            start_index = int(start_time / bin_length)
            end_index = int(end_time / bin_length)
            if start_index == end_index:
                time_movements[start_index + offset] += step
            else:
                # share of the step attributed to the end bin
                end_share = end_time % \
                    bin_length / (end_time - start_time) * step
                time_movements[end_index + offset] += end_share
                time_movements[start_index + offset] += step - end_share

    ts = pd.Series(time_movements, index=pd.date_range(
        '01/01/2014', periods=len(time_movements),
        freq=str(bin_width) + 'min'))
    return ts