def fit_curve(file_name, num_cdr3s, initial_param_guesses):
    '''
    Fit a two-piece curve to one sample's (x, y) data, report the fit and
    plot it.

    The sample is loaded from `file_name`; x and y come from
    `sample.get_x_y_floats(num_cdr3s)`. The fitted parameters and the R^2
    score are printed, then the data and the fitted curve are passed to
    `plot.scatter_and_func`.

    initial_param_guesses: starting values handed to `curve_fit` as `p0`,
        in the order (s, v_shift_left, piece_boundary, a, b, v_shift_right).
    '''
    # load the sample and extract the points to fit
    sample = data_utils.get_cdr3_counter_from_file(file_name, file_name)
    x, y = sample.get_x_y_floats(num_cdr3s)

    def model(x, s, v_shift_left, piece_boundary, a, b, v_shift_right):
        ''' piecewise model: one formula on each side of piece_boundary '''

        def left_piece(x):
            # NOTE(review): zetac is presumably scipy.special.zetac
            # (zeta(s) - 1) -- confirm this normalisation is intended.
            return (x**-s) / zetac(s) + v_shift_left

        def right_piece(x):
            return a * x**b + v_shift_right

        conditions = [x < piece_boundary, x >= piece_boundary]
        return np.piecewise(x, conditions, [left_piece, right_piece])

    # fit the model; only the first element of curve_fit's result is used
    fit_result = curve_fit(model, x, y, p0=initial_param_guesses)
    fitted_params = fit_result[0]
    print('fitted params:', fitted_params)

    # goodness of fit on the fitted points
    y_fitted = model(x, *fitted_params)
    score = r2_score(y_true=y, y_pred=y_fitted)
    print('R^2 score:', score)

    # evaluate the fitted curve on 100 evenly spaced points in [1, len(x)]
    x_dense = np.linspace(1, len(x), 100)
    plot.scatter_and_func(x, y, x_dense, model(x_dense, *fitted_params))
def find_common_cdr3s(file_names, n=None):
    '''
    User facing function: load one sample per file name and print the
    result of `get_common_cdr3s` for them.

    n: forwarded to `get_common_cdr3s` as `limit` (None by default).
    '''
    samples = []
    for name in file_names:
        # the file name is passed for both arguments of the loader
        samples.append(data_utils.get_cdr3_counter_from_file(name, name))

    common = get_common_cdr3s(samples, limit=n)
    print(common)
def cluster_cdr3s(file_name, dist_func, n_gram_len=1, n=25):
    '''
    User facing function: run `hierarchical_clustering_cdr3s` on the CDR3s
    of a single sample file.

    dist_func: distance function; it is wrapped with
        `on_n_gram(n_gram_len)` before being used.
    n: forwarded to `sample.get_sorted_cdr3s` as `limit`.
    '''
    # load the sample under the fixed name 's'
    sample = data_utils.get_cdr3_counter_from_file('s', file_name)
    selected_cdr3s = sample.get_sorted_cdr3s(limit=n)

    # wrap the distance function with the n-gram decorator
    n_gram_wrapper = on_n_gram(n_gram_len)
    wrapped_dist = n_gram_wrapper(dist_func)

    hierarchical_clustering_cdr3s(cdr3s=selected_cdr3s, dist_func=wrapped_dist)
def get_n_closest_cdr3s(file_name, cdr3, n, dist_func, n_gram_len=1):
    '''
    User facing function: pretty-print the result of
    `get_closest_cdr3s_with_frequency` for `cdr3` within one sample file.

    n: forwarded as `limit`.
    dist_func: distance function; it is wrapped with
        `on_n_gram(n_gram_len)` before being used.
    '''
    # load the sample under the fixed name 's'
    sample = data_utils.get_cdr3_counter_from_file('s', file_name)

    # wrap the distance function with the n-gram decorator
    n_gram_wrapper = on_n_gram(n_gram_len)
    wrapped_dist = n_gram_wrapper(dist_func)

    neighbours = get_closest_cdr3s_with_frequency(
        cdr3=cdr3,
        sample=sample,
        dist_func=wrapped_dist,
        limit=n,
    )
    pprint(neighbours)
def preprocess_X(X, cdr3_limit):
    '''
    Turn a sequence of sample file names into an index-keyed dictionary of
    trimmed samples.

    X: iterable of file names.
    cdr3_limit: forwarded to `get_sorted_sample` as `limit`.

    Returns a dict mapping 0, 1, 2, ... to the items produced by
    `Sample.weak_intersection`, intended to be compatible with a Pandas
    DataFrame.
    '''
    # load one counter per file name
    loaded = []
    for name in X:
        loaded.append(data_utils.get_cdr3_counter_from_file(name, name))

    # keep only the top `cdr3_limit` entries of each sample
    top_samples = []
    for counter in loaded:
        top_samples.append(counter.get_sorted_sample(limit=cdr3_limit))

    # weak intersection of CDR3s across the samples
    trimmed = Sample.weak_intersection(top_samples)

    # key each trimmed sample by its position
    return dict(enumerate(trimmed))
def calculate_one(cdr3s, file_names, show_legend=True):
    '''
    Run `calculate_lifespan` for `cdr3s` over the given sample files, then
    log and plot the result.

    file_names: each one is loaded as a counter and is also parsed for a
        date with `data_utils.get_date_from_file_name`.
    show_legend: forwarded to `plot.lifespan_graph`.
    '''
    # load every sample first ...
    counters = []
    for name in file_names:
        counters.append(data_utils.get_cdr3_counter_from_file(name, name))

    # ... then derive one date per file name
    dates = []
    for name in file_names:
        dates.append(data_utils.get_date_from_file_name(name))

    x, ys = calculate_lifespan(cdr3s, dates, counters)

    # report the results
    short_log.info(f'cdr3s={cdr3s}, x={x}, ys={ys}, fnames={file_names}')
    plot.lifespan_graph(cdr3s, x, ys, show_legend)
def main():
    '''
    Run `calculate_one` for the CDR3s ranked 1 to 4 in a fixed reference
    sample, across a fixed list of sample files passed through `to_beta`.

    Returns the string 'done'.
    '''
    reference = data_utils.get_cdr3_counter_from_file(
        's', 'cdr3.b.A_2019_2020_d_00_20857.ann')

    # CDR3s at ranks 1, 2, 3 and 4 of the reference sample
    ranked_cdr3s = []
    for rank in range(1, 5):
        ranked_cdr3s.append(reference.get_cdr3_by_rank(rank))

    sample_files = to_beta([
        'cdr3.a.A_2017_2018_d_00_53535.ann',
        'cdr3.a.A_2017_2018_d_07_11143.ann',
        'cdr3.a.A_2017_2018_d_28_44887.ann',
        'cdr3.a.A_2017_2018_m_04_73516.ann',
        'cdr3.a.A_2019_2020_d_00_20857.ann',
    ])

    calculate_one(
        cdr3s=ranked_cdr3s,
        file_names=sample_files,
        show_legend=True,
    )
    return 'done'
def main():
    '''
    Fit `(x**-a) / zetac(a) + v_shift` to the data of the sample in
    FILE_NAME and plot the data together with the fitted curve.

    x and y come from `sample.get_x_y(20)`.
    '''
    # load the sample; the file name is passed for both loader arguments
    sample = data_utils.get_cdr3_counter_from_file(FILE_NAME, FILE_NAME)
    x, y = sample.get_x_y(20)

    def model(x, a, v_shift):
        # NOTE(review): zetac is presumably scipy.special.zetac
        # (zeta(a) - 1) -- confirm this normalisation is intended.
        return (x**-a) / zetac(a) + v_shift

    # start the fit from a = 6.1 and the smallest observed y as the shift
    start = [6.1, min(y)]
    fit_result = curve_fit(model, x, y, p0=start)
    fitted_params = fit_result[0]

    # evaluate the fitted curve on 100 evenly spaced points in [1, len(x)]
    x_dense = np.linspace(1, len(x), 100)
    plot.scatter_and_func(x, y, x_dense, model(x_dense, *fitted_params))
def main():
    '''
    Fit `a * x**b + c` to the data of a fixed sample file, print the fitted
    parameters and the R^2 score, and plot the data with the fitted curve.

    x and y come from `sample.get_x_y(500)`.
    '''
    # load the sample; the file name is passed for both loader arguments
    file_name = 'cdr3.a.A_2017_2018_d_00_53535.ann'
    sample = data_utils.get_cdr3_counter_from_file(file_name, file_name)
    x, y = sample.get_x_y(500)

    def power_law(x, a, b, c):
        return a * x**b + c

    # start the fit from fixed guesses, with the smallest y as the offset
    start = [60000, -1.1, min(y)]
    fit_result = curve_fit(power_law, x, y, p0=start)
    fitted_params = fit_result[0]
    print('fitted params:', fitted_params)

    # goodness of fit on the fitted points
    y_fitted = power_law(x, *fitted_params)
    score = r2_score(y_true=y, y_pred=y_fitted)
    print('R^2 score:', score)

    # evaluate the fitted curve on 100 evenly spaced points in [1, len(x)]
    x_dense = np.linspace(1, len(x), 100)
    plot.scatter_and_func(x, y, x_dense, power_law(x_dense, *fitted_params))