Example #1
0
def fit_curve(file_name, num_cdr3s, initial_param_guesses):
    '''
    Fit a two-piece curve to one sample's (x, y) data, print the fitted
    parameters and the R^2 score, then plot the data against the curve.

    file_name: passed (as both arguments) to
        data_utils.get_cdr3_counter_from_file.
    num_cdr3s: forwarded to the sample's get_x_y_floats.
    initial_param_guesses: starting values handed to curve_fit as p0, in
        the order (s, v_shift_left, piece_boundary, a, b, v_shift_right).

    Returns nothing; results are printed and plotted.
    '''
    counter = data_utils.get_cdr3_counter_from_file(file_name, file_name)
    x, y = counter.get_x_y_floats(num_cdr3s)

    def two_piece_model(
            x, s, v_shift_left, piece_boundary, a, b, v_shift_right):
        ''' model evaluated piecewise on either side of piece_boundary '''

        def left_piece(values):
            # NOTE(review): if zetac is scipy.special.zetac it evaluates
            # zeta(s) - 1 rather than zeta(s) -- confirm that this is the
            # intended normalizer.
            return (values**-s) / zetac(s) + v_shift_left

        def right_piece(values):
            return a * values**b + v_shift_right

        below_boundary = x < piece_boundary
        at_or_above_boundary = x >= piece_boundary
        return np.piecewise(
            x,
            [below_boundary, at_or_above_boundary],
            [left_piece, right_piece],
        )

    # only the first element of curve_fit's result is used
    fit_result = curve_fit(two_piece_model, x, y, p0=initial_param_guesses)
    fitted_params = fit_result[0]
    print('fitted params:', fitted_params)

    # goodness of fit on the original sample points
    predicted = two_piece_model(x, *fitted_params)
    score = r2_score(y_true=y, y_pred=predicted)
    print('R^2 score:', score)

    # evaluate the fitted model on 100 evenly spaced points in [1, len(x)]
    x_dense = np.linspace(1, len(x), 100)
    y_dense = two_piece_model(x_dense, *fitted_params)
    plot.scatter_and_func(x, y, x_dense, y_dense)
def find_common_cdr3s(file_names, n=None):
    '''
    user facing function

    Load one sample per entry of file_names, pass them to get_common_cdr3s
    with limit=n, and print whatever it returns.
    '''
    samples = []
    for name in file_names:
        samples.append(data_utils.get_cdr3_counter_from_file(name, name))
    common = get_common_cdr3s(samples, limit=n)
    print(common)
def cluster_cdr3s(file_name, dist_func, n_gram_len=1, n=25):
    '''
    user facing function

    Load the sample in file_name, take its sorted CDR3s (limit=n) and hand
    them to hierarchical_clustering_cdr3s together with dist_func wrapped
    by on_n_gram(n_gram_len).
    '''
    counter = data_utils.get_cdr3_counter_from_file('s', file_name)
    top_cdr3s = counter.get_sorted_cdr3s(limit=n)
    # on_n_gram(n_gram_len) yields a wrapper that is applied to dist_func
    wrap_with_n_grams = on_n_gram(n_gram_len)
    wrapped_dist_func = wrap_with_n_grams(dist_func)
    hierarchical_clustering_cdr3s(cdr3s=top_cdr3s, dist_func=wrapped_dist_func)
def get_n_closest_cdr3s(file_name, cdr3, n, dist_func, n_gram_len=1):
    '''
    user facing function

    Load the sample in file_name and pretty-print the result of
    get_closest_cdr3s_with_frequency for cdr3, using dist_func wrapped by
    on_n_gram(n_gram_len) and limit=n.
    '''
    counter = data_utils.get_cdr3_counter_from_file('s', file_name)
    # on_n_gram(n_gram_len) yields a wrapper that is applied to dist_func
    wrap_with_n_grams = on_n_gram(n_gram_len)
    wrapped_dist_func = wrap_with_n_grams(dist_func)
    nearest = get_closest_cdr3s_with_frequency(
        cdr3=cdr3, sample=counter, dist_func=wrapped_dist_func, limit=n)
    pprint(nearest)
def preprocess_X(X, cdr3_limit):
    '''
    preprocess X

    X: iterable of file names; each is loaded with
        data_utils.get_cdr3_counter_from_file.
    cdr3_limit: passed as limit to each loaded sample's get_sorted_sample.

    Returns a dict mapping 0, 1, 2, ... to the items produced by
    Sample.weak_intersection over the limited samples.
    '''
    # first pass: load every file
    loaded = []
    for name in X:
        loaded.append(data_utils.get_cdr3_counter_from_file(name, name))

    # second pass: keep only the top cdr3_limit entries of each sample
    limited = []
    for counter in loaded:
        limited.append(counter.get_sorted_sample(limit=cdr3_limit))

    # weak intersection of CDR3s across the samples
    trimmed = Sample.weak_intersection(limited)

    # integer-keyed dict, a layout compatible with a Pandas DataFrame
    return dict(enumerate(trimmed))
Example #6
0
def calculate_one(cdr3s, file_names, show_legend=True):
    '''
    Run calculate_lifespan for cdr3s over the given files, log the result
    and plot it with plot.lifespan_graph.

    cdr3s: forwarded to calculate_lifespan and to the plot.
    file_names: each entry is loaded as a sample and also parsed for a date.
    show_legend: forwarded to plot.lifespan_graph.
    '''
    # one sample per file
    counters = []
    for name in file_names:
        counters.append(data_utils.get_cdr3_counter_from_file(name, name))

    # one date per file, taken from the file name
    dates = list(map(data_utils.get_date_from_file_name, file_names))

    x, ys = calculate_lifespan(cdr3s, dates, counters)

    # report and draw
    short_log.info(f'cdr3s={cdr3s}, x={x}, ys={ys}, fnames={file_names}')
    plot.lifespan_graph(cdr3s, x, ys, show_legend)
Example #7
0
def main():
    '''
    Run calculate_one on the CDR3s at ranks 1 through 4 of a fixed
    reference sample, over a fixed list of files passed through to_beta.

    Returns the string 'done'.
    '''
    reference = data_utils.get_cdr3_counter_from_file(
        's', 'cdr3.b.A_2019_2020_d_00_20857.ann')
    ranked_cdr3s = [reference.get_cdr3_by_rank(rank) for rank in range(1, 5)]

    listed_files = [
        'cdr3.a.A_2017_2018_d_00_53535.ann',
        'cdr3.a.A_2017_2018_d_07_11143.ann',
        'cdr3.a.A_2017_2018_d_28_44887.ann',
        'cdr3.a.A_2017_2018_m_04_73516.ann',
        'cdr3.a.A_2019_2020_d_00_20857.ann',
    ]
    converted_files = to_beta(listed_files)

    calculate_one(
        cdr3s=ranked_cdr3s,
        file_names=converted_files,
        show_legend=True,
    )
    return 'done'
Example #8
0
def main():
    '''
    Fit (x**-a) / zetac(a) + v_shift to the (x, y) data of the sample in
    FILE_NAME (using get_x_y(20)) and plot the data with the fitted curve.
    '''
    source_file = FILE_NAME
    sample = data_utils.get_cdr3_counter_from_file(source_file, source_file)
    x, y = sample.get_x_y(20)

    def shifted_model(x, a, v_shift):
        ''' function handed to curve_fit '''
        return (x**-a) / zetac(a) + v_shift

    # initial guesses: a = 6.1, v_shift = smallest observed y
    start = [6.1, min(y)]
    fit_result = curve_fit(shifted_model, x, y, p0=start)
    fitted_params = fit_result[0]

    # evaluate the fitted model on 100 evenly spaced points in [1, len(x)]
    x_dense = np.linspace(1, len(x), 100)
    y_dense = shifted_model(x_dense, *fitted_params)
    plot.scatter_and_func(x, y, x_dense, y_dense)
Example #9
0
def main():
    '''
    Fit a * x**b + c to the (x, y) data of a fixed sample file (using
    get_x_y(500)), print the fitted parameters and the R^2 score, and plot
    the data with the fitted curve.
    '''
    source_file = 'cdr3.a.A_2017_2018_d_00_53535.ann'
    sample = data_utils.get_cdr3_counter_from_file(source_file, source_file)
    x, y = sample.get_x_y(500)

    def power_law(x, a, b, c):
        ''' function handed to curve_fit '''
        return a * x**b + c

    # initial guesses: a = 60000, b = -1.1, c = smallest observed y
    start = [60000, -1.1, min(y)]
    fit_result = curve_fit(power_law, x, y, p0=start)
    fitted_params = fit_result[0]
    print('fitted params:', fitted_params)

    # goodness of fit on the original sample points
    predicted = power_law(x, *fitted_params)
    score = r2_score(y_true=y, y_pred=predicted)
    print('R^2 score:', score)

    # evaluate the fitted model on 100 evenly spaced points in [1, len(x)]
    x_dense = np.linspace(1, len(x), 100)
    y_dense = power_law(x_dense, *fitted_params)
    plot.scatter_and_func(x, y, x_dense, y_dense)