Ejemplo n.º 1
0
def vdjdb_beta(q=0, epitopes=False):
    """Return the unique beta-chain CDR3 entries parsed from VDJdb.

    Parameters
    ----------
    q : int, optional
        Forwarded unchanged to ``parse_vdjdb`` as its ``q`` argument
        (presumably a quality threshold -- confirm against ``parse_vdjdb``).
    epitopes : bool, optional
        When true, return a DataFrame with the unique ``CDR3``/``Epitope``
        pairs. When false (default), return only the unique ``CDR3`` values
        as a Series.
    """
    column_map = {'cdr3.beta': 'CDR3', 'antigen.epitope': 'Epitope'}
    records = parse_vdjdb(vdjdb_location, q=q)

    # Keep only complete, unique (CDR3, epitope) rows, then apply the
    # shorter column names.
    pairs = records[list(column_map)].dropna().drop_duplicates()
    pairs = pairs.rename(columns=column_map)

    if not epitopes:
        # One CDR3 may be listed with several epitopes; deduplicate again.
        return pairs["CDR3"].drop_duplicates()
    return pairs
Ejemplo n.º 2
0
def vdjdb_paired(q=0, epitopes=False):
    """Return the unique paired alpha/beta CDR3 entries parsed from VDJdb.

    Parameters
    ----------
    q : int, optional
        Forwarded unchanged to ``parse_vdjdb`` as its ``q`` argument
        (presumably a quality threshold -- confirm against ``parse_vdjdb``).
    epitopes : bool, optional
        When true, return a DataFrame with the unique
        ``CDR3_alpha``/``CDR3_beta``/``Epitope`` triples. When false
        (default), return only the unique ``CDR3_alpha``/``CDR3_beta`` pairs.
    """
    column_map = {
        'cdr3.alpha': 'CDR3_alpha',
        'cdr3.beta': 'CDR3_beta',
        'antigen.epitope': 'Epitope',
    }
    records = parse_vdjdb(vdjdb_location, q=q)

    # Rows missing either chain or the epitope are discarded by dropna().
    triples = records[list(column_map)].dropna().drop_duplicates()
    triples = triples.rename(columns=column_map)

    if not epitopes:
        # The same chain pair may be listed with several epitopes.
        return triples[['CDR3_alpha', 'CDR3_beta']].drop_duplicates()
    return triples
Ejemplo n.º 3
0
def vdjdb(q=0):
    """Return unique, complete (CDR3 beta, V beta, epitope) rows from VDJdb.

    ``q`` is forwarded unchanged to ``parse_vdjdb``. The original VDJdb
    column names (``cdr3.beta``, ``v.beta``, ``antigen.epitope``) are kept.
    """
    wanted = ["cdr3.beta", "v.beta", "antigen.epitope"]
    table = parse_vdjdb(vdjdb_location, q=q)
    complete_rows = table[wanted].dropna()
    return complete_rows.drop_duplicates()
Ejemplo n.º 4
0
def vdjdb_tcrdist(filename, q=0):
    """Parse a VDJdb file into unique beta-chain rows with renamed columns.

    Parameters
    ----------
    filename : str
        Path handed to ``parse_vdjdb``.
    q : int, optional
        Forwarded unchanged to ``parse_vdjdb`` as its ``q`` argument.

    Returns
    -------
    pandas.DataFrame
        Unique rows with no missing values and the columns ``cdr3_b_aa``
        (from ``cdr3.beta``) and ``v_b_gene`` (from ``v.beta``).
    """
    prepared_data = parse_vdjdb(filename, q=q)
    prepared_data = prepared_data[['cdr3.beta', 'v.beta']].dropna().drop_duplicates()
    # DataFrame.rename returns a new frame unless inplace=True; the result
    # must be kept, otherwise the original column names are returned.
    prepared_data = prepared_data.rename(columns={'cdr3.beta': 'cdr3_b_aa',
                                                  'v.beta': 'v_b_gene'})
    return prepared_data
Ejemplo n.º 5
0
def vdjdb_gliph2(filename, q=0):
    """Parse a VDJdb file into unique beta-chain rows with renamed columns.

    Parameters
    ----------
    filename : str
        Path handed to ``parse_vdjdb``.
    q : int, optional
        Forwarded unchanged to ``parse_vdjdb`` as its ``q`` argument.

    Returns
    -------
    pandas.DataFrame
        Unique rows with no missing values and the columns ``CDR3``
        (from ``cdr3.beta``) and ``V`` (from ``v.beta``).
    """
    prepared_data = parse_vdjdb(filename, q=q)
    prepared_data = prepared_data[['cdr3.beta', 'v.beta']].dropna().drop_duplicates()
    # DataFrame.rename returns a new frame unless inplace=True; the result
    # must be kept, otherwise the original column names are returned.
    prepared_data = prepared_data.rename(columns={'cdr3.beta': 'CDR3',
                                                  'v.beta': 'V'})
    return prepared_data
Ejemplo n.º 6
0
def TCRDist(data=None, chain='beta', sparse=False):
    '''
    Calculate distances between TCR sequences, using the TCRDist metric
    described in Dash et al. 2017 Nature.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Input dataframe containing information about
        CDR3 sequence, V gene and antigen specificity, using the VDJdb
        column names ('cdr3.<chain>', 'v.<chain>', 'antigen.epitope').
        The default is None, in which case the bundled VDJdb file is
        parsed with q=1.
    chain : str, optional
        TCR chain: 'alpha' or 'beta'. The default is 'beta'.
        Paired-chain input is not implemented.
    sparse : Bool, optional
        Turn on sparse distance computation. The default is False.

    Returns
    -------
    S : distance matrix
        TCRDist distances as exposed by the TCRrep object: the pairwise
        CDR3 matrix (``pw_cdr3_a_aa`` / ``pw_cdr3_b_aa``) in dense mode,
        or the sparse rectangular matrix (``rw_alpha`` / ``rw_beta``)
        when ``sparse`` is true.
    seq : pandas.DataFrame
        Unique CDR3 / V gene combinations for which distances have been
        calculated (index reset).
    gt : pandas.DataFrame
        Ground truth. pandas.DataFrame containing information about the
        TCR sequence and its cognate epitope target
        (columns 'CDR3', 'V', 'Epitope').

    Raises
    ------
    ValueError
        If ``chain`` is neither 'alpha' nor 'beta'.

    '''
    # Resolve the chain first so an unsupported value fails with a clear
    # message instead of an unbound-variable error further down.
    if chain == 'beta':
        cdr3 = 'cdr3_b_aa'
        v_name = 'v_b_gene'
        other_chain_columns = ['cdr3.alpha', 'v.alpha']
    elif chain == 'alpha':
        cdr3 = 'cdr3_a_aa'
        v_name = 'v_a_gene'
        other_chain_columns = ['cdr3.beta', 'v.beta']
    else:
        raise ValueError(
            "Unsupported chain %r: expected 'alpha' or 'beta'." % (chain,))

    if data is None:
        vdjdb = parse_vdjdb(
            os.path.abspath('./clustcr/input/vdjdb/vdjdb_full.txt'), q=1)
    else:
        vdjdb = data

    # Discard the opposite chain; tolerate input that never had it.
    vdjdb = vdjdb.drop(columns=other_chain_columns, errors='ignore')
    vdjdb = vdjdb.rename(columns={'cdr3.' + chain: cdr3,
                                  'v.' + chain: v_name})

    df_epi = vdjdb[[cdr3, v_name,
                    'antigen.epitope']].dropna().drop_duplicates()
    seq = df_epi.drop(
        columns=['antigen.epitope']).drop_duplicates().reset_index(drop=True)

    gt = df_epi.rename(columns={
        cdr3: 'CDR3',
        v_name: 'V',
        'antigen.epitope': 'Epitope'
    })

    if sparse:

        # Use the requested chain here as well; the columns in `seq` are
        # named for that chain only.
        tr = TCRrep(cell_df=seq,
                    organism='human',
                    chains=[chain],
                    db_file='alphabeta_gammadelta_db.tsv',
                    compute_distances=False)

        tr.cpus = 2
        tr.compute_sparse_rect_distances(radius=200, chunk_size=500)
        # rw_alpha / rw_beta, depending on the chain.
        S = getattr(tr, 'rw_' + chain)

    else:

        tr = TCRrep(cell_df=seq,
                    organism='human',
                    chains=[chain],
                    db_file='alphabeta_gammadelta_db.tsv',
                    compute_distances=True)

        # pw_cdr3_a_aa / pw_cdr3_b_aa, depending on the chain.
        S = getattr(tr, 'pw_' + cdr3)

    return S, seq, gt