Python svg_logoの例、palmotif.svg_logo Pythonの例

コード例 #1

0

ファイルを表示

ファイル: public.py プロジェクト: xzhan50/tcrdist3

def make_motif_logo_from_index(tcrsampler,
                               ind,
                               clone_df,
                               centroid,
                               cdr3_name='cdr3_b_aa',
                               v_name='v_b_gene',
                               gene_names=['v_b_gene', 'j_b_gene']):
    """
    Build motif logos for the clones found at the given positional index.

    Parameters
    ----------
    tcrsampler : object exposing ``sample(gene_usage, flatten=..., depth=...)``
    ind : positional row selector passed to ``clone_df.iloc``
    clone_df : pd.DataFrame
    centroid : str
        CDR3 sequence the motif is aligned to.
    cdr3_name : str
        Column holding the CDR3 amino-acid sequences.
    v_name : str
        Not used by this function.
    gene_names : list
        Two column names (V gene, J gene) used to tabulate gene usage.

    Returns
    -------
    svg : str
        Logo computed against the sampled reference sequences.
    svg_raw : str
        Logo computed without reference sequences.
    """
    node = clone_df.iloc[ind, :].copy()

    # Pass both gene columns through allele_01 before counting usage.
    for gene_col in (gene_names[0], gene_names[1]):
        node[gene_col] = node[gene_col].apply(allele_01)

    usage_rows = node.groupby(gene_names).size().reset_index().to_dict(
        'split')['data']

    # The sampler may yield None entries; drop them.
    background = [
        seq
        for seq in tcrsampler.sample(usage_rows, flatten=True, depth=100)
        if seq is not None
    ]

    node_seqs = node[cdr3_name]

    motif, stat = compute_pal_motif(seqs=node_seqs,
                                    refs=background,
                                    centroid=centroid)
    svg = svg_logo(motif, return_str=True)

    motif_raw, _ = compute_pal_motif(seqs=node_seqs, centroid=centroid)
    svg_raw = svg_logo(motif_raw, return_str=True)

    return svg, svg_raw

コード例 #2

0

ファイルを表示

ファイル: TCRdistMotifDiscovery.py プロジェクト: uio-bmi/immuneML

    def _discover_motif_in_cluster(self, tcr_rep, index, row, negative_examples=None) -> Tuple[List[ReportOutput], List[ReportOutput]]:
        """
        Compute and store a motif logo and motif table per chain for one cluster.

        For each of the chains 'a' and 'b', the CDR3 sequences of the cluster
        members (``row['neighbors_i']``) are aligned to a centroid sequence.
        The logo is written to ``motif_<chain>_<index + 1>.svg`` and the motif
        matrix to ``motif_<chain>_<index + 1>.csv`` under ``self.result_path``.
        ``negative_examples[chain]`` is passed as reference sequences only
        when ``self.use_reference_sequences`` is truthy.

        Returns
        -------
        (figure_outputs, table_outputs) : two lists of ReportOutput
        """
        from tcrdist.adpt_funcs import get_centroid_seq
        from tcrdist.summarize import _select

        from palmotif import compute_pal_motif
        from palmotif import svg_logo

        dfnode = tcr_rep.clone_df.iloc[row['neighbors_i'],]
        figure_outputs = []
        table_outputs = []

        logging.info(f"{TCRdistMotifDiscovery.__name__}: in cluster {index+1}, there are {dfnode.shape[0]} neighbors.")

        for chain in ['a', 'b']:
            cdr3_col = f'cdr3_{chain}_aa'

            # Clusters of one or two clones just use their first sequence.
            if dfnode.shape[0] > 2:
                centroid, *_ = get_centroid_seq(df=dfnode)
            else:
                centroid = dfnode[cdr3_col].to_list()[0]

            cluster_seqs = _select(df=tcr_rep.clone_df, iloc_rows=row['neighbors_i'], col=cdr3_col)
            references = negative_examples[chain] if self.use_reference_sequences else None
            motif, stat = compute_pal_motif(seqs=cluster_seqs, centroid=centroid, refs=references)

            figure_path = self.result_path / f"motif_{chain}_{index + 1}.svg"
            svg_logo(motif, filename=figure_path)

            motif_data_path = self.result_path / f"motif_{chain}_{index + 1}.csv"
            motif.to_csv(motif_data_path)

            chain_label = Chain.get_chain(chain.upper()).name.lower()
            figure_outputs.append(ReportOutput(figure_path, f'Motif {index + 1} ({chain_label} chain)'))
            table_outputs.append(ReportOutput(motif_data_path, f'motif {index + 1} ({chain_label} chain) csv data'))

        return figure_outputs, table_outputs

コード例 #3

0

ファイルを表示

ファイル: test_olga_synthetic.py プロジェクト: xzhan50/tcrdist3

def motif_creation_human_betas():
    """
    Write beta-chain motif logos for the ``dash_human.csv`` centroids to
    ``test_3.svg``.

    For every row of ``tr.centroids_df`` with at least five neighbors, three
    logos are written: the neighborhood against OLGA-generated references,
    the neighborhood without references, and the references themselves
    (plus the centroid). Each is followed by the string form of the row.

    Notes
    -----
    The reference CDR3s are generated from the V/J gene usage observed in
    the neighborhood (``gene_usages``). The previous version computed
    ``gene_usages`` but iterated over ``combos_alpha``, a name that is not
    defined in this function.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    import palmotif
    from tcrdist.pgen import OlgaModel
    from tcrdist.adpt_funcs import get_basic_centroids

    omb = OlgaModel(recomb_type="VDJ", chain_folder="human_T_beta")

    df = pd.read_csv("dash_human.csv")
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['alpha', 'beta'],
                db_file='alphabeta_gammadelta_db.tsv')

    get_basic_centroids(tr, max_dist=75)

    # Number of generated reference CDR3s per observed clone.
    depth = 3

    with open("test_3.svg", 'w') as oh:
        oh.write('<body>')
        for _, r in tr.centroids_df.iterrows():
            # NOTE(review): stops at the first small neighborhood, which
            # presumably relies on centroids_df being ordered by
            # neighborhood size - confirm, otherwise `continue` is needed.
            if len(r['neighbors']) < 5:
                break

            neighbors = tr.clone_df.iloc[r['neighbors'], ]
            seqs = neighbors['cdr3_b_aa'].to_list()
            # Rows of [v_gene, j_gene, count] for this neighborhood.
            gene_usages = neighbors[[
                'v_b_gene', 'j_b_gene'
            ]].value_counts().reset_index().to_dict('split')['data']

            refs = flatten([
                omb.gen_cdr3s(allele_01(v), allele_01(j), n * depth)
                for v, j, n in gene_usages
            ])
            refs = [x for x in refs if x is not None]

            matrix, stats = palmotif.compute_pal_motif(seqs=seqs,
                                                       refs=refs,
                                                       centroid=r['cdr3_b_aa'])
            matrix_raw, _ = palmotif.compute_pal_motif(seqs=seqs,
                                                       centroid=r['cdr3_b_aa'])
            # Background-only logo: the references aligned to the centroid.
            refs.append(r['cdr3_b_aa'])
            matrix_bkgd, _ = palmotif.compute_pal_motif(
                seqs=refs, centroid=r['cdr3_b_aa'])

            svgs = [
                palmotif.svg_logo(matrix, 'test.svg', return_str=True),
                palmotif.svg_logo(matrix_raw, 'test.svg', return_str=True),
                palmotif.svg_logo(matrix_bkgd, 'test.svg', return_str=True)
            ]

            for s in svgs:
                oh.write(f"{s}<div></div>\n")
            oh.write('<div></div>')
            oh.write(str(r))
            oh.write('<div></div>')

        oh.write('</body>')

コード例 #4

0

ファイルを表示

    def test_props_motif(self):
        """
        Smoke test: cluster the peptide data, attach a motif logo to every
        significant node and write the resulting tree to an HTML file.
        """
        dat, pw = generate_peptide_data()
        np.random.seed(110820)
        #pw = pw + np.random.rand(pw.shape[0])

        # pw = scipy.spatial.distance.squareform(distance.pdist(np.random.randn(dat.shape[0], 5))) + pw
        pw = pwsd.apply_pairwise_rect(metric=pwsd.metrics.hamming_distance,
                                      seqs1=dat['seq'])

        res, Z = hcluster_tally(dat,
                                pwmat=pw,
                                x_cols=['trait1'],
                                count_col='count',
                                method='complete')
        res = cluster_association_test(res, method='fishers')

        def _node_logo(node):
            # Only nodes below the significance cut-off get a logo.
            if not node['pvalue'] < 0.05:
                return ''
            motif = palmotif.compute_motif(
                dat['seq'].values[node['neighbors_i']])
            return palmotif.svg_logo(motif,
                                     return_str=True,
                                     return_html=False,
                                     svg_height='500px',
                                     svg_width='500px')

        logos = [_node_logo(node) for _, node in res.iterrows()]
        res = res.assign(motif=logos)

        html = plot_hclust_props(Z,
                                 title='test_props_motif',
                                 tooltip_cols=['motif'],
                                 res=res,
                                 alpha=0.05,
                                 alpha_col='pvalue')

        out_path = opj('hierdiff', 'tests', 'test_props_motif.html')
        with open(out_path, 'w', encoding='utf-8') as fh:
            fh.write(html)

        self.assertTrue(True)

コード例 #5

0

ファイルを表示

ファイル: test_gallery_hdiff.py プロジェクト: xzhan50/tcrdist3

def test_gallery_hdiff():
    """
    Gallery example: paired alpha-beta hierarchical-difference analysis.

    Loads ``dash.csv``, keeps 100 randomly chosen clones recognising the PA
    or PB1 epitopes, clusters them on the summed beta and alpha distances,
    attaches a beta and an alpha motif logo to every tree node and writes an
    interactive HTML dendrogram to ``hierdiff_example_PA_v_PB1.html``.

    All imports are gathered at the top of the function so the example can
    be copied as one unit.
    """
    import pandas as pd
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_diff import hcluster_diff, member_summ
    from tcrsampler.sampler import TCRsampler
    from tcrdist.adpt_funcs import get_centroid_seq
    from tcrdist.summarize import _select
    from palmotif import compute_pal_motif, svg_logo
    from hierdiff import plot_hclust_props

    # Keep only the clones annotated with the PA or PB1 epitope.
    df = pd.read_csv("dash.csv")
    is_pa_or_pb1 = df['epitope'].apply(lambda x: x in ['PA', 'PB1'])

    # For illustration/testing, subsample to 100 clones (fixed seed).
    # Increase for a more informative plot.
    df = df[is_pa_or_pb1].reset_index(drop=True)
    df = df.sample(100, random_state=3).reset_index(drop=True).copy()

    # Load the DataFrame into a TCRrep instance; the attributes
    # .clone_df, .pw_beta and .pw_alpha are used below.
    tr = TCRrep(cell_df=df,
                organism='mouse',
                chains=['beta', 'alpha'],
                db_file='alphabeta_gammadelta_db.tsv')

    # Hierarchical clustering on the paired distance. For a single chain,
    # pwmat could be tr.pw_beta or tr.pw_alpha instead.
    tr.hcluster_df, tr.Z = hcluster_diff(clone_df=tr.clone_df,
                                         pwmat=tr.pw_beta + tr.pw_alpha,
                                         x_cols=['epitope'],
                                         count_col='count')

    # Load mouse background samplers used to draw CDR3s matching the V and J
    # gene usage observed in each node. See the tcrsampler package:
    # https://github.com/kmayerb/tcrsampler/blob/master/docs/getting_default_backgrounds.md
    downloader = TCRsampler()
    downloader.download_background_file("ruggiero_mouse_sampler.zip")
    tcrsampler_beta = TCRsampler(
        default_background='ruggiero_mouse_beta_t.tsv.sampler.tsv')
    tcrsampler_alpha = TCRsampler(
        default_background='ruggiero_mouse_alpha_t.tsv.sampler.tsv')

    def _node_logos(sampler, letter, tag):
        # One SVG logo per tree node for the chain identified by <letter>
        # ('a' or 'b'), aligned to the node centroid.
        cdr3_col = f'cdr3_{letter}_aa'
        gene_cols = [f'v_{letter}_gene', f'j_{letter}_gene']
        logos = list()
        for _node_id, r in tr.hcluster_df.iterrows():
            dfnode = tr.clone_df.iloc[r['neighbors_i'],]
            if dfnode.shape[0] > 2:
                centroid, *_rest = get_centroid_seq(df=dfnode)
            else:
                centroid = dfnode[cdr3_col].to_list()[0]
            print(f"{tag}: {centroid}")

            usage = dfnode.groupby(gene_cols).size()
            sampled_rep = sampler.sample(
                usage.reset_index().to_dict('split')['data'],
                flatten=True,
                depth=10)
            sampled_rep = [x for x in sampled_rep if x is not None]

            motif, stat = compute_pal_motif(
                seqs=_select(df=tr.clone_df,
                             iloc_rows=r['neighbors_i'],
                             col=cdr3_col),
                refs=sampled_rep,
                centroid=centroid)
            logos.append(svg_logo(motif, return_str=True))
        return logos

    # Beta chain first, then alpha, each stored on hcluster_df.
    tr.hcluster_df['svg_beta'] = _node_logos(tcrsampler_beta, 'b',
                                             'BETA-CHAIN')
    tr.hcluster_df['svg_alpha'] = _node_logos(tcrsampler_alpha, 'a',
                                              'ALPHA-CHAIN')

    # Summary information for tooltips, e.g. the percentage of TCRs with a
    # given epitope at each node.
    res_summary = member_summ(res_df=tr.hcluster_df,
                              clone_df=tr.clone_df,
                              addl_cols=['epitope'])
    tr.hcluster_df_detailed = pd.concat([tr.hcluster_df, res_summary],
                                        axis=1)

    # Write the D3 HTML for the interactive dendrogram with the chosen tooltips.
    html = plot_hclust_props(
        tr.Z,
        title='PA Epitope Example',
        res=tr.hcluster_df_detailed,
        tooltip_cols=['cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'svg_alpha',
                      'svg_beta'],
        alpha=0.00001,
        colors=['blue', 'gray'],
        alpha_col='pvalue')

    with open('hierdiff_example_PA_v_PB1.html', 'w') as fh:
        fh.write(html)

コード例 #6

0

ファイルを表示

def test_quick_pipeline_with_fragmented_compute():
    """
    Worked example: how can tcrdist3 be used to test for TCRs that may be
    HLA restricted?

    Steps
    -----
    1. Load a MIRA beta-chain file into a TCRrep without computing distances.
    2. Compute a sparse pairwise matrix in fragments (out of memory).
    3. Label each clone as "B*07" or not and tally neighbors by that label.
    4. Test each neighborhood for association and keep the mostly
       non-redundant ones.
    5. For the first 25 neighborhoods, build a background-adjusted and a raw
       motif logo and write them, with a details table, to
       ``svgs_in_line.html``.
    """
    import ast
    import os
    import pandas as pd
    import numpy as np
    from scipy import sparse
    from tcrdist.repertoire import TCRrep
    from tcrdist.rep_funcs import compute_pw_sparse_out_of_memory
    from tcrdist.rep_funcs import compute_n_tally_out_of_memory
    from tcrdist.rep_diff import member_summ
    from tcrdist.summarize import test_for_almost_subsets, filter_is, filter_gt
    from tcrdist.summarize import _select
    from hierdiff.association_testing import cluster_association_test
    from tcrsampler.sampler import TCRsampler
    # See the palmotif docs (https://github.com/agartland/palmotif)
    from palmotif import compute_pal_motif, svg_logo

    f = 'mira_epitope_67_382_APHGVVFL_APHGVVFLHV_GVVFLHVTY_VVFLHVTYV.tcrdist3.csv'
    f = os.path.join('tcrdist', 'data', 'covid19', f)
    assert os.path.isfile(f)

    df = pd.read_csv(f)
    df = df[['subject', 'cell_type', 'v_b_gene', 'j_b_gene', 'cdr3_b_aa',
             'cdr3_b_nucseq', 'cohort', 'hla-a', 'hla-a_1', 'hla-b',
             'hla-b_1']]
    tr = TCRrep(cell_df=df,
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False,
                store_all_cdr=False)

    S, fragments = compute_pw_sparse_out_of_memory(tr=tr,
                                                   row_size=100,
                                                   pm_processes=2,
                                                   pm_pbar=True,
                                                   max_distance=1000,
                                                   matrix_name='rw_beta',
                                                   reassemble=True,
                                                   cleanup=False)

    # Flag clones whose donor carries B*07 on either hla-b column.
    tr.clone_df['B07'] = (tr.clone_df['hla-b'].str.startswith("B*07")
                          | tr.clone_df['hla-b_1'].str.startswith("B*07"))
    tr.clone_df['B07'] = [
        "B*07" if x else "NOTB*07 " for x in tr.clone_df['B07']
    ]

    #sparse.save_npz("S.npz", S)
    nn_tally_df_cohort = compute_n_tally_out_of_memory(fragments,
                                                       matrix_name="rw_beta",
                                                       pm_processes=6,
                                                       to_file=False,
                                                       to_memory=True,
                                                       knn_radius=25,
                                                       x_cols=['B07'])

    nn_associations = cluster_association_test(res=nn_tally_df_cohort,
                                               y_col='cmember',
                                               method='fishers')
    nn_associations = nn_associations.sort_values('pvalue', ascending=True)
    # The neighbors column holds a string literal; literal_eval turns it
    # back into a Python object.
    nn_associations['neighbors_i'] = nn_associations.neighbors.apply(
        ast.literal_eval)

    nn_associations['mostly_unique'] = test_for_almost_subsets(
        nn_associations['neighbors_i'], thr=5)
    nr_nn_associations = filter_is(nn_associations, 'mostly_unique', 1).copy()

    #nr_nn_associations = filter_gt(nr_nn_associations, 'K_neighbors', 25).copy()

    # MOTIF GENERATION
    t = TCRsampler()
    # Download the OLGA background only if it is not already available.
    if 'olga_human_beta_t.sampler.tsv' not in t.currently_available_backgrounds():
        t.download_background_file('olga_sampler.zip')
    tcrsampler_beta = TCRsampler(
        default_background='olga_human_beta_t.sampler.tsv')
    tcrsampler_beta.build_background(max_rows=1000)

    # Generate an SVG graphic for each selected neighborhood.
    cdr3_name = 'cdr3_b_aa'
    gene_names = ['v_b_gene', 'j_b_gene']
    svgs_beta = list()
    svgs_beta_raw = list()
    info_list = list()

    summary = member_summ(res_df=nr_nn_associations,
                          clone_df=tr.clone_df,
                          addl_cols=['cohort', 'hla-a', 'hla-a_1', 'hla-b',
                                     'hla-b_1', 'subject'])

    nr_nn_associations = pd.concat([nr_nn_associations, summary],
                                   axis=1).reset_index()

    for _, r in nr_nn_associations.head(25).iterrows():
        dfnode = tr.clone_df.iloc[r['neighbors_i'], :].copy()
        # <pwnode> Pairwise matrix restricted to the node sequences
        pwnode = S[r['neighbors_i'], :][:, r['neighbors_i']].todense()
        if dfnode.shape[0] > 2:
            # Centroid = member with the smallest summed distance.
            iloc_idx = pwnode.sum(axis=0).argmin()
            centroid = dfnode[cdr3_name].to_list()[iloc_idx]
        else:
            centroid = dfnode[cdr3_name].to_list()[0]

        print(f"CENTROID: {centroid}")

        gene_usage_beta = dfnode.groupby(gene_names).size()
        sampled_rep = tcrsampler_beta.sample(
            gene_usage_beta.reset_index().to_dict('split')['data'],
            flatten=True,
            depth=max(100, 1000 // dfnode.shape[0]))
        sampled_rep = [x for x in sampled_rep if x is not None]

        node_seqs = _select(df=tr.clone_df,
                            iloc_rows=r['neighbors_i'],
                            col=cdr3_name)

        motif, stat = compute_pal_motif(seqs=node_seqs,
                                        refs=sampled_rep,
                                        centroid=centroid)
        svgs_beta.append(svg_logo(motif, return_str=True))

        # The raw logo takes no references. The earlier
        # `sampled_rep = sampled_rep.append(centroid)` was dropped:
        # list.append returns None, so it only rebound the list to None.
        motif_raw, _ = compute_pal_motif(seqs=node_seqs, centroid=centroid)
        svgs_beta_raw.append(svg_logo(motif_raw, return_str=True))
        info_list.append(r)

    def row_to_string(r, vals=('ct_columns', 'val_0', 'ct_0', 'val_1', 'ct_1', 'val_2', 'ct_2', 'val_3', 'ct_3', 'levels', 'K_neighbors', 'R_radius', 'RR', 'OR', 'pvalue', 'FWERp', 'FDRq')):
        # Render the selected fields of a row as "<name> : <value>" lines.
        return "<br></br>".join([f"\t{v} : {r[v]}" for v in vals])

    table_cols = ['ct_columns', 'hla-a', 'hla-a_1', 'hla-b', 'hla-b_1', 'val_0', 'ct_0', 'val_2', 'ct_2', 'K_neighbors', 'R_radius', 'pvalue', 'FDRq', 'cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'cohort', 'subject']

    def to_html_table(r, vals=None):
        # One-row HTML table of the selected fields. `vals` must be a list:
        # it is used as a list-like selector on the row.
        vals = table_cols if vals is None else vals
        return pd.DataFrame(r[vals]).transpose().to_html()

    def shrink(html_str):
        # Scale the embedded SVG from 100% to 10% in both dimensions.
        return html_str.replace('height="100%"', 'height="10%"').\
            replace('width="100%"', 'width="10%"')

    with open('svgs_in_line.html', 'w') as fh:
        fh.write("<html><body>\n")

        for svg, svg_raw, details in zip(svgs_beta, svgs_beta_raw, info_list):
            fh.write(f"{shrink(svg_raw)}{shrink(svg)}")
            try:
                fh.write(to_html_table(details))
            except Exception as err:
                # Best effort: a row that cannot be tabulated is skipped,
                # but the reason is reported instead of being hidden.
                print(f"F: {err!r}")
            fh.write("<div></div>")
        # Close the tags in the reverse order of opening.
        fh.write("</body></html>\n")

コード例 #7

0

ファイルを表示

ファイル: public.py プロジェクト: xzhan50/tcrdist3

def make_motif_logo(tcrsampler,
                    clone_df,
                    pwmat,
                    centroid='CASSPDIEKYF',
                    v_gene='TRBV7-9*01',
                    radius=24,
                    pwmat_str='pw_delta',
                    cdr3_name='cdr3_d_aa',
                    v_name='v_d_gene',
                    gene_names=['v_d_gene', 'j_d_gene']):
    """
    Make motif logos from a TCRrep clone_df, a pairwise matrix and a
    tcrsampler.

    Parameters
    ----------
    tcrsampler : tcrsampler.TCRsampler
    clone_df : pd.DataFrame
    pwmat : np.array
    centroid : str
        e.g., 'CASSPDIEKYF'
    v_gene : str
        e.g., 'TRBV7-9*01'
    radius : int
        e.g., 26; clones whose distance to the centroid is <= radius are
        included.
    pwmat_str : str
        e.g., 'pw_delta' (not used by this function)
    cdr3_name : str
        e.g., 'cdr3_d_aa'
    v_name : str
        e.g., 'v_d_gene'
    gene_names : list
        e.g., ['v_d_gene', 'j_d_gene']

    Returns
    -------
    svg : str
        Logo computed against the sampled reference sequences.
    svg_raw : str
        Logo computed without reference sequences.

    Notes
    -----
    Safety is favoured over efficiency here: the neighbors are located again
    even though they may already be known. Because the row index <irow> is
    looked up as the first row matching both V gene and CDR3, the function
    can be called without knowing the positions of the neighbors ahead of
    time. This is particularly useful since clone_df order is not stable
    after groupby and deduplication.
    """
    # First row whose CDR3 and V gene both match the requested centroid.
    is_centroid = (clone_df[cdr3_name] == centroid) & (clone_df[v_name]
                                                       == v_gene)
    irow = clone_df[is_centroid].index[0]

    # All clones within <radius> of that row.
    within_radius = pd.Series(pwmat[irow, :]) <= radius
    dfnode = clone_df[within_radius].copy()

    for gene_col in (gene_names[0], gene_names[1]):
        dfnode[gene_col] = dfnode[gene_col].apply(allele_01)

    usage_rows = dfnode.groupby(gene_names).size().reset_index().to_dict(
        'split')['data']

    # The sampler may yield None entries; drop them.
    background = [
        seq
        for seq in tcrsampler.sample(usage_rows, flatten=True, depth=100)
        if seq is not None
    ]

    node_seqs = dfnode[cdr3_name]

    motif, stat = compute_pal_motif(seqs=node_seqs,
                                    refs=background,
                                    centroid=centroid)
    svg = svg_logo(motif, return_str=True)

    motif_raw, _ = compute_pal_motif(seqs=node_seqs, centroid=centroid)
    svg_raw = svg_logo(motif_raw, return_str=True)

    return svg, svg_raw

コード例 #8

0

ファイルを表示

ファイル: vdjdb_influenza_hierdiff_eg.py プロジェクト: agartland/ncov_epitopes

    
    print(f"ALPHA-CHAIN CENTROID: {centroid}")
    
    gene_usage_alpha = dfnode.groupby(gene_names).size()
    sampled_rep = tcrsampler_alpha.sample( gene_usage_alpha.reset_index().to_dict('split')['data'], 
                    flatten = True, depth = 10)
    
    sampled_rep  = [x for x in sampled_rep if x is not None]

    motif, stat = compute_pal_motif(
                    seqs = _select(df = tr.clone_df, 
                                   iloc_rows = r['neighbors_i'], 
                                   col = cdr3_name),
                    refs = sampled_rep, 
                    centroid = centroid)
    svgs_alpha.append(svg_logo(motif, return_str= True))

    sampled_rep = sampled_rep.append(centroid)
    motif_raw, _ = compute_pal_motif(
                seqs =_select(df = tr.clone_df, 
                               iloc_rows = r['neighbors_i'], 
                               col = cdr3_name),
                centroid = centroid)
    svgs_alpha_raw.append(svg_logo(motif_raw, return_str= True))  

"""Add Alpha SVG graphics to hcluster_df"""
tr.hcluster_df['svg_alpha'] = svgs_alpha
tr.hcluster_df['svg_alpha_raw'] = svgs_alpha_raw

"""
SUMMARIZE EACH NODE

コード例 #9

0

ファイルを表示

def _sample_node_background(sampler, gene_usage_tuples, depth, n_clones):
    """Sample background CDR3s for one node and summarise what came back.

    Returns a tuple ``(seqs, percent_missing, n_unique, n_total)`` where
    ``seqs`` is the list of non-None sampled sequences and the other three
    are display strings (``percent_missing`` carries a trailing '%').
    """
    import pandas as pd

    seqs = sampler.sample(gene_usage_tuples, flatten=True, depth=depth)
    # The sampler returns None when a v,j pair is absent from the background.
    seqs = [x for x in seqs if x is not None]

    expected_depth = n_clones * depth
    percent_missing = round(100 * (1 - (len(seqs) / expected_depth)), 1)

    as_series = pd.Series(seqs)
    return (seqs, f"{percent_missing}%", str(as_series.nunique()),
            str(as_series.count()))


def _tcrsampler_svgs(tcrrep,
                     default_background=None,
                     default_background_if_missing=None,
                     cdr3_name='cdr3_b_aa',
                     pwmat_str='pw_cdr3_b_aa',
                     chain='beta',
                     gene_names=['v_b_gene', 'j_b_gene'],
                     combine_olga=False,
                     verbose=True):
    """
    Add motif logos and background statistics for every cluster of a TCRrep.

    Given a TCRrep instance, this function samples a background repertoire
    using TCRsampler and makes svg logos using palmotif.

    The function modifies ``tcrrep`` in place: it adds a 'prune' column to
    ``.hcluster_df`` when missing, and adds svg and statistics columns to
    the ``.hcluster_df_detailed`` DataFrame. It returns True.
    TODO: could just output a dataframe to be concatenated by the caller.

    The default column names ONLY WORK WITH THE BETA CHAIN; pass
    ``cdr3_name``, ``pwmat_str``, ``chain`` and ``gene_names`` together for
    another chain.

    Parameters
    ----------
    tcrrep : TCRrep
        Must provide .organism, .clone_df, .hcluster_df,
        .hcluster_df_detailed and the attribute named by ``pwmat_str``.
    default_background, default_background_if_missing :
        Forwarded to the sampler constructor.
    cdr3_name : str
        Column in clone_df with the CDR3 sequences.
    pwmat_str : str
        Name of the pairwise-distance attribute on ``tcrrep``.
    chain : str
        Chain name; also used as suffix of the output columns.
    gene_names : list
        V and J gene columns used to tabulate gene usage.
    combine_olga : bool
        If True, sequences from an additional OLGA-based sampler are
        appended to the background. Forced to False for mouse alpha.
    verbose : bool
        If True, print progress messages.

    Notes
    -----
    TCRSampler.build_background() accepts kwargs; they are fixed here as
    most users won't need to change them.
        max_rows : int
            Maximum clones per v,j pair (per subject)
        stratify_by_subject : bool
            If True, max_rows will apply to v,j,subject. If False, max_rows
            applies to v,j
        use_frequency : bool
            If True, uses frequency for ranking rows. If False, uses raw
            counts.
        make_singleton : bool
            If True, background is still sorted by frequency or counts,
            but final frequency and counts values are overridden
            and set to 1.
    """
    from palmotif import compute_pal_motif, svg_logo
    from tcrdist.summarize import _select

    if chain == 'alpha' and tcrrep.organism == "mouse":
        # Here we enforce the rule that alpha-mouse cannot use an olga-sampler
        # TODO: This should be removed as soon as TCRsampler can be updated
        # with a valid mouse-alpha simulated background.
        combine_olga = False

    # _default_sampler returns a TCRSampler based on organism and chain
    if verbose:
        print(f"INITIALIZING A TCRSAMPLER")
        print(tcrrep.organism, chain)
    t = _default_sampler(organism=tcrrep.organism, chain=chain)(
        default_background=default_background,
        default_background_if_missing=default_background_if_missing)

    build_kargs = {
        'max_rows': 100,
        'stratify_by_subject': True,
        'use_frequency': True,
        'make_singleton': False
    }

    build_kargs_olga = {
        'max_rows': 1000,
        'stratify_by_subject': False,
        'use_frequency': True,
        'make_singleton': False
    }

    if verbose:
        print(f"BUILDING A DEEPER BACKGROUND {report_kwargs(build_kargs)}")

    t.build_background(**build_kargs)

    # Olga Sampler
    if combine_olga:
        t_olga = _default_sampler_olga(chain=chain, organism=tcrrep.organism)()
        if verbose:
            print(
                f"BUILDING A DEEPER BACKGROUND {report_kwargs(build_kargs_olga)}"
            )
        # Build the OLGA sampler itself. Previously this call went to <t>,
        # which overwrote the stratified background built above and left
        # <t_olga> without the requested settings.
        t_olga.build_background(**build_kargs_olga)

    if 'prune' not in tcrrep.hcluster_df.columns:
        if verbose: print("NO PRUNE COLUMNS USED ALL SET TO 0")
        tcrrep.hcluster_df['prune'] = 0

    if verbose:
        print("ITERATE THROUGH CLUSTERS")
    svgs = list()
    svgs_raw = list()
    reference_unique = list()
    reference_unique_olga = list()
    reference_size = list()
    reference_size_olga = list()
    percent_missing_sampler = list()
    percent_missing_sampler_olga = list()
    n_rows = tcrrep.hcluster_df.shape[0]

    bar = IncrementalBar(f'Make {chain} SVGs :',
                         max=n_rows,
                         suffix='%(percent)d%%')
    for i, r in tcrrep.hcluster_df.iterrows():
        bar.next()
        if r['prune'] == 0:
            # <dfnode> is dataframe with all the clones at a given tree node
            dfnode = tcrrep.clone_df.iloc[r['neighbors_i'], ].copy()
            # <pwnode> Pairwise Matrix for node sequences
            pwnode = getattr(
                tcrrep,
                pwmat_str)[r['neighbors_i'], :][:, r['neighbors_i']].copy()
            # Centroid = member with the smallest summed distance to the rest
            iloc_idx = pwnode.sum(axis=0).argmin()
            centroid = dfnode[cdr3_name].to_list()[iloc_idx]

            # Compute gene usage at the node, after converting to allele_01
            for gene_name in gene_names:
                dfnode[gene_name] = dfnode[gene_name].apply(allele_01)

            gene_usage = dfnode.groupby(
                gene_names).size()  # e.g., ['v_b_gene','j_b_gene']
            gene_usage_tuples = gene_usage.reset_index().to_dict(
                'split')['data']

            # Adjust depth for small nodes (never below 10)
            adjust_depth = max(10, 10 * round(10 / dfnode.shape[0]))
            sample_depth = adjust_depth * 10
            node_seqs = _select(df=tcrrep.clone_df,
                                iloc_rows=r['neighbors_i'],
                                col=cdr3_name)

            # Given gene usage, use <t> to get background seqs
            sampled_rep, pct_missing, n_unique, n_total = \
                _sample_node_background(t, gene_usage_tuples, sample_depth,
                                        dfnode.shape[0])
            percent_missing_sampler.append(pct_missing)
            reference_unique.append(n_unique)
            reference_size.append(n_total)

            if combine_olga:
                # Same sampling, drawn from the OLGA-based sampler
                sampled_rep_olga, pct_missing, n_unique, n_total = \
                    _sample_node_background(t_olga, gene_usage_tuples,
                                            sample_depth, dfnode.shape[0])
                percent_missing_sampler_olga.append(pct_missing)
                reference_unique_olga.append(n_unique)
                reference_size_olga.append(n_total)

                # HERE WE COMBINE INTO A SINGLE BACKGROUND:
                sampled_rep = sampled_rep + sampled_rep_olga

            # Get motif matrix and motif stats
            motif, stat = compute_pal_motif(seqs=node_seqs,
                                            refs=sampled_rep,
                                            centroid=centroid)
            svgs.append(svg_logo(motif, return_str=True))

            # Repeat without references
            raw_motif, raw_stat = compute_pal_motif(seqs=node_seqs,
                                                    centroid=centroid)
            # Convert the motif matrix into an svg_logo, append to list
            svgs_raw.append(svg_logo(raw_motif, return_str=True))
        else:
            # If prune column is not 0, skip sampling and logo generation
            svgs.append("PRUNE")
            svgs_raw.append("PRUNE")
            reference_size.append("PRUNE")
            reference_unique.append("PRUNE")
            percent_missing_sampler.append("PRUNE")
            percent_missing_sampler_olga.append("PRUNE")
            reference_unique_olga.append("PRUNE")
            reference_size_olga.append("PRUNE")

    bar.next()
    bar.finish()

    # The standard svg_ includes background, whereas raw has no background
    tcrrep.hcluster_df_detailed[f'svg_{chain}'] = svgs
    tcrrep.hcluster_df_detailed[f'svg_raw_{chain}'] = svgs_raw
    tcrrep.hcluster_df_detailed[f'ref_size_{chain}'] = reference_size
    tcrrep.hcluster_df_detailed[f'ref_unique_{chain}'] = reference_unique
    tcrrep.hcluster_df_detailed[
        f'percent_missing_{chain}'] = percent_missing_sampler
    if combine_olga:
        tcrrep.hcluster_df_detailed[
            f'ref_size_olga_{chain}'] = reference_size_olga
        tcrrep.hcluster_df_detailed[
            f'ref_unique_olga_{chain}'] = reference_unique_olga
        tcrrep.hcluster_df_detailed[
            f'percent_missing_olga_{chain}'] = percent_missing_sampler_olga

    return True