def detect_fake_chains(clones_file, Achain='A', Bchain='B'):
    """Return the labels of chains whose sequence data is constant.

    Reads the ``va_gene``, ``cdr3a``, ``vb_gene`` and ``cdr3b`` columns of
    *clones_file* and reports a chain when the file contains exactly one
    distinct (V gene, CDR3) pair for it. An empty file yields zero distinct
    pairs and is therefore not reported; a file with a single row is
    reported for both chains.

    Args:
        clones_file: Path passed through to ``parse_tsv.parse_tsv_file``.
        Achain: Label added to the result when the ``va_gene``/``cdr3a``
            pair is constant across all rows.
        Bchain: Label added to the result when the ``vb_gene``/``cdr3b``
            pair is constant across all rows.

    Returns:
        A list holding ``Achain`` and/or ``Bchain`` (in that order), or an
        empty list when neither chain looks fake.

    Side effects:
        Prints a one-line notice to stdout when the result is non-empty.
    """
    tcrs = parse_tsv.parse_tsv_file(
        clones_file,
        key_fields=[],
        store_fields=['va_gene', 'cdr3a', 'vb_gene', 'cdr3b'])

    # (label, V-gene column, CDR3 column); the column positions follow the
    # order of store_fields above.
    chain_columns = ((Achain, 0, 1), (Bchain, 2, 3))

    fake_chains = []
    for label, v_col, cdr3_col in chain_columns:
        distinct_pairs = {(row[v_col], row[cdr3_col]) for row in tcrs}
        if len(distinct_pairs) == 1:
            fake_chains.append(label)

    if fake_chains:
        # Single-argument parenthesised form: prints the same text whether
        # `print` is the Python 2 statement or the Python 3 function.
        print('Fake sequence data detected for chains: {}'.format(
            ' '.join(fake_chains)))
    return fake_chains
clones_file_with_nbrdists = '{}_nbrdists.tsv'.format(clones_file[:-4]) header = open(clones_file_with_nbrdists, 'r').readline()[:-1].split('\t') nbrdist_tags = [ x for x in header if x in rand_nbrdist_tags and ('wtd' in x or include_non_wtd) ] nbrdist_tags.sort() num_nbrdist_tags = len(nbrdist_tags) Log('parsing {} for {} nbrdist_tags'.format(clones_file_with_nbrdists, num_nbrdist_tags)) tcr_fields = nbrdist_tags + ['va_genes', 'vb_genes', 'cdr3a', 'cdr3b'] all_tcrs = parse_tsv.parse_tsv_file(clones_file_with_nbrdists, ['epitope'], tcr_fields) ## look for cross-reactive tcrs for e in all_tcrs: new_tcrs = [] for l in all_tcrs[e]: new_tcrs.append( [(nbrdist_rescale * float(x)) for x in l[:num_nbrdist_tags]] + [ frozenset([ cdr3s_human.all_loopseq_representative[organism][y] for y in l[-4].split(';') ]), frozenset([ cdr3s_human.all_loopseq_representative[organism][y] for y in l[-3].split(';') ]), l[-2], l[-1], False
# Last command-line option declarations, followed by plotting setup and
# loading of the clones (and optional motifs) tables.
# NOTE(review): `p` is bound before this fragment (presumably an argument
# parser); if these three calls sit inside an enclosing block in the full
# file, restore that block's indentation.
p.flag('showmotifs')
p.flag('use_tsne')
# The 'epitopes' value is split on whitespace into a list.
p.multiword('epitopes').cast(lambda x: x.split())

# Default the PNG prefix to the clones file name minus its last four
# characters (presumably the ".tsv" extension -- confirm).
if pngfile_prefix is None:
    pngfile_prefix = clones_file[:-4]

import matplotlib
matplotlib.rcParams['mathtext.default'] = 'regular'
# Unless `show` is set, select the non-interactive Agg backend; this is done
# before pyplot is imported.
if not show:
    matplotlib.use('Agg')
import matplotlib.pyplot as plt

# Mathtext labels for the chain letters 'a' and 'b'.
greek_ab = {'a': r'$\alpha$', 'b': r'$\beta$'}

# Load the clone_id column keyed by epitope. The trailing True is a
# positional argument of parse_tsv_file (presumably save_l -- confirm).
all_tcrs = parse_tsv.parse_tsv_file(clones_file, ['epitope'], ['clone_id'], True)

if showmotifs:
    # The motifs table is expected next to the clones file.
    motifs_file = clones_file[:-4] + '_motifs.tsv'
    if not exists(motifs_file):
        # No motifs file: switch the option off instead of failing.
        showmotifs = False
    else:
        # NOTE(review): nesting reconstructed from flattened source; the
        # parse is taken to run only when the motifs file exists.
        assert exists(motifs_file)
        # Load the listed motif columns keyed by (epitope, chain).
        all_motifs = parse_tsv.parse_tsv_file(
            motifs_file, ['epitope', 'chain'], [
                'id', 'showmotif', 'chi_squared', 'matches_with_nbrs',
                'matches_with_nbrs_consensus', 'expected_fraction',
                'cluster_number', 'is_cluster_center', 'cluster_consensus'
            ])
# Resolve the output prefix, configure matplotlib, then load the clones file
# and its companion "<clones>_nbrdists.tsv" file.
if outfile_prefix is None:
    # Default: clones file name minus its last four characters.
    outfile_prefix = clones_file[:-4]

import matplotlib
matplotlib.rcParams['mathtext.default'] = 'regular'
# Select the Agg backend before pyplot is imported.
matplotlib.use('Agg')
import matplotlib.pyplot as plt

#recompute_nbrdists = True

clones_file_with_nbrdists = '{}_nbrdists.tsv'.format(clones_file[:-4])
assert exists(clones_file_with_nbrdists)

## read the epitope-specific TCRs
all_tcrs = parse_tsv.parse_tsv_file(
    clones_file, ['epitope', 'subject'],
    ['va_genes', 'vb_genes', 'cdr3a', 'cdr3b'],
    save_l=True)  ## last element will be the full parse_tsv_line dictionary

if not epitopes:
    # No epitopes requested: use every epitope in the clones file, sorted.
    # sorted() accepts both a key list (Python 2) and a key view (Python 3),
    # whereas slicing the result of keys() fails on Python 3.
    # NOTE(review): the sort is taken to apply only to this default list;
    # confirm against the original indentation.
    epitopes = sorted(all_tcrs.keys())

Log('reading {}'.format(clones_file_with_nbrdists))

# One column name per (epitope, chain) pair:
# "<epitope>_<chain>_wtd_nbrdist<percentile>".
nbrdist_tag_suffix = '_wtd_nbrdist{}'.format(nbrdist_percentile)
# Loop names x/y are kept unchanged: a Python 2 list comprehension rebinds
# them in the enclosing scope.
nbrdist_tags = [
    x + '_' + y + nbrdist_tag_suffix
    for x in epitopes
    for y in all_chains
]

all_nbrdists = parse_tsv.parse_tsv_file(
    clones_file_with_nbrdists, ['epitope', 'subject'], nbrdist_tags)
# NOTE(review): `ab` is bound before this fragment; if these two assignments
# belong to an enclosing loop in the full file, keep them inside it.
footnotes[ab + '_hydro1'] = "Mean CDR3-{} total hydrophobicity (GES scale)".format(ab)
#footnotes[ab+'_hydro2'] = "Mean total hydrophobicity (KD scale)"
footnotes[ab + '_hydro2'] = "Mean CDR3-{} total hydrophobicity (HP scale)".format(ab)

# Every footnote key must name a known column.
for tag in footnotes:  ## no typos
    assert tag in header_tags

all_dats = {}


def add_dat(epitope, tag, val):
    """Record *val* as all_dats[epitope][tag].

    *tag* must be one of header_tags (checked with an assert). The
    all_dats[epitope] entry must already exist; this function does not
    create it.
    """
    global all_dats
    assert tag in header_tags
    all_dats[epitope][tag] = val


## parse the clones file
all_tcrs = parse_tsv.parse_tsv_file(
    clones_file, ['epitope', 'subject'], ['cdr3a', 'cdr3b', 'clone_size'])

# sorted() accepts both a key list (Python 2) and a key view (Python 3),
# whereas slicing the result of keys() fails on Python 3.
epitopes = sorted(all_tcrs.keys())


def get_charge(cdr3):
    """Return the sum of aa_charge values over *cdr3*; unknown residues add 0.0."""
    return sum(aa_charge.get(x, 0.0) for x in cdr3)


def get_hp1(cdr3):
    """Return the sum of negated GES values over *cdr3*; unknown residues add 0.0."""
    return sum(-1 * GES.get(x, 0.0) for x in cdr3)


def get_hp2(cdr3):
    """Return the sum of HP values over *cdr3*; unknown residues add 0.0."""
    return sum(HP.get(x, 0.0) for x in cdr3)
    #return sum( ( KD.get(x,0.0) for x in cdr3 ) )


all_scores = {}