def setline(self, irandom=None):  # don't access <self.line> directly
    if self.line is not None:
        return self.line
    line = {}
    for region in utils.regions:
        line[region + '_gene'] = self.genes[region]
    for boundary in utils.boundaries:
        line[boundary + '_insertion'] = self.insertions[boundary]
    for boundary in utils.effective_boundaries:
        line[boundary + '_insertion'] = ''  # NOTE 'fv' and 'jf' insertions are hereby hardcoded to zero (I'm just writing this here to make it easily searchable -- I don't remember why it's set up that way)
    for erosion in utils.real_erosions:
        line[erosion + '_del'] = self.erosions[erosion]
    for erosion in utils.effective_erosions:
        line[erosion + '_del'] = self.effective_erosions[erosion]
    line['input_seqs'] = self.final_seqs
    line['indelfos'] = self.indelfos
    line['seqs'] = [self.indelfos[iseq]['reversed_seq'] if indelutils.has_indels(self.indelfos[iseq]) else line['input_seqs'][iseq] for iseq in range(len(line['input_seqs']))]
    self.set_ids(line, irandom=irandom)
    treeutils.translate_labels(self.tree, zip(self.leaf_names, line['unique_ids']))  # ordering in <self.leaf_names> is set in recombinator.add_mutants()
    line['affinities'] = [None for _ in line['unique_ids']]
    line['tree'] = self.tree.as_string(schema='newick')
    utils.add_implicit_info(self.glfo, line)
    self.line = line
def setline(self, irandom=None):  # don't access <self.line> directly
    if self.line is not None:
        return self.line
    line = {}
    for region in utils.regions:
        line[region + '_gene'] = self.genes[region]
    for boundary in utils.boundaries:
        line[boundary + '_insertion'] = self.insertions[boundary]
    for boundary in utils.effective_boundaries:
        line[boundary + '_insertion'] = ''
    for erosion in utils.real_erosions:
        line[erosion + '_del'] = self.erosions[erosion]
    for erosion in utils.effective_erosions:
        line[erosion + '_del'] = self.effective_erosions[erosion]
    line['input_seqs'] = self.final_seqs
    line['indelfos'] = self.indelfos
    line['seqs'] = [line['indelfos'][iseq]['reversed_seq'] if indelutils.has_indels(line['indelfos'][iseq]) else line['input_seqs'][iseq] for iseq in range(len(line['input_seqs']))]
    self.set_ids(line, irandom)
    utils.add_implicit_info(self.glfo, line)
    self.line = line
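# Minimal sketch (assumptions flagged, not partis code) of the indel-reversal pattern both
# setline() versions above share: 'seqs' holds the indel-reversed sequence wherever an indel
# was detected, and otherwise just mirrors 'input_seqs'. The helper and the has_indels test
# below are hypothetical stand-ins for indelutils.has_indels().
def _pick_seqs(input_seqs, indelfos, has_indels):  # hypothetical helper, for illustration only
    return [ifo['reversed_seq'] if has_indels(ifo) else iseq
            for iseq, ifo in zip(input_seqs, indelfos)]

# example: the second sequence has an indel, so its reversed version is chosen
print _pick_seqs(['ACGT', 'ACNGT'],
                 [{'reversed_seq' : ''}, {'reversed_seq' : 'ACGT'}],
                 lambda ifo: ifo['reversed_seq'] != '')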
def print_event(self):
    line = {}  # collect some information into a form that the print fcn understands
    for region in utils.regions:
        line[region + '_gene'] = self.genes[region]
    for boundary in utils.boundaries:
        line[boundary + '_insertion'] = self.insertions[boundary]
    for erosion in utils.real_erosions:
        line[erosion + '_del'] = self.erosions[erosion]
    for erosion in utils.effective_erosions:
        line[erosion + '_del'] = self.effective_erosions[erosion]
    assert 'fv_insertion' not in line  # well, in principle it's ok if they're there, but in that case I'll need to at least think about updating some things
    assert 'jf_insertion' not in line
    line['fv_insertion'] = ''
    line['jf_insertion'] = ''
    line['input_seqs'] = self.final_seqs
    line['indel_reversed_seqs'] = []
    for iseq in range(len(self.indelfos)):
        if self.indelfos[iseq]['reversed_seq'] != '':
            line['indel_reversed_seqs'].append(self.indelfos[iseq]['reversed_seq'])
        else:
            line['indel_reversed_seqs'].append(line['input_seqs'][iseq])
    line['seqs'] = line['indel_reversed_seqs']
    line['indelfos'] = self.indelfos
    line['unique_ids'] = [str(i) for i in range(len(self.final_seqs))]
    line['cdr3_length'] = self.cdr3_length
    line['codon_positions'] = copy.deepcopy(self.final_codon_positions)
    utils.add_implicit_info(self.glfo, line)
    utils.print_reco_event(self.glfo['seqs'], line)
def add_to_info(self, query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions):
    assert query_name not in self.info
    self.info['queries'].append(query_name)
    self.info[query_name] = {}
    self.info[query_name]['unique_id'] = query_name  # redundant, but used somewhere down the line
    self.info[query_name]['k_v'] = kvals['v']
    self.info[query_name]['k_d'] = kvals['d']
    self.info[query_name]['all'] = ':'.join(match_names['v'] + match_names['d'] + match_names['j'])  # all gene matches for this query
    self.info[query_name]['cdr3_length'] = codon_positions['j'] - codon_positions['v'] + 3  # tryp_position_in_joined_seq - self.cyst_position + 3
    self.info[query_name]['cyst_position'] = codon_positions['v']
    self.info[query_name]['tryp_position'] = codon_positions['j']

    # erosion, insertion, mutation info for best match
    self.info[query_name]['v_5p_del'] = all_germline_bounds[best['v']][0]
    self.info[query_name]['v_3p_del'] = len(self.glfo['seqs']['v'][best['v']]) - all_germline_bounds[best['v']][1]  # len(germline v) - gl_match_end
    self.info[query_name]['d_5p_del'] = all_germline_bounds[best['d']][0]
    self.info[query_name]['d_3p_del'] = len(self.glfo['seqs']['d'][best['d']]) - all_germline_bounds[best['d']][1]
    self.info[query_name]['j_5p_del'] = all_germline_bounds[best['j']][0]
    self.info[query_name]['j_3p_del'] = len(self.glfo['seqs']['j'][best['j']]) - all_germline_bounds[best['j']][1]

    self.info[query_name]['fv_insertion'] = query_seq[ : all_query_bounds[best['v']][0]]
    self.info[query_name]['vd_insertion'] = query_seq[all_query_bounds[best['v']][1] : all_query_bounds[best['d']][0]]
    self.info[query_name]['dj_insertion'] = query_seq[all_query_bounds[best['d']][1] : all_query_bounds[best['j']][0]]
    self.info[query_name]['jf_insertion'] = query_seq[all_query_bounds[best['j']][1] : ]

    self.info[query_name]['indelfo'] = self.info['indels'].get(query_name, utils.get_empty_indel())

    for region in utils.regions:
        self.info[query_name][region + '_gene'] = best[region]
        self.info['all_best_matches'].add(best[region])
        self.info['all_matches'][region] |= set(match_names[region])

    self.info[query_name]['seq'] = query_seq  # NOTE this is the seq output by vdjalign, i.e. if we reversed any indels it is the reversed sequence

    existing_implicit_keys = tuple(['cdr3_length', 'cyst_position', 'tryp_position'])
    utils.add_implicit_info(self.glfo, self.info[query_name], multi_seq=False, existing_implicit_keys=existing_implicit_keys)

    if self.debug:
        if not self.args.is_data:
            utils.print_reco_event(self.glfo['seqs'], self.reco_info[query_name], extra_str=' ', label='true:')
        utils.print_reco_event(self.glfo['seqs'], self.info[query_name], extra_str=' ', label='inferred:')

    if self.alfinder is not None:
        self.alfinder.increment(self.info[query_name])
    if self.pcounter is not None:
        self.pcounter.increment_all_params(self.info[query_name])
    if self.true_pcounter is not None:
        self.true_pcounter.increment_all_params(self.reco_info[query_name])
    if self.perfplotter is not None:
        if query_name in self.info['indels']:
            print ' skipping performance evaluation of %s because of indels' % query_name  # I just have no idea how to handle naive hamming fraction when there's indels
        else:
            self.perfplotter.evaluate(self.reco_info[query_name], self.info[query_name])

    self.remaining_queries.remove(query_name)
def read_annotations(fname, glfo):
    annotations = {}
    with open(fname.replace('.csv', '-cluster-annotations.csv')) as csvfile:
        reader = csv.DictReader(csvfile)
        for line in reader:  # there's a line for each cluster
            if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
                continue
            utils.process_input_line(line)  # converts strings in the csv file to floats/ints/dicts/etc.
            utils.add_implicit_info(glfo, line)  # add stuff to <line> that's useful, isn't written to the csv since it's redundant
            # utils.print_reco_event(line)  # print ascii-art representation of the rearrangement event
            annotations[getkey(line['unique_ids'])] = line
    return annotations
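# Hedged usage sketch for read_annotations() (not from the original source): 'my-partition.csv'
# and <partis_path> are hypothetical placeholders, and getkey() is assumed to be the ':'.join()
# helper that appears further down in this file. Note the function itself swaps the suffix to
# '-cluster-annotations.csv' before opening.
import glutils
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh')
annotations = read_annotations('my-partition.csv', glfo)
for uidstr in sorted(annotations, key=lambda q: len(annotations[q]['unique_ids']), reverse=True):
    print '%4d seqs   %s' % (len(annotations[uidstr]['unique_ids']), annotations[uidstr]['v_gene'])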
def try_scratch_erode_insert(self, tmpline, debug=False):
    utils.remove_all_implicit_info(tmpline)
    for erosion in utils.real_erosions:  # includes various contortions to avoid eroding the entire gene
        region = erosion[0]
        gene_length = len(self.glfo['seqs'][region][tmpline[region + '_gene']])
        if region == 'd' and not utils.has_d_gene(self.args.locus):  # dummy d genes: always erode the whole thing from the left
            assert gene_length == 1 and tmpline['d_gene'] == glutils.dummy_d_genes[self.args.locus]
            tmpline[erosion + '_del'] = 1 if '5p' in erosion else 0
        else:
            max_erosion = max(0, gene_length/2 - 2)  # heuristic
            if region in utils.conserved_codons[self.args.locus]:  # make sure not to erode a conserved codon
                codon_pos = utils.cdn_pos(self.glfo, region, tmpline[region + '_gene'])
                if '3p' in erosion:
                    n_bases_to_codon = gene_length - codon_pos - 3
                elif '5p' in erosion:
                    n_bases_to_codon = codon_pos
                max_erosion = min(max_erosion, n_bases_to_codon)
            tmpline[erosion + '_del'] = min(max_erosion, numpy.random.geometric(1. / utils.scratch_mean_erosion_lengths[erosion]) - 1)
    for bound in utils.boundaries:
        mean_length = utils.scratch_mean_insertion_lengths[self.args.locus][bound]
        length = 0 if mean_length == 0 else numpy.random.geometric(1. / mean_length) - 1
        probs = [self.insertion_content_probs[bound][n] for n in utils.nukes]
        tmpline[bound + '_insertion'] = ''.join(numpy.random.choice(utils.nukes, size=length, p=probs))

    if debug:
        print ' erosions: %s' % (' '.join([('%s %d' % (e, tmpline[e + '_del'])) for e in utils.real_erosions]))
        print ' insertions: %s' % (' '.join([('%s %s' % (b, tmpline[b + '_insertion'])) for b in utils.boundaries]))

    # have to add some things by hand so utils.add_implicit_info() doesn't barf (this duplicates code later on in recombinator)
    gl_seqs = {r : self.glfo['seqs'][r][tmpline[r + '_gene']] for r in utils.regions}
    for erosion in utils.real_erosions:
        region = erosion[0]
        e_length = tmpline[erosion + '_del']
        if '5p' in erosion:
            gl_seqs[region] = gl_seqs[region][e_length:]
        elif '3p' in erosion:
            gl_seqs[region] = gl_seqs[region][:len(gl_seqs[region]) - e_length]
    tmpline['seqs'] = [gl_seqs['v'] + tmpline['vd_insertion'] + gl_seqs['d'] + tmpline['dj_insertion'] + gl_seqs['j'], ]
    tmpline['unique_ids'] = [None]  # this is kind of hackey, but some things in the implicit info adder use it to get the number of sequences
    tmpline['input_seqs'] = copy.deepcopy(tmpline['seqs'])  # NOTE has to be updated _immediately_ so seqs and input_seqs don't get out of sync
    tmpline['indelfos'] = [indelutils.get_empty_indel(), ]
    utils.add_implicit_info(self.glfo, tmpline)
    assert len(tmpline['in_frames']) == 1
def print_event(self):
    line = {}  # collect some information into a form that print_reco_event understands
    # line['cdr3_length'] = self.cdr3_length
    for region in utils.regions:
        line[region + '_gene'] = self.genes[region]
    for boundary in utils.boundaries:
        line[boundary + '_insertion'] = self.insertions[boundary]
    for erosion in utils.real_erosions:
        line[erosion + '_del'] = self.erosions[erosion]
    for erosion in utils.effective_erosions:
        line[erosion + '_del'] = self.effective_erosions[erosion]
    # line['cyst_position'] = self.final_cyst_position
    # line['tryp_position'] = self.final_tryp_position
    assert 'fv_insertion' not in line  # well, in principle it's ok if they're there, but in that case I'll need to at least think about updating some things
    assert 'jf_insertion' not in line
    line['fv_insertion'] = ''
    line['jf_insertion'] = ''
    line['seqs'] = self.final_seqs
    line['unique_ids'] = [i for i in range(len(self.final_seqs))]
    utils.add_implicit_info(self.glfo, line, multi_seq=True)
    utils.print_reco_event(self.glfo['seqs'], line, indelfos=self.indelfo)
def try_scratch_erode_insert(tmpline):
    utils.remove_all_implicit_info(tmpline)
    for erosion in utils.real_erosions:  # includes various contortions to avoid eroding the entire gene
        region = erosion[0]
        gene_length = len(self.glfo['seqs'][region][tmpline[region + '_gene']])
        if self.args.chain != 'h' and region == 'd':  # light chains dummy d treatment
            assert gene_length == 1 and tmpline['d_gene'] == glutils.dummy_d_genes[self.args.chain]
            tmpline[erosion + '_del'] = 1 if '5p' in erosion else 0  # always erode the whole dummy d from the left
        else:
            max_erosion = max(0, gene_length/2 - 2)  # now that, son, is a heuristic
            if region in utils.conserved_codons[self.args.chain]:
                codon_pos = self.glfo[utils.conserved_codons[self.args.chain][region] + '-positions'][tmpline[region + '_gene']]
                if '3p' in erosion:
                    n_bases_to_codon = gene_length - codon_pos - 3
                elif '5p' in erosion:
                    n_bases_to_codon = codon_pos
                max_erosion = min(max_erosion, n_bases_to_codon)
            tmpline[erosion + '_del'] = min(max_erosion, numpy.random.geometric(1. / utils.scratch_mean_erosion_lengths[erosion]) - 1)
    for bound in utils.boundaries:
        mean_length = utils.scratch_mean_insertion_lengths[self.args.chain][bound]
        length = 0 if mean_length == 0 else numpy.random.geometric(1. / mean_length) - 1
        probs = [self.insertion_content_probs[bound][n] for n in utils.nukes]
        tmpline[bound + '_insertion'] = ''.join(numpy.random.choice(utils.nukes, size=length, p=probs))

    # have to add some things by hand so utils.add_implicit_info() doesn't barf (this duplicates code later on in recombinator)
    gl_seqs = {r : self.glfo['seqs'][r][tmpline[r + '_gene']] for r in utils.regions}
    for erosion in utils.real_erosions:
        region = erosion[0]
        e_length = tmpline[erosion + '_del']
        if '5p' in erosion:
            gl_seqs[region] = gl_seqs[region][e_length:]
        elif '3p' in erosion:
            gl_seqs[region] = gl_seqs[region][:len(gl_seqs[region]) - e_length]
    tmpline['seqs'] = [gl_seqs['v'] + tmpline['vd_insertion'] + gl_seqs['d'] + tmpline['dj_insertion'] + gl_seqs['j'], ]
    tmpline['indelfos'] = [utils.get_empty_indel(), ]
    utils.add_implicit_info(self.glfo, tmpline)
    assert len(tmpline['in_frames']) == 1
def read_sequence_file(infname, is_data, n_max_queries=-1, args=None, simglfo=None, quiet=False, more_input_info=None):
    # NOTE renamed this from get_seqfile_info() since I'm changing the return values, but I don't want to update the calls everywhere (e.g. in compareutils)
    yaml_glfo = None
    suffix = utils.getsuffix(infname)
    if suffix in delimit_info:
        seqfile = open(infname)  # closes on function exit. no, this isn't the best way to do this
        reader = csv.DictReader(seqfile, delimiter=delimit_info[suffix])
    elif suffix in ['.fa', '.fasta', '.fastx']:
        reader = utils.read_fastx(infname, name_key='unique_ids', seq_key='input_seqs', add_info=False, sanitize=True,
                                  n_max_queries=n_max_queries,  # NOTE don't use istartstop kw arg here, 'cause it f***s with the istartstop treatment in the loop below
                                  queries=(args.queries if (args is not None and not args.abbreviate) else None))  # NOTE also can't filter on args.queries here if we're also translating
    elif suffix == '.yaml':
        yaml_glfo, reader, _ = utils.read_yaml_output(infname, n_max_queries=n_max_queries, synth_single_seqs=True, dont_add_implicit_info=True)  # not really sure that long term I want to synthesize single seq lines, but for backwards compatibility it's nice a.t.m.
        if not is_data:
            simglfo = yaml_glfo  # doesn't replace the contents, of course, which is why we return it
    else:
        raise Exception('unhandled file extension %s' % suffix)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    potential_names, used_names = None, None  # for abbreviating
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print ' %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception('couldn\'t find a sequence column in %s (you can set this with --seq-column)' % infname)
        if suffix != '.yaml':
            utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        if uid in input_info:
            new_uid = uid
            iid = 2
            while new_uid in input_info:
                new_uid = uid + '-' + str(iid)
                iid += 1
            print ' %s uid %s already read from input file %s, so replacing with new uid %s' % (utils.color('yellow', 'warning'), uid, infname, new_uid)
            uid = new_uid
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))

        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid, potential_names, used_names = utils.choose_new_uid(potential_names, used_names)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True
        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' % (uid, infname))
        if any(c not in utils.alphabet for c in inseq):
            unexpected_chars = set([ch for ch in inseq if ch not in utils.alphabet])
            raise Exception('unexpected character%s %s (not among %s) in input sequence with id %s:\n %s' % (utils.plural(len(unexpected_chars)), ', '.join([('\'%s\'' % ch) for ch in unexpected_chars]), utils.nukes + utils.ambiguous_bases, uid, inseq))

        # da business
        input_info[uid] = {'unique_ids' : [uid, ], 'seqs' : [inseq, ]}

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])
            for line_key in utils.input_metafile_keys.values():
                if line_key in reco_info[uid]:  # this is kind of weird to copy from sim info to input info, but it makes sense because affinity is really meta info (the only other place affinity could come from is --input-metafname below). Where i'm defining meta info more or less as any input info besides name and sequence (i think the distinction is only really important because we want to support fastas, which can't [shouldn't!] handle anything else)
                    input_info[uid][line_key] = copy.deepcopy(reco_info[uid][line_key])  # note that the args.input_metafname stuff below should print a warning if you've also specified that (which you shouldn't, if it's simulation)

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            if not quiet:  # just adding <quiet>, and too lazy to decide which other print statements it should affect; this is the only one I care about right now
                print ' --n-max-queries: stopped after reading %d queries from input file' % len(input_info)
            break

    if more_input_info is not None:  # if you use this on simulation, the extra queries that aren't in <reco_info> may end up breaking something down the line (but I don't imagine this really getting used on simulation)
        if len(set(more_input_info) & set(input_info)) > 0:
            print ' %s found %d queries in both --infname and --queries-to-include-fname (note that we don\'t check here that they correspond to the same sequence): %s' % (utils.color('red', 'note:'), len(set(more_input_info) & set(input_info)), ' '.join(set(more_input_info) & set(input_info)))  # not necessarily a problem, but you probably *shouldn't* have sequences floating around in two different files
        if args is not None and args.seed_unique_id is not None and args.seed_unique_id in more_input_info:
            found_seed = True
        input_info.update(more_input_info)
    if args is not None and args.input_metafname is not None:
        read_input_metafo(args.input_metafname, input_info.values(), debug=True)
    post_process(input_info, reco_info, args, infname, found_seed, is_data, iline)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info, yaml_glfo
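# Hedged usage sketch (not from the original source): 'simu.yaml' is a hypothetical partis
# output file. For simulation input, <reco_info> carries the true annotations and <yaml_glfo>
# the germline set embedded in the yaml.
input_info, reco_info, yaml_glfo = read_sequence_file('simu.yaml', is_data=False)
print 'read %d sequences' % len(input_info)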
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx('%s/%s.fasta' % (outdir, args.extrastr))  # output mutated sequences from bcr-phylo
    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = 'export PATH=%s:$PATH && xvfb-run -a python ./bin/view-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (ete_path, outdir, args.extrastr, outdir, outdir)
        utils.simplerun(cmd, shell=True)
        kdvals = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                kdvals[line['uid']] = float(line['kd'])
        if len(set(kdvals) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print ' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(kdvals) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(kdvals)) > 0:
            print ' in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(kdvals))
        final_line['affinities'] = [1. / kdvals[u] for u in final_line['unique_ids']]
        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]
    from Bio.Seq import Seq
    final_line['nearest_target_indices'] = []
    aa_targets = [Seq(seq).translate() for seq in final_line['target_seqs']]
    for mseq in final_line['input_seqs']:
        aa_mseq = Seq(mseq).translate()
        aa_hdists = [utils.hamming_distance(aa_t, aa_mseq, amino_acid=True) for aa_t in aa_targets]
        imin = aa_hdists.index(min(aa_hdists))  # NOTE doesn't do anything differently if there's more than one min
        final_line['nearest_target_indices'].append(imin)

    return final_line
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo
    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (outdir, args.extrastr, outdir, outdir)
        utils.run_ete_script(cmd, ete_path)
        nodefo = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd' : float(line['kd']),
                    'relative_kd' : float(line['relative_kd']),
                    'lambda' : line.get('lambda', None),
                    'target_index' : int(line['target_index']),
                }
        if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print ' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print ' in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo))
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
def get_seqfile_info(infname, is_data, n_max_queries=-1, args=None, glfo=None, simglfo=None):
    """ return list of sequence info from files of several types """

    if not is_data and glfo is None:
        print ' WARNING glfo is None, so not adding implicit info'

    suffix = os.path.splitext(infname)[1]
    if len(re.findall('\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:
            assert False
        seqfile = open(infname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        reader = utils.read_fastx(infname, name_key='unique_ids', seq_key='input_seqs', add_info=False, sanitize=True, queries=(args.queries if args is not None else None), n_max_queries=n_max_queries)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if args is not None and args.abbreviate:
        potential_names = list(string.ascii_lowercase)
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print ' %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception('couldn\'t find a sequence column in %s (you can set this with --seq-column)' % infname)
        utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))

        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid = abbreviate(used_names, potential_names, uid)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True
        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' % (uid, infname))
        if len(inseq.translate(None, ''.join(utils.alphabet))) > 0:
            raise Exception('unexpected character (not among %s) in input sequence with id %s:\n %s' % (utils.nukes + utils.ambiguous_bases, uid, inseq))
        input_info[uid] = {'unique_ids' : [uid, ], 'seqs' : [inseq, ]}

        if n_queries_added == 0 and is_data and 'v_gene' in line:
            print ' note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            break

    post_process(input_info, reco_info, args, infname, found_seed, is_data)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
# formatting necessity
def getkey(uid_list):
    return ':'.join(uid_list)

# creates a dictionary with keys = unique_ids and values = annotations
annotations = {}
with open(args.infile.replace('.csv', '-cluster-annotations.csv')) as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:  # there's a line for each cluster
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
            continue
        utils.process_input_line(line)  # converts strings in the csv file to floats/ints/dicts/etc.
        utils.add_implicit_info(glfo, line)  # add stuff to <line> that's useful, isn't written to the csv since it's redundant
        # utils.print_reco_event(line)  # print ascii-art representation of the rearrangement event
        annotations[getkey(line['unique_ids'])] = line

# sort by size
sorted_clusters = sorted(annotations, key=lambda q: len(annotations[q]['unique_ids']), reverse=True)
#### sorted_clusters = [c for c in sorted_clusters if utils.is_functional(annotations[c])]  # checks if the cluster contains ANY non-functional sequences

# total size of repertoire (number of sequences) -- note: entries in <sorted_clusters> are ':'-joined uid strings, so we count the uids rather than taking len() of the key itself
n_total = sum(len(annotations[c]['unique_ids']) for c in sorted_clusters)

# add more criteria
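# hedged continuation sketch (not in the original): with <sorted_clusters> and <n_total>
# in hand, report each cluster's share of the repertoire
for uidstr in sorted_clusters:
    csize = len(annotations[uidstr]['unique_ids'])
    print ' %5d seqs  (%.3f of repertoire)  %s' % (csize, float(csize) / n_total, annotations[uidstr]['v_gene'])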
def get_seqfile_info(infname, is_data, n_max_queries=-1, args=None, simglfo=None, quiet=False):
    """ return list of sequence info from files of several types """

    suffix = utils.getsuffix(infname)
    if len(re.findall('\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:
            assert False
        seqfile = open(infname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        reader = utils.read_fastx(infname, name_key='unique_ids', seq_key='input_seqs', add_info=False, sanitize=True,
                                  n_max_queries=n_max_queries,  # NOTE don't use istartstop kw arg here, 'cause it f***s with the istartstop treatment in the loop below
                                  queries=(args.queries if (args is not None and not args.abbreviate) else None))  # NOTE also can't filter on args.queries here if we're also translating

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if args is not None and args.abbreviate:
        potential_names = list(string.ascii_lowercase)
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print ' %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception('couldn\'t find a sequence column in %s (you can set this with --seq-column)' % infname)
        utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        if uid in input_info:
            new_uid = uid
            iid = 2
            while new_uid in input_info:
                new_uid = uid + '-' + str(iid)
                iid += 1
            print ' %s uid %s already read from input file %s, so replacing with new uid %s' % (utils.color('yellow', 'warning'), uid, infname, new_uid)
            uid = new_uid
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))

        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid = abbreviate(used_names, potential_names, uid)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True
        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' % (uid, infname))
        if len(inseq.translate(None, ''.join(utils.alphabet))) > 0:
            unexpected_chars = set([ch for ch in inseq if ch not in utils.alphabet])
            raise Exception('unexpected character%s %s (not among %s) in input sequence with id %s:\n %s' % (utils.plural(len(unexpected_chars)), ', '.join([('\'%s\'' % ch) for ch in unexpected_chars]), utils.nukes + utils.ambiguous_bases, uid, inseq))

        # da business
        input_info[uid] = {'unique_ids' : [uid, ], 'seqs' : [inseq, ]}

        if n_queries_added == 0 and is_data and 'reco_id' in line:
            print ' note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            if not quiet:  # just adding <quiet>, and too lazy to decide which other print statements it should affect; this is the only one I care about right now
                print ' --n-max-queries: stopped after reading %d queries from input file' % len(input_info)
            break

    post_process(input_info, reco_info, args, infname, found_seed, is_data, iline)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
def get_seqfile_info(fname, is_data, glfo=None, n_max_queries=-1, queries=None, reco_ids=None):
    """ return list of sequence info from files of several types """

    suffix = os.path.splitext(fname)[1]
    if suffix == '.csv':
        delimiter = ','
        name_column = 'unique_id'
        seq_column = 'seq'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif suffix == '.tsv':
        delimiter = '\t'
        name_column = 'name'
        seq_column = 'nucleotide'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        if suffix == '.fasta' or suffix == '.fa':
            ftype = 'fasta'
        elif suffix == '.fastq' or suffix == '.fq':
            ftype = 'fastq'
        else:
            raise Exception('couldn\'t handle file extension for %s' % fname)
        name_column = 'unique_id'
        seq_column = 'seq'
        reader = []
        n_fasta_queries = 0
        for seq_record in SeqIO.parse(fname, ftype):
            # if command line specified query or reco ids, skip other ones
            if queries is not None and seq_record.name not in queries:
                continue
            # if reco_ids is not None and line['reco_id'] not in reco_ids:  # probably no reco ids in a fasta file
            #     continue
            reader.append({})
            reader[-1][name_column] = seq_record.name
            reader[-1][seq_column] = str(seq_record.seq).upper()
            n_fasta_queries += 1
            if n_max_queries > 0 and n_fasta_queries >= n_max_queries:
                break

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    n_queries = 0
    for line in reader:
        if '.csv' in fname and name_column not in line:  # hackey hackey hackey
            name_column = 'name'
            seq_column = 'nucleotide'
        utils.process_input_line(line)
        unique_id = line[name_column]
        if ':' in unique_id:
            raise Exception('found a \':\' in sequence id \'%s\' -- you\'ll have to replace it with something else, as we use \':\'s internally to concatenate sequence ids' % unique_id)
        # if command line specified query or reco ids, skip other ones
        if queries is not None and unique_id not in queries:
            continue
        if reco_ids is not None and line['reco_id'] not in reco_ids:
            continue
        input_info[unique_id] = {'unique_id' : unique_id, 'seq' : line[seq_column]}
        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s -- if this is data add option --is-data' % fname)
            reco_info[unique_id] = dict(line)
            if 'indels' in line and line['indels']['reversed_seq'] != '':  # TODO unhackify this
                reco_info[unique_id]['seq'] = line['indels']['reversed_seq']
            if 'indels' not in line:  # TODO unhackify this
                reco_info[unique_id]['indels'] = None
            if glfo is not None:
                utils.remove_implicit_info(reco_info[unique_id], multi_seq=False)
                utils.add_implicit_info(glfo, reco_info[unique_id], multi_seq=False)  # each seq is on its own line in the file
        n_queries += 1
        if n_max_queries > 0 and n_queries >= n_max_queries:
            break

    if len(input_info) == 0:
        raise Exception('didn\'t end up pulling any input info out of %s while looking for queries: %s reco_ids: %s\n' % (fname, str(queries), str(reco_ids)))

    return (input_info, reco_info)
def partis_naive_seq(lseq, fnam):
    '''
    Given a number of sequences infer the naive sequence using partis.
    '''
    # Specify filenames:
    pretty_random_fnam = str(random.randint(1, 10**100))
    inpf = pretty_random_fnam + '_input'
    outf = pretty_random_fnam + '_output'
    # Write input fasta file for partis:
    with open(TMPDIR+'/'+inpf+'.fa', 'w') as fho:
        for i, s in enumerate(lseq):
            fho.write('>{}\n{}\n'.format(str(i), s))
    # Run partis:
    cmd = '{}/bin/partis partition --locus {} --species {} --infname {}/{}.fa --outfname {}/{}.csv'.format(partis_path, args.LOCUS, args.SPECIES, TMPDIR, inpf, TMPDIR, outf)
    # os.system(cmd)  # Print partis STDOUT to screen
    os.system('{} > {}/{}.log'.format(cmd, TMPDIR, pretty_random_fnam))
    try:
        # Read the partis output file and extract the naive sequence:
        with open(TMPDIR+'/'+outf+'-cluster-annotations.csv') as fh:
            reader = csv.DictReader(fh)
            data = list(reader)
        # assert(len(data) == 1)  # There should really only be one clonal family, but often there are more, so just take the first (largest)
        # Extract germline bounds info and trim the naive DNA sequence:
        try:
            utils.process_input_line(data[0])  # Process dataframe row
            fnam_base = fnam.split('_partitions')[0].split('/')
            # glfo = glutils.read_glfo('{}/_output/{}/hmm/germline-sets'.format(fnam_base[0], fnam_base[-1]), locus=args.LOCUS)
            glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus=args.LOCUS)
            utils.add_implicit_info(glfo, data[0])  # Adding germline info
        except Exception as e:
            print e
            raise e
        naiveDNA = data[0]['naive_seq'][:]
        first_lseq = data[0]['input_seqs'][:][0]
        vj_bounds = (data[0]['regional_bounds']['v'][0], data[0]['regional_bounds']['j'][1])
        naiveDNA = repair_new_naive(naiveDNA[:], naiveDNA[:], vj_bounds)
        first_lseq = repair_new_naive(first_lseq, naiveDNA[:], vj_bounds)
        try:
            assert(len(first_lseq) == len(naiveDNA))
        except:
            print 'len(first_lseq) != len(data[0]["naive_seq"])'
            print len(first_lseq)
            print first_lseq
            print len(naiveDNA)
            print naiveDNA
        # If the inferred naive sequence contains a stop codon replace it by the input sequence codon:
        if '*' in str(Seq(naiveDNA, generic_dna).translate()):
            print 'Found stop codon in inferred naive sequence, will replace with input sequence codon.'
            print 'Before replacement:', naiveDNA
            naiveDNA_l = list(naiveDNA[:])
            for codon in range(vj_bounds[0], vj_bounds[1], 3):
                if '*' == str(Seq(naiveDNA[codon:codon+3], generic_dna).translate()):
                    naiveDNA_l[codon:codon+3] = first_lseq[codon:codon+3]
            naiveDNA = ''.join(naiveDNA_l)
            print 'After replacement:', naiveDNA
        if naiveDNA == first_lseq:
            print 'Complaining to say naiveDNA == first_lseq (nothing bad, just making sure the repair is not replacing the naive sequence with the input entirely)'
        return(naiveDNA)
    finally:
        # Clean up:
        os.system('rm -r {}/{}* _output/*{}*'.format(TMPDIR, pretty_random_fnam, pretty_random_fnam))
def extract_seqs(fnam):
    '''
    Reads a partis cluster-annotations file and extracts
    relevant information and sequences.
    '''
    # Read cluster annotations into a data list of dictionaries:
    with open(fnam) as fh:
        reader = csv.DictReader(fh)
        data = list(reader)
    sequences_i = list()
    info_i = list()
    if args.allele_finding:
        fnam_base = fnam.split('_partitions')[0].split('/')
        glfo = glutils.read_glfo('{}/_output/{}/hmm/germline-sets'.format(fnam_base[0], fnam_base[-1]), locus=args.LOCUS)
    else:
        glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus=args.LOCUS)
    for row in data:
        # Process the partis data row and add germline information:
        try:
            utils.process_input_line(row)  # Read default germline info
            utils.add_implicit_info(glfo, row)
        except Exception as e:  # Skip rows that cannot be processed
            if 'failed annotation' not in str(e):
                pass
                # print('First skip')
                # print(e)
            else:
                print 'Reading from'
                print '{}/_output/{}/hmm/germline-sets'.format(fnam_base[0], fnam_base[-1])
                print e
            continue

        # Extract the full N padded naive sequence,
        # and find the v -and j gene bound on this naive sequence:
        cdr3_bounds = (row['codon_positions']['v'], row['codon_positions']['j'] + 3)
        vj_bounds = (row['regional_bounds']['v'][0], row['regional_bounds']['j'][1])
        naiveDNA = row['naive_seq']
        # Skip naive sequences too short or with stop codons:
        if repair_seq(naiveDNA, naiveDNA, vj_bounds, keep_check=True) is False:
            continue
        trimmed_naiveDNA = repair_seq(naiveDNA[:], naiveDNA[:], vj_bounds)
        naiveAA = str(Seq(trimmed_naiveDNA, generic_dna).translate())

        # There has been a name change and this try/except is meant to provide backwards compatibility:
        try:
            lseq = row['input_seqs'][:]
        except:
            lseq = row['seqs'][:]
        ir_lseq = row['indel_reversed_seqs']
        stop_seq = row['stops']
        assert(len(lseq) == len(ir_lseq))
        assert(len(lseq) == len(stop_seq))
        # Only keep sequences without indels and stop codons, and with a minimum amino acid length (QC):
        ### ir_lseq[i] == '' or lseq[i] == ir_lseq[i] <-- No indels
        ### stop_seq[i] <-- No partis annotated stops (there seem still to be stops after these are removed though)
        ### repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True) <-- Checks whether the sequence is long enough or has stop codons
        keep_idx = [1 if ((ir_lseq[i] == '' or lseq[i] == ir_lseq[i]) and stop_seq[i] is False and repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)) else 0 for i in range(len(lseq))]

        # Now only keep those sequences that passed QC:
        lseq = [s for s, keep in zip(lseq, keep_idx) if keep == 1]
        # Get amino acid sequences:
        lAAseq = [str(Seq(repair_seq(s[:], naiveDNA[:], vj_bounds), generic_dna).translate()) for s in lseq]
        # And mutation frequencies:
        mut_freqs = [s for s, keep in zip(row['mut_freqs'], keep_idx) if keep == 1]
        assert(len(mut_freqs) == len(lseq))
        # Convert frequency to counts:
        Nmuts = [int(round(float(t[0])*len(t[1].strip('N')))) for i, t in enumerate(zip(mut_freqs, lseq))]

        # Deduplicate AAseqs and lseq according to the duplications on amino acid level:
        lAAseq_dict = dict()
        lseq_unique = list()
        for i, aa in enumerate(lAAseq):
            if aa in lAAseq_dict:
                lAAseq_dict[aa].append(i)
            else:
                lAAseq_dict[aa] = [i]
                lseq_unique.append(repair_seq(lseq[i][:], naiveDNA[:], vj_bounds))
        assert(len(lAAseq_dict) == len(lseq_unique))
        # Make the deduplicated sequence list and the mutation rates:
        lAAseq_dedup = list()
        Nmuts_dedup = list()
        for aa, idxs in lAAseq_dict.items():
            lAAseq_dedup.append(aa)
            Nmut_list = [float(Nmuts[i]) for i in idxs]
            Nmuts_dedup.append(int(round(sum(Nmut_list)/len(Nmut_list))))
        assert(len(lAAseq_dedup) == len(Nmuts_dedup))
        assert(len(lAAseq_dedup) == len(lseq_unique))

        # Exclude small clonal families after all the QC and deduplication:
        if len(lAAseq_dedup) < args.MIN_OBS:
            continue

        # Store the results in a list:
        sequences_i.append(['naive_seq', naiveAA])  # This format is for ANARCI numbering
        info_i.append({'fnam': fnam, 'v_gene': row['v_gene'], 'd_gene': row['d_gene'], 'j_gene': row['j_gene'],
                       'naive_seq': naiveAA, 'naive_seq_DNA': trimmed_naiveDNA, 'Nmuts': Nmuts_dedup[:],
                       'AAseqs': lAAseq_dedup[:], 'DNAseqs': lseq_unique[:]})
    return(sequences_i, info_i)
def write_partis_data_from_annotations(
        output_genes,
        output_seqs,
        path_to_annotations,
        metadata,
        filters={},
        seq_filters={},
        min_clonal_family_size=0,
        min_seq_len=0,
        max_mut_pct=1.,
        min_mut_pct=0.,
        clone_str='',
        region='v',
        germline_family='v',
):
    """
    Function to read partis annotations csv

    @param path_to_annotations: path to annotations files
    @param metadata: csv file of metadata; if None defaults will be used for chain/species
    @param filters: dictionary of lists with keys as column name and items as those values of the column variable to retain;
        filters out families, e.g., {'locus': ['igk']}, etc.
    @param seq_filters: same as filters, but for sequences, e.g., {'indel_reversed_seqs': [''], 'in_frames': [False]} will
        only retain sequences that are out of frame and did not have an indel
    @param min_clonal_family_size: minimum clonal family size
    @param min_seq_len: minimum sequence length
    @param max_mut_pct: maximum mutation percentage
    @param min_mut_pct: minimum mutation percentage
    @param clone_str: string for identifying clones (useful if merging annotations from multiple datasets)
    @param region: B-cell receptor region ('v', 'd', 'j', or 'vdj')
    @param germline_family: for performing cross validation ('v', 'd', or 'j')

    @write genes to output_genes and seqs to output_seqs
    """
    families = ['v', 'd', 'j']
    if germline_family not in families:
        raise ValueError("Invalid germline_family: %s. Must be one of %s" % (germline_family, families))

    regions = ['v', 'd', 'j', 'vdj']
    if region not in regions:
        raise ValueError("Invalid region: %s. Must be one of %s" % (region, regions))

    PARTIS_PATH = os.path.dirname(os.path.realpath(__file__)) + '/partis'
    sys.path.insert(1, PARTIS_PATH + '/python')
    from utils import add_implicit_info, process_input_line
    import glutils

    partition_info = get_partition_info(
        path_to_annotations,
        metadata,
    )

    with open(output_genes, 'w') as genes_file, open(output_seqs, 'w') as seqs_file:
        gene_writer = csv.DictWriter(genes_file, ['germline_name', 'germline_sequence'])
        gene_writer.writeheader()

        seq_header = [
            'germline_name',
            'sequence_name',
            'sequence',
            'germline_family',
            'v_gene',
            'region',
        ]
        for key, _ in partition_info[0].iteritems():
            seq_header += [key]
        seq_writer = csv.DictWriter(seqs_file, seq_header)
        seq_writer.writeheader()

        for data_idx, data_info in enumerate(partition_info):
            if any([data_info[key] not in values for key, values in filters.iteritems()]):
                continue
            glfo = glutils.read_glfo(data_info['germline_file'], locus=data_info['locus'])
            with open(data_info['annotations_file'], "r") as csvfile:
                reader = csv.DictReader(csvfile)
                for idx, line in enumerate(reader):
                    if line['v_gene'] == '':  # failed annotations
                        continue

                    # add goodies from partis
                    process_input_line(line)
                    add_implicit_info(glfo, line)
                    n_seqs = len(line['input_seqs'])
                    if n_seqs < min_clonal_family_size:  # don't take small clonal families---for data quality purposes
                        continue

                    if region == 'vdj':
                        gl_seq = line['naive_seq'].lower()
                        all_seqs = [seq.lower() for seq in line['seqs']]
                    else:
                        gl_seq = line['v_gl_seq'].lower()
                        all_seqs = [seq.lower() for seq in line['v_qr_seqs']]

                    idx_list = []
                    # frequency filter
                    idx_list.append(set([i for i, val in enumerate(line['mut_freqs']) if val < max_mut_pct and val >= min_mut_pct]))
                    # sequence length filter
                    idx_list.append(set([i for i, val in enumerate(all_seqs) if len(val.translate(None, 'n')) > min_seq_len]))
                    for key, values in seq_filters.iteritems():
                        idx_list.append(set([i for i, val in enumerate(line[key]) if val in values]))

                    good_seq_idx = set.intersection(*idx_list)
                    if not good_seq_idx:
                        # no sequences after filtering... skip
                        continue

                    gl_name = 'clone{}-{}-{}'.format(*[data_idx, idx, clone_str])
                    gene_writer.writerow({
                        'germline_name': gl_name,
                        'germline_sequence': gl_seq,
                    })

                    for good_idx in good_seq_idx:
                        base_dict = {
                            'germline_name': gl_name,
                            'sequence_name': '-'.join([gl_name, line['unique_ids'][good_idx]]),
                            'sequence': all_seqs[good_idx].lower(),
                            'germline_family': line['{}_gene'.format(germline_family)][:5],
                            'v_gene': line['v_gene'],
                            'region': region,
                        }
                        for key, value in data_info.iteritems():
                            base_dict[key] = value
                        seq_writer.writerow(base_dict)
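# Hedged usage sketch (not from the original source): all file names and the metadata path
# below are hypothetical placeholders; this just shows one plausible call with the documented
# filter semantics.
write_partis_data_from_annotations(
    'genes.csv', 'seqs.csv',
    path_to_annotations='/path/to/annotations',
    metadata='metadata.csv',
    filters={'locus': ['igh']},           # keep only heavy-chain families
    seq_filters={'in_frames': [True]},    # keep only in-frame sequences
    min_clonal_family_size=10,
    region='v',
)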
def run_partis(seq):
    '''
    Infer VDJ genes and the naive sequence using partis.
    '''
    # Specify filenames:
    pretty_random_fnam = str(random.randint(1, 10**100))
    inpf = pretty_random_fnam + '_input'
    outf = pretty_random_fnam + '_output'
    # Write input fasta file for partis:
    with open(TMPDIR + '/' + inpf + '.fa', 'w') as fho:
        fho.write('>{}\n{}\n'.format('input_sequence', seq))
    # Run partis:
    cmd = '{}/bin/partis annotate --locus {} --species {} --infname {}/{}.fa --outfname {}/{}.csv'.format(partis_path, args.LOCUS, args.SPECIES, TMPDIR, inpf, TMPDIR, outf)
    os.system('{} > {}/{}.log'.format(cmd, TMPDIR, pretty_random_fnam))
    try:
        # Read the partis output file and extract the naive sequence:
        with open(TMPDIR + '/' + outf + '.csv') as fh:
            reader = csv.DictReader(fh)
            data = list(reader)
        ann = data[0]
        # Extract germline bounds info and trim the naive DNA sequence:
        try:
            utils.process_input_line(ann)  # Process dataframe row
            utils.add_implicit_info(glfo, ann)  # Adding germline info
        except Exception as e:
            print e
            raise e
        if ann['stops'] is True:
            raise Exception('Input sequence contains a stop codon. This is not valid.')
        elif ann['v_5p_del'] > 30 or ann['j_3p_del'] > 12:
            raise Exception('Incomplete input sequence error. 5-prime end missing {} nt and 3-prime missing {} nt. Max allowed is 30 and 12, respectively.'.format(ann['v_5p_del'], ann['j_3p_del']))
        elif ann['indelfos'][0]['indels']:
            raise Exception('Input sequence contains indels, this is currently not supported.')

        # Extract full size VDJ sequence for both the inferred naive and the input:
        full_gl_v = glfo['seqs']['v'][ann['v_gene']]  # Germline V
        full_gl_j = glfo['seqs']['j'][ann['j_gene']]  # Germline J
        gl_v_5p_del = full_gl_v[:ann['v_5p_del']]  # 5-prime not included in input
        gl_j_3p_del = full_gl_j[(len(full_gl_j) - ann['j_3p_del']):]  # 3-prime not included in input
        # assert full_gl_v[ann['v_5p_del']:] == ann['v_gl_seq']
        naiveDNA = gl_v_5p_del + ann['naive_seq'] + gl_j_3p_del  # Add the missing positions
        full_input_seq = 'N' * ann['v_5p_del'] + ann['input_seqs'][0] + 'N' * ann['j_3p_del']  # N pad the input sequence
        assert (len(naiveDNA) == len(full_input_seq))
        # Remove the untranslated end:
        if len(naiveDNA) % 3 != 0:
            naiveDNA = naiveDNA[0:-(len(naiveDNA) % 3)]
        if len(full_input_seq) % 3 != 0:
            full_input_seq = full_input_seq[0:-(len(full_input_seq) % 3)]
        if len(naiveDNA) != len(full_input_seq):
            raise Exception('Sequences not equally long after trimming.\nInput: {}\nNaive: {}\n.'.format(full_input_seq, naiveDNA))
        # Replace Ns in input sequence with naive DNA bases:
        full_input_seq = repair_seq(full_input_seq, naiveDNA[:])

        # If the inferred naive sequence contains a stop codon replace it by the input sequence codon:
        if '*' in str(Seq(naiveDNA, generic_dna).translate()):
            print 'Found stop codon in inferred naive sequence, will replace with input sequence codon.'
            print 'Before replacement:', naiveDNA
            naiveDNA_l = list(naiveDNA[:])
            for codon in range(0, len(naiveDNA), 3):
                if '*' == str(Seq(naiveDNA[codon:codon + 3], generic_dna).translate()):
                    naiveDNA_l[codon:codon + 3] = full_input_seq[codon:codon + 3]
            naiveDNA = ''.join(naiveDNA_l)
            print 'After replacement:', naiveDNA
            if '*' in str(Seq(naiveDNA, generic_dna).translate()):
                raise Exception('Naive sequence could not be repaired.')
        if naiveDNA == full_input_seq:
            print 'Warning: input sequence is identical to the inferred naive sequence.'
    finally:
        # Clean up:
        os.system('rm -r {}/{}* _output/*{}*'.format(TMPDIR, pretty_random_fnam, pretty_random_fnam))
    return (naiveDNA, full_input_seq, (ann['v_gene'], ann['d_gene'], ann['j_gene']))
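# Hedged usage sketch (not from the original source): <myseq> is a hypothetical input DNA
# string; run_partis() returns the trimmed naive sequence, the repaired input, and the
# inferred (V, D, J) gene calls.
naiveDNA, input_seq, (v_gene, d_gene, j_gene) = run_partis(myseq)
print 'naive: %s' % naiveDNA
print 'genes: %s %s %s' % (v_gene, d_gene, j_gene)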
import sys
import csv  # needed below for csv.DictReader (missing from the original import block)
partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', chain='h')

print 'first parse an annotation csv file:'
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(glfo['seqs'], line)
        cdr3_bounds = (line['codon_positions']['v'], line['codon_positions']['j'] + 3)
        print ''
        print ' should match the above:'
        print ' %s naive cdr3' % line['naive_seq'][cdr3_bounds[0] : cdr3_bounds[1]]
        print ' %s mature' % line['indel_reversed_seqs'][0][cdr3_bounds[0] : cdr3_bounds[1]]
        print ''
        break

print 'then parse a partition csv file:'
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)
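# hedged addition (not in the original script; <cp.partitions> and <cp.i_best> are attribute
# names recalled from the partis ClusterPath API, flagged here as an assumption):
print '\nbest partition has %d clusters' % len(cp.partitions[cp.i_best])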
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo
    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        try:
            utils.add_implicit_info(glfo, mline)
        except:  # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah, it was just because something crashed in the middle of writing a .fa file
            print 'implicit info adding failed for ievent %d in %s' % (ievent, outdir)
            lines = traceback.format_exception(*sys.exc_info())
            print utils.pad_lines(''.join(lines))  # NOTE this will still crash on the next line if implicit info adding failed
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
        if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4):  # eh, don't really need to check for both the kd and nwk files -- the chance of only one being missing is really small, and it'll just crash when it looks for it a couple lines later
            cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
            utils.run_ete_script(cmd, ete_path, debug=args.n_procs==1)
        nodefo = {}
        with open(kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd' : float(line['kd']),
                    'relative_kd' : float(line['relative_kd']),
                    'lambda' : line.get('lambda', None),
                    'target_index' : int(line['target_index']),
                }
        if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print '        in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo))
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
        tree = treeutils.get_dendro_tree(treefname=nwkfname)
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
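# A minimal usage sketch (not from the source) of driving parse_bcr_phylo_output() over
# several simulated events. <glfo>, <naive_events>, and <base_outdir> are hypothetical
# stand-ins for what the surrounding simulation script would provide.
def collect_mature_events(glfo, naive_events, base_outdir):
    mature_lines = []
    for ievent, naive_line in enumerate(naive_events):
        event_outdir = '%s/event-%d' % (base_outdir, ievent)  # assumed per-event bcr-phylo output dir
        mature_lines.append(parse_bcr_phylo_output(glfo, naive_line, event_outdir, ievent))
    return mature_lines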
def get_seqfile_info(fname, is_data, glfo=None, n_max_queries=-1, queries=None, reco_ids=None, name_column=None, seq_column=None, seed_unique_id=None, abbreviate_names=False):
    """ return list of sequence info from files of several types """
    # WARNING defaults for <name_column> and <seq_column> are also set in partis (since we call this from places other than partis, but we also want people to be able to set them from the partis command line)
    internal_name_column = 'unique_id'  # key we use in the internal dictionaries
    internal_seq_column = 'seq'
    if name_column is None:  # header we expect in the file
        name_column = internal_name_column
    if seq_column is None:
        seq_column = internal_seq_column

    if not is_data and glfo is None:
        print '  WARNING glfo is None, so not adding implicit info'

    suffix = os.path.splitext(fname)[1]
    if len(re.findall('\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:
            assert False
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        if suffix == '.fasta' or suffix == '.fa':
            ftype = 'fasta'
        elif suffix == '.fastq' or suffix == '.fq':
            ftype = 'fastq'
        else:
            raise Exception('couldn\'t handle file extension for %s' % fname)
        reader = []
        n_fasta_queries = 0
        for seq_record in SeqIO.parse(fname, ftype):
            # if command line specified query or reco ids, skip other ones (can't have/don't allow simulation info in a fast[aq])
            if queries is not None and seq_record.name not in queries:
                continue
            reader.append({})
            reader[-1][name_column] = seq_record.name
            reader[-1][seq_column] = str(seq_record.seq).upper()
            n_fasta_queries += 1
            if n_max_queries > 0 and n_fasta_queries >= n_max_queries:
                break

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    n_queries = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if abbreviate_names:
        potential_names = list(string.ascii_lowercase)
    for line in reader:
        if name_column not in line or seq_column not in line:
            raise Exception('mandatory headers \'%s\' and \'%s\' not both present in %s (you can set column names with --name-column and --seq-column)' % (name_column, seq_column, fname))
        if name_column != internal_name_column or seq_column != internal_seq_column:
            translate_columns(line, {name_column : internal_name_column, seq_column : internal_seq_column})
        utils.process_input_line(line)
        unique_id = line[internal_name_column]
        unique_id = unique_id.replace(':', '_')  # actually deal with colons properly, since they come up very often in sequence ids
        if any(fc in unique_id for fc in utils.forbidden_characters):
            raise Exception('found a forbidden character (one of %s) in sequence id \'%s\' -- sorry, you\'ll have to replace it with something else' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), unique_id))
        if abbreviate_names:
            unique_id = abbreviate(used_names, potential_names, unique_id)

        # if command line specified query or reco ids, skip other ones
        if queries is not None and unique_id not in queries:
            continue
        if reco_ids is not None and line['reco_id'] not in reco_ids:
            continue

        if unique_id in input_info:
            raise Exception('found id %s twice in file %s' % (unique_id, fname))
        if seed_unique_id is not None and unique_id == seed_unique_id:
            found_seed = True
        input_info[unique_id] = {'unique_id' : unique_id, 'seq' : line[internal_seq_column]}
        if n_queries == 0 and is_data and 'v_gene' in line:
            print '  note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % fname

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % fname)
            reco_info[unique_id] = copy.deepcopy(line)
            reco_info[unique_id]['unique_id'] = unique_id  # in case we're abbreviating
            if glfo is not None:
                utils.add_implicit_info(glfo, reco_info[unique_id], multi_seq=False, existing_implicit_keys=('cdr3_length', ))  # single seqs, since each seq is on its own line in the file

        n_queries += 1
        if n_max_queries > 0 and n_queries >= n_max_queries:
            break

    if len(input_info) == 0:
        raise Exception('didn\'t end up pulling any input info out of %s while looking for queries: %s reco_ids: %s\n' % (fname, str(queries), str(reco_ids)))
    if seed_unique_id is not None and not found_seed:
        raise Exception('couldn\'t find seed %s in %s' % (seed_unique_id, fname))

    return (input_info, reco_info)
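# A hedged example call for the get_seqfile_info() version above: read the first ten
# simulated annotations from a csv with true info attached. 'simu.csv' and the germline
# directory are made-up names, and read_glfo()'s argument names vary between partis versions.
glfo = glutils.read_glfo('data/germlines/human', chain='h')  # hypothetical germline dir
input_info, reco_info = get_seqfile_info('simu.csv', is_data=False, glfo=glfo, n_max_queries=10)
for uid in input_info:
    print '%s  %s' % (uid, input_info[uid]['seq'])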
#!/usr/bin/env python
import csv
import sys
partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh')

print 'first parse an annotation csv file:'
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
            continue
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(line)
        break

print 'then parse a partition csv file:'
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)
def _get_clonal_family_stats(path_to_annotations, metadata, use_np=False, use_immunized=False, locus=''):
    '''
    get data statistics from partis annotations

    @param path_to_annotations: path to partis annotations
    @param metadata: path to partis metadata
    @param use_np: use nonproductive seqs?
    @param use_immunized: for Cui data, use immunized mice?
    @param locus: which locus to use

    @return list of dicts with clonal family sizes and naive seqs from processed data
    '''
    partition_info = get_partition_info(
        path_to_annotations,
        metadata,
    )

    if use_np:
        # return only nonproductive sequences
        # here "nonproductive" is defined as having a stop codon or being
        # out of frame or having a mutated conserved cysteine
        # (evaluated element-wise, so we get one flag per sequence)
        good_seq = lambda seqs: [stop or not in_frame or mut_inv
                                 for stop, in_frame, mut_inv in zip(seqs['stops'], seqs['in_frames'], seqs['mutated_invariants'])]
    else:
        # return all sequences
        good_seq = lambda seqs: [True for seq in seqs['seqs']]

    all_germline_dicts = []
    for data_idx, data_info in enumerate(partition_info):
        if use_immunized and data_info['group'] != 'immunized':
            continue
        if locus and data_info['locus'] != locus:  # only filter by locus if one was specified
            continue
        PARTIS_PATH = os.path.dirname(os.path.realpath(__file__)) + '/partis'
        sys.path.insert(1, PARTIS_PATH + '/python')
        from utils import add_implicit_info, process_input_line
        import glutils
        glfo = glutils.read_glfo(data_info['germline_file'], locus=data_info['locus'])
        with open(data_info['annotations_file'], 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            for idx, line in enumerate(reader):
                if len(line['input_seqs']) == 0:  # sometimes data will have empty clusters
                    continue
                # add goodies from partis
                process_input_line(line)
                add_implicit_info(glfo, line)
                good_seq_idx = [i for i, is_good in enumerate(good_seq(line)) if is_good]
                if not good_seq_idx:  # no nonproductive sequences... skip
                    continue
                all_germline_dicts.append({
                    'n_taxa': len(good_seq_idx),
                    'germline_sequence': disambiguate(line['v_gl_seq'].lower()),
                    'germline_name': '-'.join([line['v_gene'], str(idx)]),
                    'v_call': line['v_gene'],
                })

    return all_germline_dicts
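# A sketch (paths are assumptions, not from the source) of calling
# _get_clonal_family_stats() to collect nonproductive clonal family sizes for igk
# from immunized samples.
germline_dicts = _get_clonal_family_stats('annotations/', 'metadata.csv',
                                          use_np=True, use_immunized=True, locus='igk')
print('%d clonal families, sizes %s' % (len(germline_dicts),
                                        sorted(d['n_taxa'] for d in germline_dicts)))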
def get_mature_line(sfos, naive_line, glfo, nodefo, dtree, target_sfos, locus=None):
    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in sfos:
        mline = utils.get_non_implicit_copy(naive_line)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]
        mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        try:
            utils.add_implicit_info(glfo, mline)
        except:  # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah, it was just because something crashed in the middle of writing a .fa file
            # NOTE <ievent> and <outdir> aren't arguments here, so (like <args>) they have to be resolvable in the enclosing script's scope
            print 'implicit info adding failed for ievent %d in %s' % (ievent, outdir)
            lines = traceback.format_exception(*sys.exc_info())
            print utils.pad_lines(''.join(lines))  # NOTE this will still crash on the next line if implicit info adding failed
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in sfos], reco_info)

    ftree = copy.deepcopy(dtree)
    if locus is not None:
        def ltr(u):
            return u + '-' + locus
        new_nodefo = {}
        for u_old in nodefo:
            new_nodefo[ltr(u_old)] = nodefo[u_old]
        nodefo = new_nodefo
        treeutils.translate_labels(ftree, [(u, ltr(u)) for u in final_line['unique_ids']])
        final_line['unique_ids'] = [ltr(u) for u in final_line['unique_ids']]
        assert len(sfos) == len(final_line['unique_ids'])
        for iseq, sfo in enumerate(sfos):
            naive_id = naive_line['unique_ids'][0]
            assert naive_id.count('-') == 1
            bstr = naive_id.replace('-' + locus, '')
            pids = final_line['paired-uids'][iseq]
            assert len(pids) == 1 and pids[0].find(bstr) == 0 and pids[0].count('-') == 1 and pids[0].split('-')[1] in utils.loci  # if uid is xxx-igh, paired id should be e.g. xxx-igk
            final_line['paired-uids'][iseq] = [p.replace(bstr, sfo['name']) for p in pids]

    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
        print '        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids']))
    if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
        print '        in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo))
    final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
    final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
    final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
    final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
    ftree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))
    if args.debug:
        print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ftree), padwidth=12)
    final_line['tree'] = ftree.as_string(schema='newick')
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_sfos]
    return final_line
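# Sketch of a call to get_mature_line(); every input name here is an assumption: <seqfos>
# from the bcr-phylo fasta, <nodefo> from the kd csv, <dtree> from the newick file, and
# <target_seqfos> from the targets fasta would all be read by the surrounding script.
mature_line = get_mature_line(seqfos, naive_line, glfo, nodefo, dtree, target_seqfos, locus='igh')
print 'made event with %d seqs' % len(mature_line['unique_ids'])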
def extract_seqs(fnam, uid2iso):
    '''Reads a partis cluster-annotations file and extracts the relevant information and sequences.'''
    # read cluster annotations into a list of dictionaries:
    with open(fnam) as fh:
        reader = csv.DictReader(fh)
        data = list(reader)

    sequences_i = list()
    info_i = list()
    for row in data:
        fnam_base = fnam.split('_partitions')[0]
        cwd = os.getcwd()
        if 'IgK' in fnam_base:
            locus = 'igk'
        elif 'IgL' in fnam_base:
            locus = 'igl'
        else:
            locus = 'igh'
        # process the partis data row and add germline information:
        try:
            utils.process_input_line(row)
            # read default germline info
            glfo = glutils.read_glfo('{}/_output/{}/hmm/germline-sets'.format(cwd, fnam_base), locus=locus)
            utils.add_implicit_info(glfo, row)
        except Exception as e:  # skip rows that cannot be processed
            print('First skip')
            print(e)
            continue

        uids = [dl + [u] if (len(dl) > 0 and dl[0] != '') else [u] for dl, u in zip(row['duplicates'], row['unique_ids'])]

        # extract the full N-padded naive sequence, and find the V and J gene bounds on this naive sequence:
        cdr3_bounds = (row['codon_positions']['v'], row['codon_positions']['j'] + 3)
        vj_bounds = (row['regional_bounds']['v'][0], row['regional_bounds']['j'][1])
        if row['invalid'] is True or (cdr3_bounds[0] - cdr3_bounds[1]) % 3 != 0:
            print('Invalid clonal family, skipping.')
            continue
        naiveDNA = row['naive_seq']
        if repair_seq(naiveDNA, naiveDNA, vj_bounds, keep_check=True) is False:
            # skip naive sequences that are too short or contain stop codons:
            # print('Third skip')
            if len(row['input_seqs'][:]) > 100:
                print('Bad naive even after 100 seqs in clonal family.')
                repair_seq_debug(naiveDNA, naiveDNA, vj_bounds)
            continue
        trimmed_naiveDNA = repair_seq(naiveDNA[:], naiveDNA[:], vj_bounds)
        naiveAA = str(Seq(trimmed_naiveDNA, generic_dna).translate())

        # there has been a name change, and this try/except is meant to provide backwards compatibility:
        try:
            lseq = row['input_seqs'][:]
        except:
            lseq = row['seqs'][:]
        ir_lseq = row['indel_reversed_seqs']
        stop_seq = row['stops']
        assert(len(lseq) == len(ir_lseq))
        assert(len(lseq) == len(stop_seq))
        # only keep sequences without indels or stop codons, and above a minimum amino acid length:
        ### ir_lseq[i] == '' or lseq[i] == ir_lseq[i]  <-- no indels
        ### stop_seq[i]  <-- no partis-annotated stops (there still seem to be stops after these are removed, though)
        ### repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)  <-- checks whether the sequence is long enough or has stop codons
        keep_idx = [1 if ((ir_lseq[i] == '' or lseq[i] == ir_lseq[i]) and stop_seq[i] is False and repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)) else 0 for i in range(len(lseq))]

        # now only keep those sequences that passed QC:
        lseq = [s for s, keep in zip(lseq, keep_idx) if keep == 1]
        # exclude small clonal families:
        if len(lseq) < MIN_OBS:
            # print(len(lseq))
            # print('Fourth skip')
            continue
        # get amino acid sequences:
        lAAseq = [str(Seq(repair_seq(s[:], naiveDNA[:], vj_bounds), generic_dna).translate()) for s in lseq]
        # mut_freqs = [s for s, keep in zip(row['mut_freqs'], keep_idx) if keep == 1]
        # print(row['n_mutations'].split(':'))
        Nmuts = [int(s) for s, keep in zip(row['n_mutations'].split(':'), keep_idx) if keep == 1]
        abundance = [len(d) for d, keep in zip(uids, keep_idx) if keep == 1]
        uids = [s for s, keep in zip(uids, keep_idx) if keep == 1]
        assert(len(Nmuts) == len(lseq))
        assert(len(abundance) == len(lseq))
        assert(len(uids) == len(lseq))
        # assert(len(mut_freqs) == len(lseq))

        # convert frequency to counts and throw out info for discarded sequences:
        # Nmuts = [int(round(float(t[0])*len(t[1].strip('N')))) for i, t in enumerate(zip(mut_freqs, lseq))]

        # deduplicate AAseqs, and deduplicate lseq according to the AA deduplication:
        '''
        lAAseq_dict = dict()
        lAAseq_sort = dict()
        lseq_dedup = list()
        for i, aa in enumerate(lAAseq):
            if aa in lAAseq_sort:
                lAAseq_sort[aa].append((i, repair_seq(lseq[i][:], naiveDNA[:], vj_bounds), abundance[i]))
            else:
                lAAseq_sort[aa] = [(i, repair_seq(lseq[i][:], naiveDNA[:], vj_bounds), abundance[i])]

        for i, aa in enumerate(lAAseq_sort):
            lAAseq_dict[aa] = [t[0] for t in lAAseq_sort[aa]]
            s = sorted(lAAseq_sort[aa], )
            ab_seq = sorted(lAAseq_sort[aa], key=lambda x: x[2], reverse=True)[0][1]
            lseq_dedup.append(ab_seq)
        assert(len(lAAseq_dict) == len(lseq_dedup))

        # make the deduplicated list, and take the mean of the mutation rates
        # as the mutation rate for the deduplicated sequence:
        lAAseq_dedup = list()
        Nmuts_dedup = list()
        abundance_dedup = list()
        for aa, idxs in lAAseq_dict.items():
            lAAseq_dedup.append(aa)
            Nmut_list = [float(Nmuts[i]) for i in idxs]
            Nmuts_dedup.append(int(round(sum(Nmut_list)/len(Nmut_list))))
            abundance_list = [abundance[i] for i in idxs]
            abundance_dedup.append(sum(abundance_list))
        assert(len(lAAseq_dedup) == len(Nmuts_dedup))
        assert(len(lAAseq_dedup) == len(abundance_dedup))
        assert(len(lAAseq_dedup) == len(lseq_dedup))

        # exclude small clonal families:
        if len(lAAseq_dedup) < MIN_OBS:
            # print(len(lseq))
            # print('Fourth skip')
            continue
        '''

        iso_list = [[uid2iso[u] for u in ul] for ul in uids]
        # store the results in a list:
        sequences_i.append(['naive_seq', naiveAA])  # this format is for ANARCI numbering
        info_i.append({'fnam': fnam, 'v_gene': row['v_gene'], 'd_gene': row['d_gene'], 'j_gene': row['j_gene'],
                       'naive_seq': naiveAA, 'naive_seq_DNA': trimmed_naiveDNA, 'Nmuts': Nmuts[:],
                       'abundance': abundance[:], 'AAseqs': lAAseq[:], 'DNAseqs': lseq[:], 'UID': uids[:],
                       'isotype': iso_list[:], 'CDR3_start': cdr3_bounds[0], 'CDR3_end': cdr3_bounds[1]})
    return(sequences_i, info_i)
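# Hedged usage sketch for extract_seqs(): the annotation file name is made up, and the
# uid-to-isotype map (normally parsed from sequencing metadata) is faked with a default.
import collections
uid2iso = collections.defaultdict(lambda: 'IgM')  # stand-in isotype for every uid
sequences, info = extract_seqs('IgH_run1_partitions_cluster-annotations.csv', uid2iso)
for fam in info:
    print('%s  %d seqs  naive %s...' % (fam['v_gene'], len(fam['DNAseqs']), fam['naive_seq'][:20]))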
def process_cluster(args, cluster_line, seed_id, glfo):
    utils.add_implicit_info(glfo, cluster_line)

    if (seed_id is not None and not args.match_indel_in_uid and not args.ignore_seed_indels):
        check_seed_for_indels(cluster_line, seed_id, args.partition_file)

    # assume we want all seqs in cluster
    iseqs_to_keep = set(range(len(cluster_line["input_seqs"])))

    # write out matching indel-containing seqs for visualization if --show-indel-in-trees
    if args.show_indel_in_trees:
        matching_iseqs = set(match_indel_in_uid_seq(cluster_line, args.show_indel_in_trees))
        match_info = {"indel_match": [iseq in matching_iseqs for iseq in iseqs_to_keep]}

    # various cases where we downsample cluster sequences
    if args.match_indel_in_uid:
        iseqs_to_keep = iseqs_to_keep & set(match_indel_in_uid_seq(cluster_line, args.match_indel_in_uid))
    if args.largest_cluster_across_partitions:
        """
        Deduplicate sequence records.

        When using largest_cluster_across_partitions for seeded clusters, we may
        end up with duplicate sequences in these clusters because of how partis
        partitions seed clusters. If this option is used, beware that this
        deduplication pays no respect to which of two duplicate records with the
        same unique id is preserved.
        """
        iseqs_to_keep = iseqs_to_keep & set({unique_id: iseq for iseq, unique_id in enumerate(cluster_line["unique_ids"])}.values())
    if args.remove_frameshifts or args.remove_stops or args.remove_mutated_invariants:
        iseqs_to_keep = iseqs_to_keep & set(apply_filters(args, cluster_line))

    # apply merging of multiplicity info here (or flesh out with default values otherwise)
    multiplicity_seqmeta = get_multiplicity_seqmeta(cluster_line, args.upstream_seqmeta)

    # apply sequence downsampling here
    cluster_line["unique_seqs_count"] = len(iseqs_to_keep)  # total in cluster output from partis
    always_include = set(args.always_include + [args.inferred_naive_name])
    if args.max_sequences:
        iseqs_to_keep = iseqs_to_keep & set(downsample_iseqs_by_multiplicity(cluster_line, multiplicity_seqmeta, args.max_sequences, always_include))
    cluster_line["sampled_seqs_count"] = len(iseqs_to_keep)

    # filter cluster line to iseqs_to_keep
    utils.restrict_to_iseqs(cluster_line, iseqs_to_keep, glfo)

    # add the additional info computed above for the iseqs we care about
    cluster_line = add_additional_info(cluster_line, multiplicity_seqmeta, iseqs_to_keep)
    if args.show_indel_in_trees:
        cluster_line = add_additional_info(cluster_line, match_info, iseqs_to_keep)

    # total reads accounting for multiplicity (must be calculated after subsetting the cluster in
    # restrict_to_iseqs if it should correspond to the total reads represented by the subset of
    # the cluster returned by restrict_to_iseqs)
    cluster_line["total_read_count"] = sum(cluster_line["multiplicities"])

    # this needs to happen after restrict_to_iseqs re-adds implicit partis linekeys, including 'regional_bounds'
    cluster_line, regional_bounds_keys = add_regional_bounds(cluster_line)

    return merge(
        subset_dict(
            cluster_line,
            regional_bounds_keys
            + [
                "total_read_count",
                "sampled_seqs_count",
                "unique_seqs_count",
                "v_gene",
                "d_gene",
                "j_gene",
                "cdr3_length",
                "naive_seq",
                "v_per_gene_support",
                "d_per_gene_support",
                "j_per_gene_support",
            ],
        ),
        get_cluster_meta_dict(cluster_line, seed_id, args),
    )
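# A rough sketch of feeding process_cluster() one cluster annotation line. The file
# name and germline directory are assumptions, and <args> stands in for the parsed
# command-line namespace that the surrounding script would pass in.
glfo = glutils.read_glfo('germline-sets', locus='igh')  # hypothetical germline dir
with open('cluster-annotations.csv') as fh:
    for cluster_line in csv.DictReader(fh):
        utils.process_input_line(cluster_line)
        meta = process_cluster(args, cluster_line, seed_id=None, glfo=glfo)
        print('%s  %d sampled seqs' % (meta['v_gene'], meta['sampled_seqs_count']))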
def get_seqfile_info(infname, is_data, n_max_queries=-1, args=None, glfo=None, simglfo=None):
    """ return list of sequence info from files of several types """
    if not is_data and glfo is None:
        print '  WARNING glfo is None, so not adding implicit info'

    suffix = os.path.splitext(infname)[1]
    if len(re.findall('\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:
            assert False
        seqfile = opener('r')(infname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        if suffix == '.fasta' or suffix == '.fa':
            ftype = 'fasta'
        elif suffix == '.fastq' or suffix == '.fq':
            ftype = 'fastq'
        else:
            raise Exception('couldn\'t handle file extension for %s' % infname)
        reader = []
        n_fasta_queries = 0
        already_printed_forbidden_character_warning = False
        for seq_record in SeqIO.parse(infname, ftype):
            # if command line specified query or reco ids, skip other ones (can't have/don't allow simulation info in a fast[aq])
            if args is not None and args.queries is not None and seq_record.name not in args.queries:
                continue
            reader.append({})
            uid = seq_record.name
            if any(fc in uid for fc in utils.forbidden_characters):
                if not already_printed_forbidden_character_warning:
                    print '  %s: found a forbidden character (one of %s) in sequence id \'%s\'. This means we\'ll be replacing each of these forbidden characters with a single letter from their name (in this case %s). If this will cause problems you should replace the characters with something else beforehand.' % (utils.color('yellow', 'warning'), ' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid, uid.translate(utils.forbidden_character_translations))
                    already_printed_forbidden_character_warning = True
                uid = uid.translate(utils.forbidden_character_translations)
            reader[-1]['unique_ids'] = uid
            reader[-1]['input_seqs'] = str(seq_record.seq).upper()
            n_fasta_queries += 1
            if n_max_queries > 0 and n_fasta_queries >= n_max_queries:
                break

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if args is not None and args.abbreviate:
        potential_names = list(string.ascii_lowercase)
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print '  %s: couldn\'t find a name (unique id) column, so using the line number as the sequence label (you can set the name column with --name-column)' % (utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seqs' not in line and 'seq' not in line:
            raise Exception('couldn\'t find a sequence column in %s (you can set this with --seq-column)' % infname)
        utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        inseq = line['input_seqs'][0]
        # NOTE I just moved the forbidden-character handling to the .fa loop, since otherwise we have no way of knowing how to interpret special characters -- nevertheless, if someone passes in a csv with special characters as part of a uid, this will break
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     if not already_printed_forbidden_character_warning:
        #         print '  %s: found a forbidden character (one of %s) in sequence id \'%s\'. This means we\'ll be replacing each of these forbidden characters with a single letter from their name (in this case %s). If this will cause problems you should replace the characters with something else beforehand.' % (utils.color('yellow', 'warning'), ' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid, uid.translate(utils.forbidden_character_translations))
        #         already_printed_forbidden_character_warning = True
        #     uid = uid.translate(utils.forbidden_character_translations)
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid = abbreviate(used_names, potential_names, uid)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True
        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' % (uid, infname))
        if len(inseq.translate(None, ''.join(utils.alphabet))) > 0:
            raise Exception('unexpected character (not among %s) in input sequence with id %s:\n  %s' % (utils.nukes + utils.ambiguous_bases, uid, inseq))
        input_info[uid] = {'unique_ids' : [uid, ], 'seqs' : [inseq, ]}
        if n_queries_added == 0 and is_data and 'v_gene' in line:
            print '  note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            break

    if args is not None:
        if args.istartstop is not None:
            n_lines_in_file = iline + 1
            if n_lines_in_file < args.istartstop[1]:
                raise Exception('--istartstop upper bound %d larger than number of lines in file %d' % (args.istartstop[1], n_lines_in_file))
        if len(input_info) == 0:
            if args.queries is not None:
                raise Exception('didn\'t find the specified --queries (%s) in %s' % (str(args.queries), infname))
            if args.reco_ids is not None:
                raise Exception('didn\'t find the specified --reco-ids (%s) in %s' % (str(args.reco_ids), infname))
        if args.queries is not None:
            missing_queries = set(args.queries) - set(input_info)
            extra_queries = set(input_info) - set(args.queries)  # this is just checking for a bug in the code just above here...
            if len(missing_queries) > 0:
                raise Exception('didn\'t find some of the specified --queries: %s' % ' '.join(missing_queries))
            if len(extra_queries) > 0:
                raise Exception('extracted uids %s that weren\'t specified with --queries' % ' '.join(extra_queries))
        if args.seed_unique_id is not None:
            if found_seed:
                if args.seed_seq is not None:  # and input_info[args.seed_unique_id]['seqs'][0] != args.seed_seq:
                    # raise Exception('incompatible --seed-unique-id and --seed-seq (i.e. the sequence in %s corresponding to %s wasn\'t %s)' % (infname, args.seed_unique_id, args.seed_seq))
                    raise Exception('--seed-seq was specified, but --seed-unique-id was also present in input file')
            else:
                if args.seed_seq is None:
                    raise Exception('couldn\'t find seed unique id %s in %s' % (args.seed_unique_id, infname))
                add_seed_seq(args, input_info, reco_info, is_data)
        elif args.seed_seq is not None:
            args.seed_unique_id = 'seed-seq'
            add_seed_seq(args, input_info, reco_info, is_data)
        elif args.random_seed_seq:  # already checked (in bin/partis) that other seed args aren't set
            args.seed_unique_id = random.choice(input_info.keys())
            print '  chose random seed unique id %s' % args.seed_unique_id

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
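# Hedged example for the get_seqfile_info() variant above: read plain data sequences
# from a fasta file with no simulation info. 'input.fa' is a made-up file name, and
# args=None just means no command-line filtering options are applied.
input_info, reco_info = get_seqfile_info('input.fa', is_data=True, n_max_queries=5)
for uid, info in input_info.items():
    print '%s  %s' % (uid, info['seqs'][0])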