def generate_simulated_beta_seqs(
        params_file_name='tcrdist/default_models/human_T_beta/model_params.txt',
        marginals_file_name='tcrdist/default_models/human_T_beta/model_marginals.txt',
        V_anchor_pos_file='tcrdist/default_models/human_T_beta/V_gene_CDR3_anchors.csv',
        J_anchor_pos_file='tcrdist/default_models/human_T_beta/J_gene_CDR3_anchors.csv',
        output_cols=['cdr3_b_aa', "v_b_gene", 'j_b_gene'],
        n=100000):
    """Simulate ``n`` productive CDR3 sequences with an OLGA VDJ model.

    Parameters
    ----------
    params_file_name : str
        Path to the IGoR ``model_params.txt`` file.
    marginals_file_name : str
        Path to the IGoR ``model_marginals.txt`` file.
    V_anchor_pos_file : str
        Path to the V gene CDR3 anchor CSV.
    J_anchor_pos_file : str
        Path to the J gene CDR3 anchor CSV.
    output_cols : list of str
        Names of the three output columns (sequence, V entry, J entry).
        The default list is only read, never mutated.
    n : int
        Number of sequences to generate.

    Returns
    -------
    pandas.DataFrame
        One row per generated sequence. Each row holds elements 1..3 of the
        tuple returned by ``gen_rnd_prod_CDR3()``, with the V and J indices
        replaced by the first field of the matching ``genV`` / ``genJ`` entry.
    """
    # Genomic reference data (gene segments and anchor positions).
    genomic = load_model.GenomicDataVDJ()
    genomic.load_igor_genomic_data(params_file_name, V_anchor_pos_file,
                                   J_anchor_pos_file)

    # Recombination model and the sequence generator built on top of it.
    recomb_model = load_model.GenerativeModelVDJ()
    recomb_model.load_and_process_igor_model(marginals_file_name)
    generator = seq_gen.SequenceGenerationVDJ(recomb_model, genomic)

    # Index -> first field of each V / J entry.
    attributes = vars(genomic)
    v_label = dict(enumerate(entry[0] for entry in attributes['genV']))
    j_label = dict(enumerate(entry[0] for entry in attributes['genJ']))

    # Draw every sequence first, then translate the gene indices.
    draws = []
    for _ in range(n):
        draws.append(generator.gen_rnd_prod_CDR3()[1:4])

    rows = []
    for cdr3, v_index, j_index in draws:
        rows.append((cdr3, v_label[v_index], j_label[j_index]))

    return pd.DataFrame(rows, columns=output_cols)
def sample_olga(num_gen_seqs=1, chain_index=0, ppost=False, seed=None):
    """Sample CDR3 sequences from a default OLGA model, optionally post-selection.

    Parameters
    ----------
    num_gen_seqs : int
        Number of sequences requested. Capped at 1000.
    chain_index : int
        Index into the module-level ``options_of`` (and, when ``ppost`` is
        true, ``qfiles`` / ``norms``) selecting the chain type.
    ppost : bool
        If false, return sequences straight from the generation model.
        If true, apply rejection sampling with the selection factors computed
        by ``MinimalSonia``.
    seed : int or None
        Seed for NumPy's global random state. ``None`` re-seeds from fresh
        entropy.

    Returns
    -------
    list or numpy.ndarray
        Rows of ``[nt_seq, aa_seq, V_gene, J_gene]`` with the allele suffix
        (text after ``'*'``) stripped from the gene names. A list when
        ``ppost`` is false; a 2-D string array when ``ppost`` is true (an
        empty list if nothing was requested).
    """
    if seed is not None:
        np.random.seed(seed)
    else:
        np.random.seed()

    # Hard cap on the request size.
    num_gen_seqs = int(min(num_gen_seqs, 1000))

    chain_type = options_of[chain_index]
    main_folder = os.path.join(local_directory, 'default_models', chain_type)
    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    # VJ and VDJ chains differ only in which OLGA classes are instantiated.
    if chain_type in vj_chains:
        genomic_cls = olga_load_model.GenomicDataVJ
        model_cls = olga_load_model.GenerativeModelVJ
        generator_cls = seq_gen.SequenceGenerationVJ
    else:
        genomic_cls = olga_load_model.GenomicDataVDJ
        model_cls = olga_load_model.GenerativeModelVDJ
        generator_cls = seq_gen.SequenceGenerationVDJ

    genomic_data = genomic_cls()
    genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file,
                                        J_anchor_pos_file)
    generative_model = model_cls()
    generative_model.load_and_process_igor_model(marginals_file_name)
    sg_model = generator_cls(generative_model, genomic_data)

    def draw(count):
        """Generate ``count`` sequences and resolve gene indices to names."""
        raw = [sg_model.gen_rnd_prod_CDR3() for _ in range(int(count))]
        return [[
            seq[0], seq[1],
            genomic_data.genV[seq[2]][0].split('*')[0],
            genomic_data.genJ[seq[3]][0].split('*')[0]
        ] for seq in raw]

    if not bool(ppost):
        return draw(num_gen_seqs)

    qm = MinimalSonia(qfiles[chain_index], norms[chain_index][1])

    # Accumulate accepted rows until the request is met. Counting only real
    # rows (no placeholder row) avoids returning one sequence too few.
    accepted_batches = []
    num_accepted = 0
    while num_accepted < num_gen_seqs:
        candidates = np.array(draw(11 * num_gen_seqs))
        Qs = qm.compute_sel_factor(list(candidates[:, 1:]))
        random_samples = np.random.uniform(size=len(Qs))
        # Rejection step: accept with probability min(Q, 10) / 10.
        rejection_selection = random_samples < np.clip(Qs, 0, 10) / 10.
        # Acceptance ratio of this batch (kept from the original output).
        print(np.sum(rejection_selection) / float(len(rejection_selection)))
        kept = candidates[rejection_selection]
        accepted_batches.append(kept)
        num_accepted += len(kept)

    if not accepted_batches:
        # Nothing requested (num_gen_seqs <= 0).
        return []
    return np.concatenate(accepted_batches)[:num_gen_seqs]
def __init__(self,
             sonia_model=None,
             custom_olga_model=None,
             custom_genomic_data=None):
    """Bind a Sonia model to an OLGA sequence-generation model.

    Parameters
    ----------
    sonia_model : object
        Sonia model instance. ``None`` or a string is rejected with a
        printed error, leaving the instance uninitialised.
    custom_olga_model : object, optional
        Ready-made OLGA sequence-generation model. When given,
        ``custom_genomic_data`` must be given too.
    custom_genomic_data : object, optional
        Genomic data matching ``custom_olga_model``.

    Attributes set
    --------------
    sonia_model, genomic_data, seq_gen_model, energies_gen, Z
    """
    rejected = sonia_model is None or type(sonia_model) == str
    if rejected:
        print('ERROR: you need to pass a Sonia object')
        return
    self.sonia_model = sonia_model  # sonia model passed as an argument

    # define olga sequence_generation model
    if custom_olga_model is None:
        chain = self.sonia_model.chain_type
        model_dir = os.path.join(os.path.dirname(olga_load_model.__file__),
                                 'default_models', chain)
        params_path = os.path.join(model_dir, 'model_params.txt')
        marginals_path = os.path.join(model_dir, 'model_marginals.txt')
        v_anchor_path = os.path.join(model_dir, 'V_gene_CDR3_anchors.csv')
        j_anchor_path = os.path.join(model_dir, 'J_gene_CDR3_anchors.csv')

        # Only 'human_T_alpha' is treated as a VJ chain here.
        if chain == 'human_T_alpha':
            genomic_cls = olga_load_model.GenomicDataVJ
            model_cls = olga_load_model.GenerativeModelVJ
            generator_cls = seq_gen.SequenceGenerationVJ
        else:
            genomic_cls = olga_load_model.GenomicDataVDJ
            model_cls = olga_load_model.GenerativeModelVDJ
            generator_cls = seq_gen.SequenceGenerationVDJ

        self.genomic_data = genomic_cls()
        self.genomic_data.load_igor_genomic_data(params_path, v_anchor_path,
                                                 j_anchor_path)
        recomb_model = model_cls()
        recomb_model.load_and_process_igor_model(marginals_path)
        self.seq_gen_model = generator_cls(recomb_model, self.genomic_data)
    else:
        if custom_genomic_data is None:
            print('ERROR: you need to pass also the custom_genomic_data')
            return
        self.genomic_data = custom_genomic_data
        self.seq_gen_model = custom_olga_model

    # Z is needed for rejection selection when generating ppost sequences,
    # so it is computed once here.
    energies = self.sonia_model.compute_energy(
        self.sonia_model.gen_seq_features)
    self.energies_gen = energies
    self.Z = np.sum(np.exp(-self.energies_gen)) / len(self.energies_gen)
def generate(self, num_seqs):
    """Generate a given number of CDR3 sequences through OLGA.

    Parameters
    ----------
    num_seqs : int
        An integer specifying the number of sequences to generate.

    Returns
    -------
    pandas.DataFrame
        One row per generated sequence with four columns named by
        ``self.col_names``: ``'NT_COL'`` and ``'AA_COL'`` hold elements 0 and
        1 of the tuple returned by ``gen_rnd_prod_CDR3()``;
        ``'V_GENE_CHOICE_COL'`` and ``'J_GENE_CHOICE_COL'`` hold the first
        field of the ``genV`` / ``genJ`` entry selected by elements 2 and 3.

    Raises
    ------
    TypeError
        When the model type does not equal 'VDJ' or 'VJ'.
    """
    # Pick the OLGA generator class matching the model type.
    model_type = self.igor_model.get_type()
    if model_type == "VDJ":
        generator_cls = olga_seq_gen.SequenceGenerationVDJ
    elif model_type == "VJ":
        generator_cls = olga_seq_gen.SequenceGenerationVJ
    else:
        raise TypeError(
            "OLGA could not create a SequenceGeneration object since model is not of type 'VDJ' or 'VJ'"
        )

    generative_model = self.igor_model.get_generative_model()
    genomic_data = self.igor_model.get_genomic_data()
    seq_gen_model = generator_cls(generative_model, genomic_data)

    columns = [
        self.col_names['NT_COL'],
        self.col_names['AA_COL'],
        self.col_names['V_GENE_CHOICE_COL'],
        self.col_names['J_GENE_CHOICE_COL'],
    ]

    # Collect plain rows and build the frame once: DataFrame.append copied
    # the whole frame on every call and no longer exists in pandas >= 2.0.
    rows = []
    for _ in range(num_seqs):
        generated_seq = seq_gen_model.gen_rnd_prod_CDR3()
        rows.append([
            generated_seq[0],
            generated_seq[1],
            genomic_data.genV[generated_seq[2]][0],
            genomic_data.genJ[generated_seq[3]][0],
        ])

    return pandas.DataFrame(rows, columns=columns)
def define_olga_models(self, olga_model=None):
    """Defines Olga pgen and seqgen models and keeps them as attributes.

    The VDJ flavour of the OLGA classes is always used.

    Parameters
    ----------
    olga_model: string
        Path to a folder specifying a custom IGoR formatted model to be
        used as a generative model. Folder must contain 'model_params.txt',
        'model_marginals.txt', 'V_gene_CDR3_anchors.csv' and
        'J_gene_CDR3_anchors.csv'. A relative path is resolved against the
        current working directory. When None, the default model shipped
        next to ``olga_load_model`` for ``self.chain_type`` is used.

    Attributes set
    --------------
    genomic_data: object
        genomic data associate with the olga model.
    pgen_model: object
        olga model for evaluation of pgen.
    seq_gen_model: object
        olga model for generation of seqs.
    """
    # Locate the model folder. The previous try/except around a discarded
    # os.path.isfile() call never distinguished the two cases, so the
    # relative/absolute decision is made explicitly here.
    if olga_model is None:
        main_folder = os.path.join(os.path.dirname(olga_load_model.__file__),
                                   'default_models', self.chain_type)
    elif os.path.isabs(olga_model):
        main_folder = olga_model
    else:
        main_folder = os.path.join(os.getcwd(), olga_model)

    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    # Load generative model
    genomic_data = olga_load_model.GenomicDataVDJ()
    genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file,
                                        J_anchor_pos_file)
    self.genomic_data = genomic_data

    generative_model = olga_load_model.GenerativeModelVDJ()
    generative_model.load_and_process_igor_model(marginals_file_name)

    self.pgen_model = pgen.GenerationProbabilityVDJ(generative_model,
                                                    genomic_data)
    self.pgen_model.V_mask_mapping = self.complement_V_mask(self.pgen_model)
    self.seq_gen_model = seq_gen.SequenceGenerationVDJ(generative_model,
                                                       genomic_data)
def define_olga_models(self, olga_model=None):
    """Defines Olga pgen and seqgen models and keeps them as attributes.

    The VDJ flavour of the OLGA classes is always used.

    Parameters
    ----------
    olga_model : str, optional
        Folder holding an IGoR formatted model ('model_params.txt',
        'model_marginals.txt', 'V_gene_CDR3_anchors.csv',
        'J_gene_CDR3_anchors.csv'). A relative path is resolved against the
        current working directory. When None, the default model shipped with
        the ``olga`` package for ``self.chain_type`` is used.

    Attributes set
    --------------
    genomic_data, pgen_model, seq_gen_model
    """
    import olga.load_model as load_model
    import olga.generation_probability as pgen
    import olga.sequence_generation as seq_gen

    # Locate the model folder. The previous try/except around a discarded
    # os.path.isfile() call never distinguished the two cases, so the
    # relative/absolute decision is made explicitly here.
    if olga_model is None:
        main_folder = os.path.join(os.path.dirname(load_model.__file__),
                                   'default_models', self.chain_type)
    elif os.path.isabs(olga_model):
        main_folder = olga_model
    else:
        main_folder = os.path.join(os.getcwd(), olga_model)

    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    # Load generative model
    genomic_data = load_model.GenomicDataVDJ()
    genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file,
                                        J_anchor_pos_file)
    self.genomic_data = genomic_data

    generative_model = load_model.GenerativeModelVDJ()
    generative_model.load_and_process_igor_model(marginals_file_name)

    self.pgen_model = pgen.GenerationProbabilityVDJ(generative_model,
                                                    genomic_data)
    self.pgen_model.V_mask_mapping = self.complement_V_mask(self.pgen_model)
    self.seq_gen_model = seq_gen.SequenceGenerationVDJ(generative_model,
                                                       genomic_data)
def _format_duration(seconds):
    """Render a non-negative duration in seconds as progress-message text."""
    whole = int(seconds)
    days = whole // 86400
    hours = (whole // 3600) % 24
    minutes = (whole // 60) % 60
    secs = seconds % 60
    if seconds > 86400:  # more than a day
        return '%d days, %d hours, %d minutes, and %.2f seconds.' % (
            days, hours, minutes, secs)
    if seconds > 3600:  # more than an hr
        return '%d hours, %d minutes, and %.2f seconds.' % (hours, minutes,
                                                            secs)
    if seconds > 60:  # more than a min
        return '%d minutes and %.2f seconds.' % (minutes, secs)
    return '%.2f seconds.' % (seconds)


def main():
    """Generate sequences.

    Command-line entry point: parses the options, loads the selected
    generative model, and writes the requested number of CDR3 sequences to
    the output file (with progress messages on stdout) or, when no output
    file is given, prints them to stdout.

    Returns
    -------
    int or None
        -1 when the invocation is invalid (no/too many models, missing model
        files, refused overwrite, non-positive sequence count); None after a
        successful run.
    """
    parser = OptionParser(conflict_handler="resolve")

    parser.add_option('--humanTRA', '--human_T_alpha', action='store_true',
                      dest='humanTRA', default=False,
                      help='use default human TRA model (T cell alpha chain)')
    parser.add_option('--humanTRB', '--human_T_beta', action='store_true',
                      dest='humanTRB', default=False,
                      help='use default human TRB model (T cell beta chain)')
    parser.add_option('--mouseTRB', '--mouse_T_beta', action='store_true',
                      dest='mouseTRB', default=False,
                      help='use default mouse TRB model (T cell beta chain)')
    parser.add_option('--humanIGH', '--human_B_heavy', action='store_true',
                      dest='humanIGH', default=False,
                      help='use default human IGH model (B cell heavy chain)')
    parser.add_option(
        '--VDJ_model_folder', dest='vdj_model_folder',
        metavar='PATH/TO/FOLDER/',
        help='specify PATH/TO/FOLDER/ for a custom VDJ generative model')
    parser.add_option(
        '--VJ_model_folder', dest='vj_model_folder',
        metavar='PATH/TO/FOLDER/',
        help='specify PATH/TO/FOLDER/ for a custom VJ generative model')
    parser.add_option('-o', '--outfile', dest='outfile_name',
                      metavar='PATH/TO/FILE',
                      help='write CDR3 sequences to PATH/TO/FILE')
    parser.add_option('-n', '--num_seqs', type='float', metavar='N',
                      default=0, dest='num_seqs_to_generate',
                      help='specify the number of sequences to generate.')
    parser.add_option(
        '--seed', type='int', dest='seed',
        help='set seed for pseudorandom number generator. Default is to not set a seed.')
    parser.add_option(
        '--seqs_per_time_update', type='float', default=100000,
        dest='seqs_per_time_update',
        help='specify the number of sequences between time updates. Default is 1e5')
    parser.add_option('--conserved_J_residues', type='string', default='FVW',
                      dest='conserved_J_residues',
                      help="specify conserved J residues. Default is 'FVW'.")
    parser.add_option('--time_updates_off', action='store_false',
                      dest='time_updates', default=True,
                      help='turn time updates off.')
    parser.add_option(
        '--seq_type', type='choice', default='all', dest='seq_type',
        choices=['all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'],
        help="declare sequence type for output sequences. Choices: 'all' [default], 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'")
    parser.add_option('--record_genes_off', action='store_false',
                      dest="record_genes", default=True,
                      help='turn off recording V and J gene info.')
    parser.add_option(
        '-d', '--delimiter', type='choice', dest='delimiter',
        choices=['tab', 'space', ',', ';', ':'],
        help="declare delimiter choice. Default is tab for .tsv output files, comma for .csv files, and tab for all others. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('--raw_delimiter', type='str', dest='delimiter',
                      help="declare delimiter choice as a raw string.")

    options, _ = parser.parse_args()

    # Default models live next to this file; the second entry is the
    # recombination type.
    main_folder = os.path.dirname(__file__)
    default_models = {}
    default_models['humanTRA'] = [
        os.path.join(main_folder, 'default_models', 'human_T_alpha'), 'VJ'
    ]
    default_models['humanTRB'] = [
        os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ'
    ]
    default_models['mouseTRB'] = [
        os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ'
    ]
    default_models['humanIGH'] = [
        os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ'
    ]

    # Exactly one model (default or custom) must be selected.
    chosen_defaults = [
        name for name in default_models if getattr(options, name)
    ]
    num_models_specified = len(chosen_defaults) + sum(
        1 for name in ('vj_model_folder', 'vdj_model_folder')
        if getattr(options, name))

    if num_models_specified == 0:
        print('Need to indicate generative model.')
        print('Exiting...')
        return -1
    if num_models_specified > 1:
        print('Only specify one model')
        print('Exiting...')
        return -1

    if chosen_defaults:
        model_folder, recomb_type = default_models[chosen_defaults[0]]
    elif options.vdj_model_folder:  # custom VDJ model specified
        model_folder = options.vdj_model_folder
        recomb_type = 'VDJ'
    else:  # custom VJ model specified
        model_folder = options.vj_model_folder
        recomb_type = 'VJ'

    # Check that all model and genomic files exist in the indicated model
    # folder
    if not os.path.isdir(model_folder):
        print('Check pathing... cannot find the model folder: ' +
              model_folder)
        print('Exiting...')
        return -1

    params_file_name = os.path.join(model_folder, 'model_params.txt')
    marginals_file_name = os.path.join(model_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(model_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(model_folder, 'J_gene_CDR3_anchors.csv')

    for required in (params_file_name, marginals_file_name,
                     V_anchor_pos_file, J_anchor_pos_file):
        if not os.path.isfile(required):
            print('Cannot find: ' + required)
            print(
                'Please check the files (and naming conventions) in the model folder '
                + model_folder)
            print('Exiting...')
            return -1

    # Ask before clobbering an existing output file.
    outfile_name = options.outfile_name
    if outfile_name is not None and os.path.isfile(outfile_name):
        answer = input(outfile_name + ' already exists. Overwrite (y/n)? ')
        if answer.strip().lower() not in ['y', 'yes']:
            print('Exiting...')
            return -1

    # Parse arguments
    num_seqs_to_generate = int(options.num_seqs_to_generate)
    if num_seqs_to_generate <= 0:
        print('Need to specify num_seqs (number of sequences to generate).')
        print('Exiting...')
        return -1

    # Parse default delimiter
    delimiter = options.delimiter
    if delimiter is None:
        delimiter = '\t'
        if outfile_name is not None:
            if outfile_name.endswith('.tsv'):
                delimiter = '\t'
            elif outfile_name.endswith('.csv'):
                delimiter = ','
    else:
        # Named choices are translated; any other value is a raw delimiter
        # string and is used as given.
        named_delimiters = {
            'tab': '\t',
            'space': ' ',
            ',': ',',
            ';': ';',
            ':': ':'
        }
        delimiter = named_delimiters.get(delimiter, delimiter)

    # Optional flags
    seq_type = {
        'all': 'all',
        'ntseq': 'ntseq',
        'nucleotide': 'ntseq',
        'aaseq': 'aaseq',
        'amino_acid': 'aaseq'
    }[options.seq_type]
    record_genes = options.record_genes
    seqs_per_time_update = int(options.seqs_per_time_update)
    time_updates = options.time_updates
    conserved_J_residues = options.conserved_J_residues

    if options.seed is not None:
        np.random.seed(options.seed)

    # VDJ recomb case --- used for TCRB and IGH
    if recomb_type == 'VDJ':
        genomic_data = load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)
        generative_model = load_model.GenerativeModelVDJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        seq_gen = sequence_generation.SequenceGenerationVDJ(
            generative_model, genomic_data)
    # VJ recomb case --- used for TCRA and light chain
    elif recomb_type == 'VJ':
        genomic_data = load_model.GenomicDataVJ()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)
        generative_model = load_model.GenerativeModelVJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        seq_gen = sequence_generation.SequenceGenerationVJ(
            generative_model, genomic_data)

    # Gene names with the allele suffix (text after '*') removed.
    V_gene_names = [V[0].split('*')[0] for V in genomic_data.genV]
    J_gene_names = [J[0].split('*')[0] for J in genomic_data.genJ]

    def next_line():
        """Generate one sequence and format it as an output line."""
        ntseq, aaseq, V_in, J_in = seq_gen.gen_rnd_prod_CDR3(
            conserved_J_residues)
        if seq_type == 'all':  # default, include both ntseq and aaseq
            line = ntseq + delimiter + aaseq
        elif seq_type == 'ntseq':  # only record ntseq
            line = ntseq
        else:  # only record aaseq
            line = aaseq
        if record_genes:
            line += (delimiter + V_gene_names[V_in] + delimiter +
                     J_gene_names[J_in])
        return line

    if outfile_name is None:
        # print to stdout; no progress messages in this mode
        for _ in range(num_seqs_to_generate):
            print(next_line())
        return None

    # The context manager closes the file even if generation fails midway.
    with open(outfile_name, 'w') as outfile:
        print('Starting sequence generation... ')
        start_time = time.time()
        for i in range(num_seqs_to_generate):
            outfile.write(next_line() + '\n')

            # Periodic progress report; a non-positive interval disables it
            # instead of raising ZeroDivisionError.
            if (time_updates and seqs_per_time_update > 0
                    and (i + 1) % seqs_per_time_update == 0):
                c_time = time.time() - start_time
                eta = ((num_seqs_to_generate -
                        (i + 1)) / float(i + 1)) * c_time
                print(
                    '%d sequences generated in %s Estimated time remaining: %s'
                    % (i + 1, _format_duration(c_time),
                       _format_duration(eta)))

        c_time = time.time() - start_time
        print('Completed generating all %d sequences in %s' %
              (num_seqs_to_generate, _format_duration(c_time)))
    return None
def __init__(self,
             sonia_model=None,
             custom_olga_model=None,
             custom_genomic_data=None):
    """Bind a Sonia model to an OLGA sequence-generation model.

    Invalid arguments are reported with a printed error and the constructor
    returns early, leaving the instance only partially initialised.

    Parameters
    ----------
    sonia_model : object
        Sonia model instance (not a string, not None).
    custom_olga_model : object, optional
        Ready-made OLGA sequence-generation object (not a string). When
        given, ``custom_genomic_data`` is required as well.
    custom_genomic_data : object, optional
        OLGA genomic-data object (not a string) matching
        ``custom_olga_model``.

    Attributes set
    --------------
    sonia_model, genomic_data, seq_gen_model, and -- only when the model is
    loaded from disk -- generative_model.
    """
    if sonia_model is None or isinstance(sonia_model, str):
        print('ERROR: you need to pass a Sonia object')
        return
    self.sonia_model = sonia_model  # sonia model passed as an argument

    # Caller supplied the OLGA objects directly.
    if custom_olga_model is not None:
        if isinstance(custom_olga_model, str):
            print(
                'ERROR: you need to pass a olga object for the seq_gen model'
            )
            return
        if custom_genomic_data is None:
            print('ERROR: you need to pass also the custom_genomic_data')
            return
        if isinstance(custom_genomic_data, str):
            print(
                'ERROR: you need to pass a olga object for the genomic_data'
            )
            return
        self.genomic_data = custom_genomic_data
        self.seq_gen_model = custom_olga_model
        return

    # Otherwise load the model from disk: the Sonia model's custom folder if
    # it has one, else the default model for its chain type. getattr covers
    # Sonia objects without a custom_pgen_model attribute, without the
    # catch-all `except:` that used to hide unrelated errors.
    custom_folder = getattr(self.sonia_model, 'custom_pgen_model', None)
    if custom_folder is None:
        main_folder = os.path.join(os.path.dirname(__file__),
                                   'default_models',
                                   self.sonia_model.chain_type)
    else:
        main_folder = custom_folder

    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    # VJ and VDJ models differ only in which OLGA classes are used.
    if self.sonia_model.vj:
        genomic_cls = olga_load_model.GenomicDataVJ
        model_cls = olga_load_model.GenerativeModelVJ
        generator_cls = seq_gen.SequenceGenerationVJ
    else:
        genomic_cls = olga_load_model.GenomicDataVDJ
        model_cls = olga_load_model.GenerativeModelVDJ
        generator_cls = seq_gen.SequenceGenerationVDJ

    self.genomic_data = genomic_cls()
    self.genomic_data.load_igor_genomic_data(params_file_name,
                                             V_anchor_pos_file,
                                             J_anchor_pos_file)
    self.generative_model = model_cls()
    self.generative_model.load_and_process_igor_model(marginals_file_name)
    self.seq_gen_model = generator_cls(self.generative_model,
                                       self.genomic_data)
def main():
    """Generate sequences.

    Command-line entry point: parses the options, loads the selected OLGA
    generative model and a Sonia model, then samples sequences in chunks from
    the pre-selection (``--pre``) or post-selection (``--post``) ensemble.
    Sequences go to the output file (joined with the output delimiter) or,
    when no output file is given, to stdout separated by spaces.

    Returns
    -------
    int or None
        -1 when the invocation is invalid (no/too many models, missing model
        files, neither --pre nor --post); None after a successful run.
    """
    parser = OptionParser(conflict_handler="resolve")

    # specify model
    parser.add_option('--humanTRA', '--human_T_alpha', action='store_true',
                      dest='humanTRA', default=False,
                      help='use default human TRA model (T cell alpha chain)')
    parser.add_option('--humanTRB', '--human_T_beta', action='store_true',
                      dest='humanTRB', default=False,
                      help='use default human TRB model (T cell beta chain)')
    parser.add_option('--mouseTRB', '--mouse_T_beta', action='store_true',
                      dest='mouseTRB', default=False,
                      help='use default mouse TRB model (T cell beta chain)')
    parser.add_option('--humanIGH', '--human_B_heavy', action='store_true',
                      dest='humanIGH', default=False,
                      help='use default human IGH model (B cell heavy chain)')
    parser.add_option('--humanIGK', '--human_B_kappa', action='store_true',
                      dest='humanIGK', default=False,
                      help='use default human IGK model (B cell light kappa chain)')
    parser.add_option('--humanIGL', '--human_B_lambda', action='store_true',
                      dest='humanIGL', default=False,
                      help='use default human IGL model (B cell light lambda chain)')
    parser.add_option('--mouseTRA', '--mouse_T_alpha', action='store_true',
                      dest='mouseTRA', default=False,
                      help='use default mouse TRA model (T cell alpha chain)')
    parser.add_option('--set_custom_model_VDJ', dest='vdj_model_folder',
                      metavar='PATH/TO/FOLDER/',
                      help='specify PATH/TO/FOLDER/ for a custom VDJ generative model')
    parser.add_option('--set_custom_model_VJ', dest='vj_model_folder',
                      metavar='PATH/TO/FOLDER/',
                      help='specify PATH/TO/FOLDER/ for a custom VJ generative model')
    # NOTE(review): model_type is parsed but not read anywhere in this
    # function.
    parser.add_option('--sonia_model', type='string', default='leftright',
                      dest='model_type',
                      help=' specify model type: leftright or lengthpos, default is leftright')
    parser.add_option('--post', '--ppost', action='store_true', dest='ppost',
                      default=False,
                      help='sample from post selected repertoire')
    parser.add_option('--pre', '--pgen', action='store_true', dest='pgen',
                      default=False,
                      help='sample from pre selected repertoire ')
    parser.add_option('--delimiter_out', '-d', type='choice',
                      dest='delimiter_out',
                      choices=['tab', 'space', ',', ';', ':'],
                      help="declare outfile delimiter. Default is tab for .tsv output files, comma for .csv files, and the infile delimiter for all others. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('-s', '--chunk_size', type='int', metavar='N',
                      dest='chunck_size', default=int(1e3),
                      help='Number of sequences to generate at each iteration')
    parser.add_option('-r', '--rejection_bound', type='int', metavar='N',
                      dest='rejection_bound', default=10,
                      help='limit above which sequences are always accepted.')

    # input output
    parser.add_option('-o', '--outfile', dest='outfile_name',
                      metavar='PATH/TO/FILE',
                      help='write CDR3 sequences to PATH/TO/FILE')
    parser.add_option('-n', '--N', type='int', metavar='N',
                      dest='num_seqs_to_generate', default=1,
                      help='Number of sequences to sample from.')

    options, _ = parser.parse_args()

    # Check that the model is specified properly
    main_folder = os.path.dirname(__file__)
    default_models = {}
    default_models['humanTRA'] = [os.path.join(main_folder, 'default_models', 'human_T_alpha'), 'VJ']
    default_models['humanTRB'] = [os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ']
    default_models['mouseTRB'] = [os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ']
    default_models['humanIGH'] = [os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ']
    default_models['humanIGK'] = [os.path.join(main_folder, 'default_models', 'human_B_kappa'), 'VJ']
    default_models['humanIGL'] = [os.path.join(main_folder, 'default_models', 'human_B_lambda'), 'VJ']
    default_models['mouseTRA'] = [os.path.join(main_folder, 'default_models', 'mouse_T_alpha'), 'VJ']

    # Exactly one model (default or custom) must be selected.
    chosen_defaults = [
        name for name in default_models if getattr(options, name)
    ]
    num_models_specified = len(chosen_defaults) + sum(
        1 for name in ('vj_model_folder', 'vdj_model_folder')
        if getattr(options, name))

    if num_models_specified == 0:
        print('Need to indicate generative model.')
        print('Exiting...')
        return -1
    if num_models_specified > 1:
        print('Only specify one model')
        print('Exiting...')
        return -1

    if chosen_defaults:
        model_folder, recomb_type = default_models[chosen_defaults[0]]
    elif options.vdj_model_folder:  # custom VDJ model specified
        model_folder = options.vdj_model_folder
        recomb_type = 'VDJ'
    else:  # custom VJ model specified
        model_folder = options.vj_model_folder
        recomb_type = 'VJ'

    # Parse delimiter_out
    delimiter_out = options.delimiter_out
    if delimiter_out is None:  # Default case
        delimiter_out = '\t'
        if options.outfile_name is None:
            pass
        elif options.outfile_name.endswith('.tsv'):  # TAB separated values
            delimiter_out = '\t'
        elif options.outfile_name.endswith('.csv'):  # COMMA separated values
            delimiter_out = ','
    else:
        # Named choices are translated; anything else is used as given.
        named_delimiters = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}
        delimiter_out = named_delimiters.get(delimiter_out, delimiter_out)

    # Generative model specification -- note we'll probably change this
    # syntax to allow for arbitrary model file specification
    params_file_name = os.path.join(model_folder, 'model_params.txt')
    marginals_file_name = os.path.join(model_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(model_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(model_folder, 'J_gene_CDR3_anchors.csv')

    for required in (params_file_name, marginals_file_name,
                     V_anchor_pos_file, J_anchor_pos_file):
        if not os.path.isfile(required):
            print('Cannot find: ' + required)
            print('Please check the files (and naming conventions) in the model folder ' + model_folder)
            print('Exiting...')
            return -1

    # Validate the sampling mode before loading any model or touching the
    # output file. Previously this was only detected inside the generation
    # loop, after the output file had already been opened and truncated.
    if not (options.pgen or options.ppost):
        print('ERROR: give option between --pre or --post')
        return -1

    # Load up model based on recomb_type
    # VDJ recomb case --- used for TCRB and IGH
    if recomb_type == 'VDJ':
        genomic_data = olga_load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
        generative_model = olga_load_model.GenerativeModelVDJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        seqgen_model = sequence_generation.SequenceGenerationVDJ(generative_model, genomic_data)
    # VJ recomb case --- used for TCRA and light chain
    elif recomb_type == 'VJ':
        genomic_data = olga_load_model.GenomicDataVJ()
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
        generative_model = olga_load_model.GenerativeModelVJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        seqgen_model = sequence_generation.SequenceGenerationVJ(generative_model, genomic_data)

    # Pre-selection sampling uses a default-constructed Sonia model; otherwise
    # the model is built from the feature and log files in the model folder.
    if options.pgen:
        sonia_model = SoniaLeftposRightpos()
    else:
        sonia_model = SoniaLeftposRightpos(
            feature_file=os.path.join(model_folder, 'features.tsv'),
            log_file=os.path.join(model_folder, 'log.txt'),
            vj=recomb_type == 'VJ')

    # load Evaluate model class
    sampler = SequenceGeneration(sonia_model,
                                 custom_olga_model=seqgen_model,
                                 custom_genomic_data=genomic_data)

    def sample_chunk(size):
        """Draw ``size`` sequences; --pre takes precedence over --post."""
        if options.pgen:
            return sampler.generate_sequences_pre(num_seqs=size, nucleotide=True)
        return sampler.generate_sequences_post(
            num_seqs=size, nucleotide=True,
            upper_bound=options.rejection_bound)

    to_generate = chuncks(options.num_seqs_to_generate, options.chunck_size)

    if options.outfile_name is not None:  # OUTFILE SPECIFIED
        with open(options.outfile_name, 'w') as outfile:
            for chunk_size in tqdm(to_generate):
                for seq in sample_chunk(chunk_size):
                    outfile.write(seq[0] + delimiter_out + seq[1] +
                                  delimiter_out + seq[2] + delimiter_out +
                                  seq[3] + '\n')
    else:  # print to stdout
        for chunk_size in to_generate:
            for seq in sample_chunk(chunk_size):
                print(seq[0], seq[1], seq[2], seq[3])
    return None
def add_generated_seqs(self,
                       num_gen_seqs=0,
                       reset_gen_seqs=True,
                       custom_model_folder=None):
    """Generates MonteCarlo sequences for gen_seqs using OLGA.

    Only generates seqs from a V(D)J model. Requires the OLGA package
    (pip install olga).

    Parameters
    ----------
    num_gen_seqs : int or float
        Number of MonteCarlo sequences to generate and add to the specified
        sequence pool.
    reset_gen_seqs : bool
        If true, ``self.gen_seqs`` is emptied before the new sequences are
        added.
    custom_model_folder : str
        Path to a folder specifying a custom IGoR formatted model to be
        used as a generative model. Folder must contain 'model_params.txt'
        and 'model_marginals.txt'. Missing anchor files fall back to the
        default ones for ``self.chain_type``.

    Returns
    -------
    None
        Also returned early, after printing a message, when the params or
        marginals file cannot be found.

    Attributes set
    --------------
    gen_seqs : list
        MonteCarlo sequences drawn from a VDJ recomb model
    gen_seq_features : list
        Features gen_seqs have been projected onto.
    """
    # Load generative model
    if custom_model_folder is None:
        main_folder = os.path.join(
            os.path.dirname(olga_load_model.__file__), 'default_models',
            self.chain_type)
    else:
        main_folder = custom_model_folder

    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    if not os.path.isfile(params_file_name) or not os.path.isfile(
            marginals_file_name):
        # Single-argument print(...) is valid on both Python 2 and 3; the
        # former print statements were a SyntaxError on Python 3.
        print('Cannot find specified custom generative model files: ' +
              '\n' + params_file_name + '\n' + marginals_file_name)
        print('Exiting sequence generation...')
        return None

    # Fall back to the default anchor files when the folder lacks them.
    if not os.path.isfile(V_anchor_pos_file):
        V_anchor_pos_file = os.path.join(
            os.path.dirname(olga_load_model.__file__), 'default_models',
            self.chain_type, 'V_gene_CDR3_anchors.csv')
    if not os.path.isfile(J_anchor_pos_file):
        J_anchor_pos_file = os.path.join(
            os.path.dirname(olga_load_model.__file__), 'default_models',
            self.chain_type, 'J_gene_CDR3_anchors.csv')

    # Chain types ending in 'TRA' use the VJ classes, all others VDJ.
    if self.chain_type.endswith('TRA'):
        genomic_cls = olga_load_model.GenomicDataVJ
        model_cls = olga_load_model.GenerativeModelVJ
        generator_cls = seq_gen.SequenceGenerationVJ
    else:
        genomic_cls = olga_load_model.GenomicDataVDJ
        model_cls = olga_load_model.GenerativeModelVDJ
        generator_cls = seq_gen.SequenceGenerationVDJ

    genomic_data = genomic_cls()
    genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file,
                                        J_anchor_pos_file)
    generative_model = model_cls()
    generative_model.load_and_process_igor_model(marginals_file_name)
    sg_model = generator_cls(generative_model, genomic_data)

    # Generate sequences: keep element 1 of each generated tuple plus the
    # V/J names with the allele suffix (text after '*') removed.
    generated = [
        sg_model.gen_rnd_prod_CDR3() for _ in range(int(num_gen_seqs))
    ]
    seqs = [[
        seq[1],
        genomic_data.genV[seq[2]][0].split('*')[0],
        genomic_data.genJ[seq[3]][0].split('*')[0]
    ] for seq in generated]

    if reset_gen_seqs:  # reset gen_seqs if needed
        self.gen_seqs = []

    # Add to specified pool(s)
    self.update_model(add_gen_seqs=seqs)
def add_generated_seqs(self, num_gen_seqs = 0, reset_gen_seqs = True, custom_model_folder = None, add_error=False,custom_error=None):
    """Generates MonteCarlo sequences for gen_seqs using OLGA.

    Only generates seqs from a V(D)J model. Requires the OLGA package
    (pip install olga).

    Parameters
    ----------
    num_gen_seqs : int or float
        Number of MonteCarlo sequences to generate and add to the specified
        sequence pool. The value is truncated with int() before use.
    reset_gen_seqs : bool
        If True (default), self.gen_seqs is emptied before the newly
        generated sequences are handed to self.update_model.
    custom_model_folder : str
        Path to a folder specifying a custom IGoR formatted model to be
        used as a generative model. Folder must contain 'model_params.txt'
        and 'model_marginals.txt'. When None, self.custom_pgen_model is
        used if it is set, otherwise the 'default_models' folder next to
        this module for self.chain_type.
    add_error : bool
        Simulate sequencing error: default is False.
    custom_error : float
        Set custom error rate for sequencing error. Default (None) is the
        value read from the last non-empty line of 'model_params.txt'.

    Returns
    -------
    None
        Also returned early (after printing a message) when the params or
        marginals file cannot be found.

    Raises
    ------
    ValueError
        If custom_error is None and no error rate can be read from
        'model_params.txt' (no non-empty line, or last one is not a number).

    Attributes set
    --------------
    error_rate : float
        Error rate used when add_error is True.
    gen_seqs : list
        MonteCarlo sequences drawn from a VDJ recomb model
    gen_seq_features : list
        Features gen_seqs have been projected onto.

    """
    #Load generative model
    if custom_model_folder is None:
        # getattr replaces a bare except: only a missing attribute should
        # trigger the fall back to the packaged default model.
        custom_pgen_model = getattr(self, 'custom_pgen_model', None)
        if custom_pgen_model is None:
            main_folder = os.path.join(os.path.dirname(__file__), 'default_models', self.chain_type)
        else:
            main_folder = custom_pgen_model
    else:
        main_folder = custom_model_folder

    params_file_name = os.path.join(main_folder,'model_params.txt')
    marginals_file_name = os.path.join(main_folder,'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder,'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder,'J_gene_CDR3_anchors.csv')

    if not os.path.isfile(params_file_name) or not os.path.isfile(marginals_file_name):
        print('Cannot find specified custom generative model files: ' + '\n' + params_file_name + '\n' + marginals_file_name)
        print('Exiting sequence generation...')
        return None

    # Anchor files are optional in the model folder: fall back to the ones
    # shipped with OLGA's default model for this chain type.
    if not os.path.isfile(V_anchor_pos_file):
        V_anchor_pos_file = os.path.join(os.path.dirname(olga_load_model.__file__), 'default_models', self.chain_type, 'V_gene_CDR3_anchors.csv')
    if not os.path.isfile(J_anchor_pos_file):
        J_anchor_pos_file = os.path.join(os.path.dirname(olga_load_model.__file__), 'default_models', self.chain_type, 'J_gene_CDR3_anchors.csv')

    if custom_error is None:
        # The error rate is taken from the last non-empty line of the
        # params file; trailing blank lines are skipped.
        with open(params_file_name,'r') as params_file:
            non_empty_lines = [line for line in params_file.read().splitlines() if line.strip()]
        if not non_empty_lines:
            raise ValueError('Cannot read error rate from empty params file: ' + params_file_name)
        self.error_rate = float(non_empty_lines[-1])
    else:
        self.error_rate = custom_error

    if self.vj:
        genomic_data_cls = olga_load_model.GenomicDataVJ
        generative_model_cls = olga_load_model.GenerativeModelVJ
        sequence_generation_cls = seq_gen.SequenceGenerationVJ
    else:
        genomic_data_cls = olga_load_model.GenomicDataVDJ
        generative_model_cls = olga_load_model.GenerativeModelVDJ
        sequence_generation_cls = seq_gen.SequenceGenerationVDJ

    genomic_data = genomic_data_cls()
    genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
    generative_model = generative_model_cls()
    generative_model.load_and_process_igor_model(marginals_file_name)
    sg_model = sequence_generation_cls(generative_model, genomic_data)

    #Generate sequences
    print('Generate sequences.')
    # All sequences are generated before any error is applied, so the order
    # of random draws matches the original implementation.
    generated = [sg_model.gen_rnd_prod_CDR3(conserved_J_residues='ABCEDFGHIJKLMNOPQRSTUVWXYZ') for _ in tqdm(range(int(num_gen_seqs)))]

    if add_error:
        # Only needed for error simulation, so imported lazily.
        from sonia.utils import add_random_error
        from olga.utils import nt2aa
        # seq[0] is passed through add_random_error and then nt2aa to
        # obtain the (possibly mutated) CDR3 stored in the pool.
        seqs = [[nt2aa(add_random_error(seq[0],self.error_rate)),
                 genomic_data.genV[seq[2]][0].split('*')[0],
                 genomic_data.genJ[seq[3]][0].split('*')[0]]
                for seq in generated]
    else:
        # Gene names are stored without their '*allele' suffix.
        seqs = [[seq[1],
                 genomic_data.genV[seq[2]][0].split('*')[0],
                 genomic_data.genJ[seq[3]][0].split('*')[0]]
                for seq in generated]

    if reset_gen_seqs: #reset gen_seqs if needed
        self.gen_seqs = []

    #Add to specified pool(s)
    self.update_model(add_gen_seqs = seqs)
J_anchor_pos_file) #Load model generative_model = load_model.GenerativeModelVDJ() generative_model.load_and_process_igor_model(marginals_file_name) #Process model/data for pgen computation by instantiating GenerationProbabilityVDJ pgen_model = pgen.GenerationProbabilityVDJ(generative_model, genomic_data) #example #calculating pgen with restriction to V, J gene usage pgen_model.compute_aa_CDR3_pgen('CAWSVAPDRGGYTF', 'TRBV30*01', 'TRBJ1-2*01') #calculating pgen without restriction to V, J gene usage pgen_model.compute_aa_CDR3_pgen('CAWSVAPDRGGYTF') #Process model/data for sequence generation by instantiating SequenceGenerationVDJ seq_gen_model = seq_gen.SequenceGenerationVDJ(generative_model, genomic_data) #Generate some random sequences seq_gen_model.gen_rnd_prod_CDR3() #('TGTGCCAGCAGTGAAAAAAGGCAATGGGAAAGCGGGGAGCTGTTTTTT', 'CASSEKRQWESGELFF', 27, 8) seq_gen_model.gen_rnd_prod_CDR3() #('TGTGCCAGCAGTTTAGTGGGAAGGGCGGGGCCCTATGGCTACACCTTC', 'CASSLVGRAGPYGYTF', 14, 1) seq_gen_model.gen_rnd_prod_CDR3() #('TGTGCCAGCTGGACAGGGGGCAACTACGAGCAGTACTTC', 'CASWTGGNYEQYF', 55, 13) #%% #genero 5000 secuencias y las guardo path_redes = '/home/heli/Documents/Redes/TPfinal/' rnd_seq = []