Esempio n. 1
0
    def gen_rnd_prod_CDR3(self, conserved_J_residues='FVW'):
        """Generate a productive CDR3 seq from a Monte Carlo draw of the model.

        Recombination events are drawn repeatedly (rejection sampling) until
        the assembled sequence is in frame, contains no stop codon, starts
        with a 'C' and ends with one of the conserved J residues. The method
        only returns once such a sequence has been drawn.

        Parameters
        ----------
        conserved_J_residues : str, optional
            Conserved amino acid residues defining the CDR3 on the J side (normally
            F, V, and/or W)

        Returns
        -------
        ntseq : str
            Productive CDR3 nucleotide sequence
        aaseq : str
            CDR3 amino acid sequence (aaseq = nt2aa(ntseq))
        V_choice : int
            Index of V allele chosen to generate the CDR3 seq
        J_choice : int
            Index of J allele chosen to generate the CDR3 seq

        """

        #Loop until a productive sequence is drawn; the only exit is the
        #return at the bottom. (The previous `while ~coding_pass` relied on
        #bitwise inversion of a bool, which is always truthy and is
        #deprecated in recent Python versions.)
        while True:
            recomb_events = self.choose_random_recomb_events()
            V_seq = self.cutV_genomic_CDR3_segs[recomb_events['V']]

            #This both checks that the position of the conserved C is
            #identified and that the V isn't fully deleted out of the CDR3
            #region
            if len(V_seq) <= max(recomb_events['delV'], 0):
                continue
            J_seq = self.cutJ_genomic_CDR3_segs[recomb_events['J']]

            #We check that J isn't deleted more than allowed. Note the
            #generative model really should reflect this structure already
            if len(J_seq) < recomb_events['delJ']:
                continue

            #Apply the deletions: trim the 3' end of V and the 5' end of J
            V_seq = V_seq[:len(V_seq) - recomb_events['delV']]
            J_seq = J_seq[recomb_events['delJ']:]

            #Out of frame sequences can never be productive -- reject before
            #spending a random draw on the insertion sequence
            if (len(V_seq) + len(J_seq) + recomb_events['insVJ']) % 3 != 0:
                continue

            insVJ_seq = rnd_ins_seq(recomb_events['insVJ'], self.C_Rvj,
                                    self.C_first_nt_bias_insVJ)

            #Translate to amino acid sequence, see if productive
            ntseq = V_seq + insVJ_seq + J_seq
            aaseq = nt2aa(ntseq)

            if '*' not in aaseq and aaseq[0] == 'C' and aaseq[
                    -1] in conserved_J_residues:
                return ntseq, aaseq, recomb_events['V'], recomb_events['J']
Esempio n. 2
0
    def add_generated_seqs(self, num_gen_seqs = 0, reset_gen_seqs = True, custom_model_folder = None, add_error=False,custom_error=None):
        """Generates MonteCarlo sequences for gen_seqs using OLGA.

        Only generates seqs from a V(D)J model. Requires the OLGA package
        (pip install olga).

        Parameters
        ----------
        num_gen_seqs : int or float
            Number of MonteCarlo sequences to generate and add to the specified
            sequence pool. The value is truncated with int().
        reset_gen_seqs : bool
            If True (default), self.gen_seqs is emptied before the newly
            generated sequences are handed to update_model.
        custom_model_folder : str
            Path to a folder specifying a custom IGoR formatted model to be
            used as a generative model. Folder must contain 'model_params.txt'
            and 'model_marginals.txt'. If None, self.custom_pgen_model is used
            when it is set, otherwise the packaged default model for
            self.chain_type.
        add_error : bool
            Simulate sequencing error. Default is False.
        custom_error : float
            Set custom error rate for sequencing error.
            Default is the one inferred by igor (read from the last non-blank
            line of 'model_params.txt').

        Returns
        -------
        None
            Also returns None early, after printing a message, when the
            model parameter or marginal files cannot be found.

        Attributes set
        --------------
        error_rate : float
            Error rate used when add_error is True.
        gen_seqs : list
            MonteCarlo sequences drawn from a VDJ recomb model
        gen_seq_features : list
            Features gen_seqs have been projected onto (presumably refreshed
            by update_model -- not set directly in this method).

        """
        #Locate the generative model folder. An explicit argument wins, then
        #the instance level custom model (if that attribute exists and is
        #set), then the packaged default for this chain type.
        if custom_model_folder is not None:
            main_folder = custom_model_folder
        else:
            main_folder = getattr(self, 'custom_pgen_model', None)
            if main_folder is None:
                main_folder = os.path.join(os.path.dirname(__file__), 'default_models', self.chain_type)

        params_file_name = os.path.join(main_folder,'model_params.txt')
        marginals_file_name = os.path.join(main_folder,'model_marginals.txt')
        V_anchor_pos_file = os.path.join(main_folder,'V_gene_CDR3_anchors.csv')
        J_anchor_pos_file = os.path.join(main_folder,'J_gene_CDR3_anchors.csv')

        if not os.path.isfile(params_file_name) or not os.path.isfile(marginals_file_name):
            print('Cannot find specified custom generative model files: ' + '\n' + params_file_name + '\n' + marginals_file_name)
            print('Exiting sequence generation...')
            return None

        #Imported lazily, only once we know generation will actually run
        from sonia.utils import add_random_error
        from olga.utils import nt2aa

        #Anchor files are optional in the model folder: fall back to the
        #ones shipped with OLGA for this chain type
        olga_default_folder = os.path.join(os.path.dirname(olga_load_model.__file__), 'default_models', self.chain_type)
        if not os.path.isfile(V_anchor_pos_file):
            V_anchor_pos_file = os.path.join(olga_default_folder, 'V_gene_CDR3_anchors.csv')
        if not os.path.isfile(J_anchor_pos_file):
            J_anchor_pos_file = os.path.join(olga_default_folder, 'J_gene_CDR3_anchors.csv')

        #Error rate: the last non-blank line of the params file holds the
        #inferred value. Only parse it when no custom rate overrides it.
        if custom_error is None:
            with open(params_file_name, 'r') as params_file:
                non_blank_lines = [line for line in params_file.read().splitlines() if line.strip()]
            #IndexError here means the params file has no content at all
            self.error_rate = float(non_blank_lines[-1])
        else:
            self.error_rate = custom_error

        #Load generative model -- VJ and VDJ only differ in the OLGA classes
        if self.vj:
            genomic_data = olga_load_model.GenomicDataVJ()
            generative_model = olga_load_model.GenerativeModelVJ()
            seq_gen_class = seq_gen.SequenceGenerationVJ
        else:
            genomic_data = olga_load_model.GenomicDataVDJ()
            generative_model = olga_load_model.GenerativeModelVDJ()
            seq_gen_class = seq_gen.SequenceGenerationVDJ
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
        generative_model.load_and_process_igor_model(marginals_file_name)
        sg_model = seq_gen_class(generative_model, genomic_data)

        #Generate sequences
        print('Generate sequences.')
        #Passing the whole alphabet disables the check on the last residue
        #of the CDR3 done by gen_rnd_prod_CDR3
        any_J_residue = 'ABCEDFGHIJKLMNOPQRSTUVWXYZ'
        #All draws are made before any error is added, so the order of
        #random number consumption matches a generate-then-mutate workflow
        raw_seqs = [sg_model.gen_rnd_prod_CDR3(conserved_J_residues=any_J_residue) for _ in tqdm(range(int(num_gen_seqs)))]

        seqs = []
        for seq in raw_seqs:
            if add_error:
                #Mutate the nucleotide sequence, then re-translate it
                aaseq = nt2aa(add_random_error(seq[0], self.error_rate))
            else:
                aaseq = seq[1]
            #Gene names are stored as 'gene*allele'; keep only the gene part
            seqs.append([aaseq, genomic_data.genV[seq[2]][0].split('*')[0], genomic_data.genJ[seq[3]][0].split('*')[0]])

        if reset_gen_seqs: #reset gen_seqs if needed
            self.gen_seqs = []
        #Add to specified pool(s)
        self.update_model(add_gen_seqs = seqs)
Esempio n. 3
0
def main():
    """Compute Pgens from a file and output to another file."""

    parser = OptionParser(conflict_handler="resolve")

    parser.add_option('--humanTRA', '--human_T_alpha', action='store_true', dest='humanTRA', default=False, help='use default human TRA model (T cell alpha chain)')
    parser.add_option('--humanTRB', '--human_T_beta', action='store_true', dest='humanTRB', default=False, help='use default human TRB model (T cell beta chain)')
    parser.add_option('--mouseTRB', '--mouse_T_beta', action='store_true', dest='mouseTRB', default=False, help='use default mouse TRB model (T cell beta chain)')
    parser.add_option('--humanIGH', '--human_B_heavy', action='store_true', dest='humanIGH', default=False, help='use default human IGH model (B cell heavy chain)')
    parser.add_option('--set_custom_model_VDJ', dest='vdj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VDJ generative model')
    parser.add_option('--set_custom_model_VJ', dest='vj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VJ generative model')

    parser.add_option('-i', '--infile', dest = 'infile_name',metavar='PATH/TO/FILE', help='read in CDR3 sequences (and optionally V/J masks) from PATH/TO/FILE')
    parser.add_option('-o', '--outfile', dest = 'outfile_name', metavar='PATH/TO/FILE', help='write CDR3 sequences and pgens to PATH/TO/FILE')
    parser.add_option('--seq_in', '--seq_index', type='int', metavar='INDEX', dest='seq_in_index', default = 0, help='specifies sequences to be read in are in column INDEX. Default is index 0 (the first column).')

    parser.add_option('--v_in', '--v_mask_index', type='int', metavar='INDEX', dest='V_mask_index', help='specifies V_masks are found in column INDEX in the input file. Default is no V mask.')
    parser.add_option('--j_in', '--j_mask_index', type='int', metavar='INDEX', dest='J_mask_index', help='specifies J_masks are found in column INDEX in the input file. Default is no J mask.')

    parser.add_option('--v_mask', type='string', dest='V_mask', help='specify V usage to condition Pgen on for seqs read in as arguments.')
    parser.add_option('--j_mask', type='string', dest='J_mask', help='specify J usage to condition Pgen on for seqs read in as arguments.')

    parser.add_option('-m', '--max_number_of_seqs', type='int',metavar='N', dest='max_number_of_seqs', help='compute Pgens for at most N sequences.')
    parser.add_option('--lines_to_skip', type='int',metavar='N', dest='lines_to_skip', default = 0, help='skip the first N lines of the file. Default is 0.')
    parser.add_option('-a', '--alphabet_filename', dest='alphabet_filename', metavar='PATH/TO/FILE', help="specify PATH/TO/FILE defining a custom 'amino acid' alphabet. Default is no custom alphabet.")
    parser.add_option('--seq_type_out', type='choice',metavar='SEQ_TYPE', dest='seq_type_out',  choices=['all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'], help="if read in sequences are ntseqs, declare what type of sequence to compute pgen for. Default is all. Choices: 'all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'")
    parser.add_option('--skip_off','--skip_empty_off', action='store_true', dest = 'skip_empty', default=True, help='stop skipping empty or blank sequences/lines (if for example you want to keep line index fidelity between the infile and outfile).')

    parser.add_option('--display_off', action='store_false', dest='display_seqs', default=True, help='turn the sequence display off (only applies in write-to-file mode). Default is on.')
    parser.add_option('--num_lines_for_display', type='int', metavar='N', default = 50, dest='num_lines_for_display', help='N lines of the output file are displayed when sequence display is on. Also used to determine the number of sequences to average over for speed and time estimates.')
    parser.add_option('--time_updates_off', action='store_false', dest='time_updates', default=True, help='turn time updates off (only applies when sequence display is disabled).')
    parser.add_option('--seqs_per_time_update', type='float', metavar='N', default = 100, dest='seqs_per_time_update', help='specify the number of sequences between time updates. Default is 1e5.')

    parser.add_option('-d', '--delimiter', type='choice', dest='delimiter',  choices=['tab', 'space', ',', ';', ':'], help="declare infile delimiter. Default is tab for .tsv input files, comma for .csv files, and any whitespace for all others. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('--raw_delimiter', type='str', dest='delimiter', help="declare infile delimiter as a raw string.")
    parser.add_option('--delimiter_out', type='choice', dest='delimiter_out',  choices=['tab', 'space', ',', ';', ':'], help="declare outfile delimiter. Default is tab for .tsv output files, comma for .csv files, and the infile delimiter for all others. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('--raw_delimiter_out', type='str', dest='delimiter_out', help="declare for the delimiter outfile as a raw string.")
    parser.add_option('--gene_mask_delimiter', type='choice', dest='gene_mask_delimiter',  choices=['tab', 'space', ',', ';', ':'], help="declare gene mask delimiter. Default comma unless infile delimiter is comma, then default is a semicolon. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('--raw_gene_mask_delimiter', type='str', dest='gene_mask_delimiter', help="declare delimiter of gene masks as a raw string.")
    parser.add_option('--comment_delimiter', type='str', dest='comment_delimiter', help="character or string to indicate comment or header lines to skip.")


    (options, args) = parser.parse_args()

    #Check that the model is specified properly
    main_folder = os.path.dirname(__file__)

    default_models = {}
    default_models['humanTRA'] = [os.path.join(main_folder, 'default_models', 'human_T_alpha'),  'VJ']
    default_models['humanTRB'] = [os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ']
    default_models['mouseTRB'] = [os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ']
    default_models['humanIGH'] = [os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ']

    num_models_specified = sum([1 for x in default_models.keys() + ['vj_model_folder', 'vdj_model_folder'] if getattr(options, x)])

    if num_models_specified == 1: #exactly one model specified
        try:
            d_model = [x for x in default_models.keys() if getattr(options, x)][0]
            model_folder = default_models[d_model][0]
            recomb_type = default_models[d_model][1]
        except IndexError:
            if options.vdj_model_folder: #custom VDJ model specified
                model_folder = options.vdj_model_folder
                recomb_type = 'VDJ'
            elif options.vj_model_folder: #custom VJ model specified
                model_folder = options.vj_model_folder
                recomb_type = 'VJ'
    elif num_models_specified == 0:
        print 'Need to indicate generative model.'
        print 'Exiting...'
        return -1
    elif num_models_specified > 1:
        print 'Only specify one model'
        print 'Exiting...'
        return -1

    #Check that all model and genomic files exist in the indicated model folder
    if not os.path.isdir(model_folder):
        print 'Check pathing... cannot find the model folder: ' + model_folder
        print 'Exiting...'
        return -1

    params_file_name = os.path.join(model_folder,'model_params.txt')
    marginals_file_name = os.path.join(model_folder,'model_marginals.txt')
    V_anchor_pos_file = os.path.join(model_folder,'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(model_folder,'J_gene_CDR3_anchors.csv')

    for x in [params_file_name, marginals_file_name, V_anchor_pos_file, J_anchor_pos_file]:
        if not os.path.isfile(x):
            print 'Cannot find: ' + x
            print 'Please check the files (and naming conventions) in the model folder ' + model_folder
            print 'Exiting...'
            return -1

    alphabet_filename = options.alphabet_filename #used if a custom alphabet is to be specified
    if alphabet_filename is not None:
        if not os.path.isfile(alphabet_filename):
            print 'Cannot find custom alphabet file: ' + infile_name
            print 'Exiting...'
            return -1

    #Load up model based on recomb_type
    #VDJ recomb case --- used for TCRB and IGH
    if recomb_type == 'VDJ':
        genomic_data = load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
        generative_model = load_model.GenerativeModelVDJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        pgen_model = generation_probability.GenerationProbabilityVDJ(generative_model, genomic_data, alphabet_filename)
    #VJ recomb case --- used for TCRA and light chain
    elif recomb_type == 'VJ':
        genomic_data = load_model.GenomicDataVJ()
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
        generative_model = load_model.GenerativeModelVJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        pgen_model = generation_probability.GenerationProbabilityVJ(generative_model, genomic_data, alphabet_filename)

    aa_alphabet = ''.join(pgen_model.codons_dict.keys())

    if options.infile_name is not None:
        infile_name = options.infile_name

        if not os.path.isfile(infile_name):
            print 'Cannot find input file: ' + infile_name
            print 'Exiting...'
            return -1

    if options.outfile_name is not None:
        outfile_name = options.outfile_name
        if os.path.isfile(outfile_name):
            if not raw_input(outfile_name + ' already exists. Overwrite (y/n)? ').strip().lower() in ['y', 'yes']:
                print 'Exiting...'
                return -1

    #Parse delimiter
    delimiter = options.delimiter
    if delimiter is None: #Default case
        if options.infile_name is None:
            delimiter = '\t'
        elif infile_name.endswith('.tsv'): #parse TAB separated value file
            delimiter = '\t'
        elif infile_name.endswith('.csv'): #parse COMMA separated value file
            delimiter = ','
    else:
        try:
            delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter]
        except KeyError:
            pass #Other string passed as the delimiter.

    #Parse delimiter_out
    delimiter_out = options.delimiter_out
    if delimiter_out is None: #Default case
        if delimiter is None:
            delimiter_out = '\t'
        else:
            delimiter_out = delimiter
        if options.outfile_name is None:
            pass
        elif outfile_name.endswith('.tsv'): #output TAB separated value file
            delimiter_out = '\t'
        elif outfile_name.endswith('.csv'): #output COMMA separated value file
            delimiter_out = ','
    else:
        try:
            delimiter_out = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter_out]
        except KeyError:
            pass #Other string passed as the delimiter.

    #Parse gene_delimiter
    gene_mask_delimiter = options.gene_mask_delimiter
    if gene_mask_delimiter is None: #Default case
        gene_mask_delimiter = ','
        if delimiter == ',':
            gene_mask_delimiter = ';'
    else:
        try:
            gene_mask_delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[gene_mask_delimiter]
        except KeyError:
            pass #Other string passed as the delimiter.


    #More options
    time_updates = options.time_updates
    display_seqs = options.display_seqs
    num_lines_for_display = options.num_lines_for_display
    seq_in_index = options.seq_in_index #where in the line the sequence is after line.split(delimiter)
    lines_to_skip = options.lines_to_skip #one method of skipping header
    comment_delimiter = options.comment_delimiter #another method of skipping header
    seqs_per_time_update = options.seqs_per_time_update
    max_number_of_seqs = options.max_number_of_seqs
    V_mask_index = options.V_mask_index #Default is not conditioning on V identity
    J_mask_index = options.J_mask_index #Default is not conditioning on J identity
    skip_empty = options.skip_empty

    seq_type_out = options.seq_type_out #type of pgens to be computed. Can be ntseq, aaseq, or both
    if seq_type_out is not None:
        seq_type_out = {'all': None, 'ntseq': 'ntseq', 'nucleotide': 'ntseq', 'aaseq': 'aaseq', 'amino_acid': 'aaseq'}[seq_type_out]

    if options.infile_name is None: #No infile specified -- args should be the input seqs
        print_warnings = True
        seqs = args
        seq_types = [determine_seq_type(seq, aa_alphabet) for seq in seqs]
        unrecognized_seqs = [seq for i, seq in enumerate(seqs) if seq_types[i] is None]
        if len(unrecognized_seqs) > 0 and print_warnings:
            print 'The following sequences/arguments were not recognized: ' + ', '.join(unrecognized_seqs)
        seqs = [seq for i, seq in enumerate(seqs) if seq_types[i] is not None]
        seq_types = [seq_type for seq_type in seq_types if seq_type is not None]


        #Format V and J masks -- uniform for all argument input sequences
        try:
            V_mask = options.V_mask.split(',')
            unrecognized_v_genes = [v for v in V_mask if v not in pgen_model.V_mask_mapping.keys()]
            V_mask = [v for v in V_mask if v in pgen_model.V_mask_mapping.keys()]
            if len(unrecognized_v_genes) > 0:
                print 'These V genes/alleles are not recognized: ' + ', '.join(unrecognized_v_genes)
            if len(V_mask) == 0:
                print 'No recognized V genes/alleles in the provided V_mask. Continuing without conditioning on V usage.'
                V_mask = None
        except AttributeError:
            V_mask = options.V_mask #Default is None, i.e. not conditioning on V identity

        try:
            J_mask = options.J_mask.split(',')
            unrecognized_j_genes = [j for j in J_mask if j not in pgen_model.J_mask_mapping.keys()]
            J_mask = [j for j in J_mask if j in pgen_model.J_mask_mapping.keys()]
            if len(unrecognized_j_genes) > 0:
                print 'These J genes/alleles are not recognized: ' + ', '.join(unrecognized_j_genes)
            if len(J_mask) == 0:
                print 'No recognized J genes/alleles in the provided J_mask. Continuing without conditioning on J usage.'
                J_mask = None
        except AttributeError:
            J_mask = options.J_mask #Default is None, i.e. not conditioning on J identity

        print ''
        start_time = time.time()
        for seq, seq_type in zip(seqs, seq_types):
            if seq_type == 'aaseq':
                c_pgen = pgen_model.compute_aa_CDR3_pgen(seq, V_mask, J_mask, print_warnings)
                print 'Pgen of the amino acid sequence ' + seq + ': ' + str(c_pgen)
                print ''
            elif seq_type == 'regex':
                c_pgen = pgen_model.compute_regex_CDR3_template_pgen(seq, V_mask, J_mask, print_warnings)
                print 'Pgen of the regular expression sequence ' + seq + ': ' + str(c_pgen)
                print ''
            elif seq_type == 'ntseq':
                if seq_type_out is None or seq_type_out == 'ntseq':
                    c_pgen_nt = pgen_model.compute_nt_CDR3_pgen(seq, V_mask, J_mask, print_warnings)
                    print 'Pgen of the nucleotide sequence ' + seq + ': ' + str(c_pgen_nt)
                if seq_type_out is None or seq_type_out == 'aaseq':
                    c_pgen_aa = pgen_model.compute_aa_CDR3_pgen(nt2aa(seq), V_mask, J_mask, print_warnings)
                    print 'Pgen of the amino acid sequence nt2aa(' + seq + ') = ' + nt2aa(seq) + ': ' + str(c_pgen_aa)
                print ''

        c_time = time.time() - start_time
        if c_time > 86400: #more than a day
            c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60)
        elif c_time > 3600: #more than an hr
            c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60)
        elif c_time > 60: #more than a min
            c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60)
        else:
            c_time_str = '%.2f seconds.'%(c_time)

        print 'Completed pgen computation in: ' + c_time_str

    else: #Read sequences in from file
        print_warnings = False #Most cases of reading in from file should have warnings disabled
        seqs = []
        seq_types = []
        V_usage_masks = []
        J_usage_masks = []

        infile = open(infile_name, 'r')

        for i, line in enumerate(infile):
            if comment_delimiter is not None: #Default case -- no comments/header delimiter
                if line.startswith(comment_delimiter): #allow comments
                    continue
            if i < lines_to_skip:
                continue

            if delimiter is None: #Default delimiter is any whitespace
                split_line = line.split()
            else:
                split_line = line.split(delimiter)

            #Find the seq
            try:
                seq = split_line[seq_in_index].strip()
                if len(seq.strip()) == 0:
                    if skip_empty:
                        continue
                    else:
                        seqs.append(seq) #keep the blank seq as a placeholder
                        seq_types.append('aaseq')
                else:
                    seqs.append(seq)
                    seq_types.append(determine_seq_type(seq, aa_alphabet))
            except IndexError: #no index match for seq
                if skip_empty and len(line.strip()) == 0:
                    continue
                print 'seq_in_index is out of range'
                print 'Exiting...'
                infile.close()
                return -1

            #Find and format V_usage_mask
            if V_mask_index is None:
                V_usage_masks.append(None) #default mask
            else:
                try:
                    V_usage_mask = split_line[V_mask_index].strip().split(gene_mask_delimiter)
                    #check that all V gene/allele names are recognized
                    if all([v in pgen_model.V_mask_mapping for v in V_usage_mask]):
                        V_usage_masks.append(V_usage_mask)
                    else:
                        print str(V_usage_mask) + " is not a usable V_usage_mask composed exclusively of recognized V gene/allele names"
                        print 'Unrecognized V gene/allele names: ' + ', '.join([v for v in V_usage_mask if not v in pgen_model.V_mask_mapping.keys()])
                        print 'Exiting...'
                        infile.close()
                        return -1
                except IndexError: #no index match for V_mask_index
                    print 'V_mask_index is out of range'
                    print 'Exiting...'
                    infile.close()
                    return -1

            #Find and format J_usage_mask
            if J_mask_index is None:
                J_usage_masks.append(None) #default mask
            else:
                try:
                    J_usage_mask = split_line[J_mask_index].strip().split(gene_mask_delimiter)
                    #check that all V gene/allele names are recognized
                    if all([j in pgen_model.J_mask_mapping for j in J_usage_mask]):
                        J_usage_masks.append(J_usage_mask)
                    else:
                        print str(J_usage_mask) + " is not a usable J_usage_mask composed exclusively of recognized J gene/allele names"
                        print 'Unrecognized J gene/allele names: ' + ', '.join([j for j in J_usage_mask if not j in pgen_model.J_mask_mapping.keys()])
                        print 'Exiting...'
                        infile.close()
                        return -1
                except IndexError: #no index match for J_mask_index
                    print 'J_mask_index is out of range'
                    print 'Exiting...'
                    infile.close()
                    return -1

            if max_number_of_seqs is not None:
                if len(seqs) >= max_number_of_seqs:
                    break


        unrecognized_seqs = [seq for i, seq in enumerate(seqs) if seq_types[i] is None]
        if len(unrecognized_seqs) > 0 and len(unrecognized_seqs) < len(seqs):
            if print_warnings or options.outfile_name is not None:
                print 'Some strings read in were not parsed as sequences -- they will be omitted.'
                print 'Examples of improperly read strings: '
                for unrecognized_seq in unrecognized_seqs[:10]:
                    print unrecognized_seq
            seqs = [seq for i, seq in enumerate(seqs) if seq_types[i] is not None]
            V_usage_masks = [V_usage_mask for i, V_usage_mask in enumerate(V_usage_masks) if seq_types[i] is not None]
            seq_types = [seq_type for seq_type in seq_types if seq_type is not None]
        elif len(unrecognized_seqs) > 0 and len(unrecognized_seqs) == len(seqs):
            print 'None of the read in strings were parsed as sequences. Check input file.'
            print 'Examples of improperly read strings:'
            for unrecognized_seq in unrecognized_seqs[:10]:
                print unrecognized_seq
            print 'Exiting...'
            return -1

        infile.close()


        if options.outfile_name is not None: #OUTFILE SPECIFIED, allow printed info/display

            print 'Successfully read in and formatted ' + str(len(seqs)) + ' sequences and any V or J usages.'
            if display_seqs:
                sys.stdout.write('\r'+'Continuing to Pgen computation in 3... ')
                sys.stdout.flush()
                time.sleep(0.4)
                sys.stdout.write('\r'+'Continuing to Pgen computation in 2... ')
                sys.stdout.flush()
                time.sleep(0.4)
                sys.stdout.write('\r'+'Continuing to Pgen computation in 1... ')
                sys.stdout.flush()
                time.sleep(0.4)
            else:
                print 'Continuing to Pgen computation.'
                print_warnings = True #Display is off, can print warnings

            if display_seqs:
                lines_for_display = []
                times_for_speed_calc = [time.time()]

            outfile = open(outfile_name, 'w')
            start_time = time.time()
            for i, seq in enumerate(seqs):
                if seq_types[i] == 'aaseq':
                    #Compute Pgen and print out
                    c_pgen_line = seq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(seq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                if seq_types[i] == 'regex':
                    #Compute Pgen and print out
                    c_pgen_line = seq + delimiter_out + str(pgen_model.compute_regex_CDR3_template_pgen(seq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                elif seq_types[i] == 'ntseq':
                    ntseq = seq
                    if len(ntseq) % 3 == 0: #inframe sequence
                        aaseq = nt2aa(ntseq)
                        #Compute Pgen and print out based on recomb_type and seq_type_out
                        if seq_type_out is None:
                            c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + delimiter_out + aaseq + delimiter_out +  str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                        elif seq_type_out == 'ntseq':
                            c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                        elif seq_type_out == 'aaseq':
                            c_pgen_line = aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                    else: #out of frame sequence -- Pgens are 0 and use 'out_of_frame' for aaseq
                        if seq_type_out is None:
                            c_pgen_line = ntseq + delimiter_out + '0' + delimiter_out + 'out_of_frame' + delimiter_out + '0'
                        elif seq_type_out == 'ntseq':
                            c_pgen_line = ntseq + delimiter_out + '0'
                        elif seq_type_out == 'aaseq':
                            c_pgen_line = 'out_of_frame' + delimiter_out + '0'

                outfile.write(c_pgen_line + '\n')

                #Print time update
                if display_seqs:
                    cc_time = time.time()
                    c_time = cc_time - start_time
                    times_for_speed_calc = [cc_time] + times_for_speed_calc[:num_lines_for_display]
                    c_avg_speed = (len(times_for_speed_calc)-1)/float(times_for_speed_calc[0] - times_for_speed_calc[-1])

                    #eta = ((len(seqs) - (i+1))/float(i+1))*c_time

                    eta = (len(seqs) - (i+1))/c_avg_speed

                    lines_for_display = [c_pgen_line] + lines_for_display[:num_lines_for_display]


                    c_time_str = '%s hours, %s minutes, and %s seconds.'%(repr(int(c_time)/3600).rjust(3), repr((int(c_time)/60)%60).rjust(2), repr(int(c_time)%60).rjust(2))
                    eta_str = '%s hours, %s minutes, and %s seconds.'%(repr(int(eta)/3600).rjust(3), repr((int(eta)/60)%60).rjust(2), repr(int(eta)%60).rjust(2))
                    time_str = 'Time to compute Pgen on %s seqs: %s \nEst. time for remaining %s seqs: %s'%(repr(i+1).rjust(9), c_time_str, repr(len(seqs) - (i + 1)).rjust(9), eta_str)
                    speed_str = 'Current Pgen computation speed: %s seqs/min'%(repr(round((len(times_for_speed_calc)-1)*60/float(times_for_speed_calc[0] - times_for_speed_calc[-1]), 2)).rjust(8))
                    display_str = '\n'.join(lines_for_display[::-1]) + '\n' + '-'*80 + '\n' + time_str + '\n' + speed_str + '\n' + '-'*80
                    print '\033[2J' + display_str
                elif (i+1)%seqs_per_time_update == 0 and time_updates:
                    c_time = time.time() - start_time
                    eta = ((len(seqs) - (i+1))/float(i+1))*c_time
                    if c_time > 86400: #more than a day
                        c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60)
                    elif c_time > 3600: #more than an hr
                        c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60)
                    elif c_time > 60: #more than a min
                        c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60)
                    else:
                        c_time_str = '%.2f seconds.'%(c_time)

                    if eta > 86400: #more than a day
                        eta_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(eta)/86400, (int(eta)/3600)%24, (int(eta)/60)%60, eta%60)
                    elif eta > 3600: #more than an hr
                        eta_str = '%d hours, %d minutes, and %.2f seconds.'%((int(eta)/3600)%24, (int(eta)/60)%60, eta%60)
                    elif eta > 60: #more than a min
                        eta_str = '%d minutes and %.2f seconds.'%((int(eta)/60)%60, eta%60)
                    else:
                        eta_str = '%.2f seconds.'%(eta)

                    print 'Pgen computed for %d sequences in: %s Estimated time remaining: %s'%(i+1, c_time_str, eta_str)

            c_time = time.time() - start_time
            if c_time > 86400: #more than a day
                c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60)
            elif c_time > 3600: #more than an hr
                c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60)
            elif c_time > 60: #more than a min
                c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60)
            else:
                c_time_str = '%.2f seconds.'%(c_time)
            print 'Completed Pgen computation for %d sequences: in %s'%(len(seqs), c_time_str)

            outfile.close()

        else: #NO OUTFILE -- print directly to stdout
            start_time = time.time()
            for i, seq in enumerate(seqs):
                if seq_types[i] == 'aaseq':
                    #Compute Pgen and print out
                    c_pgen_line = seq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(seq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                if seq_types[i] == 'regex':
                    #Compute Pgen and print out
                    c_pgen_line = seq + delimiter_out + str(pgen_model.compute_regex_CDR3_template_pgen(seq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                elif seq_types[i] == 'ntseq':
                    ntseq = seq
                    if len(ntseq) % 3 == 0: #inframe sequence
                        aaseq = nt2aa(ntseq)
                        #Compute Pgen and print out based on recomb_type and seq_type_out
                        if seq_type_out is None:
                            c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + delimiter_out + aaseq + delimiter_out +  str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                        elif seq_type_out == 'ntseq':
                            c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                        elif seq_type_out == 'aaseq':
                            c_pgen_line = aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                    else: #out of frame sequence -- Pgens are 0 and use 'out_of_frame' for aaseq
                        if seq_type_out is None:
                            c_pgen_line = ntseq + delimiter_out + '0' + delimiter_out + 'out_of_frame' + delimiter_out + '0'
                        elif seq_type_out == 'ntseq':
                            c_pgen_line = ntseq + delimiter_out + '0'
                        elif seq_type_out == 'aaseq':
                            c_pgen_line = 'out_of_frame' + delimiter_out + '0'

                print c_pgen_line
Esempio n. 4
0
    def gen_rnd_prod_CDR3(
        self,
        V=None,
        J=None,
        conserved_J_residues='FVW',
        max_attempts=20,
    ):
        """Generate a productive CDR3 seq from a Monte Carlo draw of the model.

        Recombination events are drawn up to ``max_attempts`` times. A draw is
        rejected if the V, D or J segment is deleted past its length, if the
        assembled sequence is out of frame, or if its translation contains a
        stop codon, does not start with 'C', or does not end with one of
        ``conserved_J_residues``. The first accepted draw is returned.

        Parameters
        ----------
        V : optional
            If not None, events are drawn with
            ``self.choose_directed_recomb_events(V=V, J=J)``; otherwise
            ``self.choose_random_recomb_events()`` is used.
        J : optional
            Forwarded to ``choose_directed_recomb_events``. Ignored when V is
            None.
        conserved_J_residues : str, optional
            Conserved amino acid residues defining the CDR3 on the J side (normally
            F, V, and/or W)
        max_attempts : int, optional
            Maximum number of recombination draws to try (default 20).

        Returns
        -------
        ntseq : str
            Productive CDR3 nucleotide sequence
        aaseq : str
            CDR3 amino acid sequence (aaseq = nt2aa(ntseq))
        V_choice : int
            Index of V allele chosen to generate the CDR3 seq
        J_choice : int
            Index of J allele chosen to generate the CDR3 seq
        recomb_events : dict
            The full recombination-event draw that produced the sequence.

        If no productive sequence is found within ``max_attempts`` draws, a
        single ``None`` is returned instead of the 5-tuple, so callers must
        check for ``None`` before unpacking.

        """

        for _ in range(max_attempts):
            if V is not None:
                recomb_events = self.choose_directed_recomb_events(V=V, J=J)
            else:
                recomb_events = self.choose_random_recomb_events()

            V_seq = self.cutV_genomic_CDR3_segs[recomb_events['V']]

            #This both checks that the position of the conserved C is
            #identified and that the V isn't fully deleted out of the CDR3
            #region
            if len(V_seq) <= max(recomb_events['delV'], 0):
                continue

            D_seq = self.cutD_genomic_CDR3_segs[recomb_events['D']]
            J_seq = self.cutJ_genomic_CDR3_segs[recomb_events['J']]

            #We check that the D and J aren't deleted more than allowed. Note
            #the generative model really should reflect this structure already
            total_delD = recomb_events['delDl'] + recomb_events['delDr']
            if len(D_seq) < total_delD or len(J_seq) < recomb_events['delJ']:
                continue

            #Trim the germline segments by the drawn deletions
            V_seq = V_seq[:len(V_seq) - recomb_events['delV']]
            D_seq = D_seq[recomb_events['delDl']:len(D_seq) -
                          recomb_events['delDr']]
            J_seq = J_seq[recomb_events['delJ']:]

            #Reject out of frame sequences before generating any insertions
            total_len = (len(V_seq) + len(D_seq) + len(J_seq) +
                         recomb_events['insVD'] + recomb_events['insDJ'])
            if total_len % 3 != 0:
                continue

            insVD_seq = rnd_ins_seq(recomb_events['insVD'], self.C_Rvd,
                                    self.C_first_nt_bias_insVD)
            insDJ_seq = rnd_ins_seq(
                recomb_events['insDJ'], self.C_Rdj,
                self.C_first_nt_bias_insDJ)[::-1]  #have to reverse the DJ seq

            #Translate to amino acid sequence, see if productive
            ntseq = V_seq + insVD_seq + D_seq + insDJ_seq + J_seq
            aaseq = nt2aa(ntseq)

            if ('*' not in aaseq and aaseq[0] == 'C'
                    and aaseq[-1] in conserved_J_residues):
                return (ntseq, aaseq, recomb_events['V'], recomb_events['J'],
                        recomb_events)

        #Every attempt was rejected (e.g. delV consistently reaching the end
        #of the cut V segment), so signal failure to the caller.
        return None