예제 #1
0
    def generate_multi_mut_peptide(self):
        """Fill ``self.mut_pept`` with every multi-variant version of the peptide.

        Every non-empty subset of the keys of ``self.peptVarDict`` is
        considered; for each subset one candidate is taken per key (Cartesian
        product of the per-key candidate lists).  Each resulting variant set
        is ordered by ascending ``pos``, rebuilt as ``variant`` objects from
        the output of ``variant.shift_var(var1, self.start_pos)`` and passed
        to ``self.multi_mutate``.

        The result is stored under the ``var1`` labels of the set joined
        with ``'_'``.  Returns ``None``.
        """
        site_count = len(self.peptVarDict)

        # Enumerate subsets of size 1 .. site_count, then expand each subset
        # into every way of choosing one candidate per selected key.
        variant_sets = []
        for subset_size in range(1, site_count + 1):
            key_subsets = itertools.combinations(self.peptVarDict.keys(),
                                                 subset_size)
            for key_subset in key_subsets:
                per_key_choices = [self.peptVarDict[k] for k in key_subset]
                variant_sets.extend(itertools.product(*per_key_choices))

        self.mut_pept = {}
        for variant_set in variant_sets:
            # Ascending position order determines both the label order and
            # the order in which the variants reach multi_mutate.
            ordered = sorted(variant_set, key=lambda v: v.pos)
            labels = [v.var1 for v in ordered]
            shifted_specs = [
                variant.shift_var(v.var1, self.start_pos) for v in ordered
            ]
            shifted_variants = [
                variant(self.pepID, spec) for spec in shifted_specs
            ]
            label = '_'.join(labels)
            self.mut_pept[label] = self.multi_mutate(shifted_variants)
        return
예제 #2
0
 def generate_single_mut_peptide(self):
     """Fill ``self.mut_pept`` with one mutated sequence per peptide variant.

     For each entry of ``self.peptVariants`` the ``var1`` label is passed
     through ``variant.shift_var`` together with ``self.start_pos``
     (presumably re-basing the position onto the peptide -- confirm against
     ``variant.shift_var``), wrapped in a ``variant`` bound to ``self.pepID``
     and applied to ``self.seq`` with ``protein.mutate``.

     Results are keyed by the original ``var1`` label.  Returns ``None``.
     """
     # Reset the attribute first so it is always defined, even if a later
     # call raises part-way through the loop.
     self.mut_pept = {}
     by_label = self.mut_pept
     for pep_var in self.peptVariants:
         local_variant = variant(
             self.pepID, variant.shift_var(pep_var.var1, self.start_pos))
         by_label[pep_var.var1] = protein.mutate(self.seq, local_variant)
     return
예제 #3
0
 def find_variants(self, variants, refSeq):
     """Collect the variants of this transcript that fall inside the peptide.

     ``variants[self.transcript]`` is looked up first (a missing transcript
     raises ``KeyError`` before ``self.peptVariants`` is touched).  Each raw
     entry is wrapped in a ``variant`` and given ``refSeq`` via ``get_seq``.

     A variant is kept in ``self.peptVariants`` when its ``pos`` lies in the
     inclusive range ``[start_pos, start_pos + length - 1]`` and ``is_RKP()``
     is false.  Returns ``None``.
     """
     transcript_variants = variants[self.transcript]
     self.peptVariants = []
     for raw_variant in transcript_variants:
         candidate = variant(self.transcript, raw_variant)
         candidate.get_seq(refSeq)

         # Inclusive bounds of the peptide.
         first_pos = self.start_pos
         last_pos = self.start_pos + self.length - 1
         if not (first_pos <= candidate.pos <= last_pos):
             continue
         # Variants flagged by is_RKP() are deliberately left out here.
         if candidate.is_RKP():
             continue
         self.peptVariants.append(candidate)
     return
def build_protDB(refSeq_file, variants_file, output_dir, mut_only):
    """Write a FASTA file of reference proteins and their single-variant mutants.

    Variants are read with ``cfunc.read_cmi_var`` and reference sequences
    with ``cfunc.read_uniprotFa``.  Only variant keys that also exist in the
    reference set are used.  For each such key the reference protein is
    recorded, followed by one separate mutant protein per variant.

    Args:
        refSeq_file: Input passed to ``cfunc.read_uniprotFa``.
        variants_file: Input passed to ``cfunc.read_cmi_var``.
        output_dir: Directory receiving ``mutProteinDB_<timestamp>.fa``.  A
            trailing path separator is optional.
        mut_only: When equal to 1, entries whose ``mutation`` attribute is
            ``'REF'`` are dropped before export.

    Returns:
        None.  The result is the FASTA file written via ``SeqIO.write``.
    """
    import os

    variants_read = cfunc.read_cmi_var(variants_file)
    refSeq = cfunc.read_uniprotFa(refSeq_file)

    # Keep only variants whose key has a reference sequence.  Membership is
    # tested on the mapping itself: under Python 2 ``refSeq.keys()`` would
    # build a new list for every key, making this loop quadratic.
    variants = {}
    for key, var in variants_read.items():
        if key in refSeq:
            variants[key] = var

    # Annotate proteins.  Single-argument parenthesised prints produce the
    # same output under Python 2 and Python 3.
    print('Annotating proteins and mutants...')
    prot_profile = []
    len_protein = 0
    len_mutant = 0
    for key in variants:
        eachProtein = refSeq[key]
        protein1 = protein(eachProtein['header'], eachProtein['seq'])
        prot_profile.append(protein1)
        len_protein += 1
        for var in variants[protein1.transcript]:
            # A fresh protein per variant: each mutant carries one mutation.
            mprotein = protein(eachProtein['header'], eachProtein['seq'])
            mprotein.add_mutation(variant(protein1.transcript, var))
            prot_profile.append(mprotein)
            len_mutant += 1
    print('Finished %d reference proteins.' % len_protein)
    print('Finished %d mutant proteins.' % len_mutant)
    print('Annotated %d peptides.\n' % len(prot_profile))

    if mut_only == 1:
        print("Export only mutant proteins...")
        prot_profile = [
            prot for prot in prot_profile if prot.mutation != 'REF'
        ]

    records = [cfunc.protein_to_SeqRecord(prot) for prot in prot_profile]
    now = strftime("%Y%m%d-%H%M%S", localtime())
    # os.path.join gives the same path as plain concatenation when
    # output_dir already ends with a separator, and a correct one when it
    # does not.
    out_db_file = os.path.join(output_dir, "mutProteinDB_" + now + ".fa")
    SeqIO.write(records, out_db_file, 'fasta')
    return
def build_pepDB(refSeq_file, variants_file, output_dir, mut_only, max_miss,
                min_len, max_len):
    """Write a FASTA file of tryptic reference peptides and their mutants.

    Steps:
      1. Read variants and reference sequences; keep only variant keys that
         have a reference sequence.
      2. Build the trypsin peptide profile of each such protein.  Variants
         for which ``is_RKP()`` is true are applied at this stage through
         ``annotate_pepvar``.
      3. For every peptide whose mutation is ``'REF'``, generate the
         single-variant mutant peptides.
      4. Drop duplicate mutant peptides, append the rest to the reference
         profile and export everything via ``SeqIO.write``.

    Args:
        refSeq_file: Input passed to ``cfunc.read_uniprotFa``.
        variants_file: Input passed to ``cfunc.read_cmi_var``.
        output_dir: Directory receiving ``mutPeptideDB_<timestamp>.fa``.  A
            trailing path separator is optional.
        mut_only: When equal to 1, entries whose element 7 is ``'REF'`` are
            dropped before export.
        max_miss: Forwarded as ``max_miss`` to ``add_trypsin_profile`` and
            ``annotate_pepvar`` (2 was the value once hard-coded here).
        min_len: Forwarded as ``min_len`` to ``exclude_len`` (formerly 6).
        max_len: Forwarded as ``max_len`` to ``exclude_len`` (formerly 144).

    Returns:
        None.  The result is the FASTA file written via ``SeqIO.write``.
    """
    import os

    variants_read = cfunc.read_cmi_var(variants_file)
    refSeq = cfunc.read_uniprotFa(refSeq_file)

    # Keep only variants whose key has a reference sequence.  Membership is
    # tested on the mapping itself: under Python 2 ``refSeq.keys()`` would
    # build a new list for every key, making this loop quadratic.
    variants = {}
    for key, var in variants_read.items():
        if key in refSeq:
            variants[key] = var

    # Obtain trypsinized peptide profiles, considering mutations in R, K, P
    # that affect trypsin digestion.
    print('Annotating reference peptides...')
    pep_profile = []
    len_protein = 0
    for key in variants:
        eachProtein = refSeq[key]
        protein1 = protein(eachProtein['header'], eachProtein['seq'])
        protein1.add_trypsin_profile(max_miss=max_miss)
        protein1.annotate_pep()
        for var in variants[protein1.transcript]:
            variant1 = variant(protein1.transcript, var)
            variant1.get_seq(refSeq)
            # Check whether the mutation will affect trypsin digestion.
            if variant1.is_RKP():
                protein1.annotate_pepvar(variant1, max_miss=max_miss)
        protein1.exclude_len(min_len=min_len, max_len=max_len)
        pep_profile.extend(protein1.trypsin_profile)

        len_protein += 1
        if len_protein % 100 == 0:
            print('Finished %d reference proteins.' % len_protein)
            print('Annotated %d reference peptides.' % len(pep_profile))
    # Counts are taken from the lists themselves so the summary is valid
    # even when the loop above never ran.
    print('\nFinished %d reference proteins.' % len_protein)
    print('Annotated %d reference peptides.\n' % len(pep_profile))

    # Generate all peptides with single mutations at trypsinized peptide
    # level.
    print('Annotating mutant peptides...')
    mut_pept = []
    len_peptide = 0
    for pep in pep_profile:
        peptide1 = peptide(pep)
        # Exclude mutants that affect trypsin digestion (handled above).
        if peptide1.mutation == 'REF':
            peptide1.find_variants(variants, refSeq)
            peptide1.generate_single_mut_peptide()
            peptide1.annotate()
            peptide1.exclude_len(min_len=min_len, max_len=max_len)
            mut_pept.extend(peptide1.annotated_profile)

        len_peptide += 1
        if len_peptide % 10000 == 0:
            print('Finished %d reference peptides.' % len_peptide)
            print('Annotated %d mutations.' % len(mut_pept))
    print('\nFinished %d reference peptides.' % len_peptide)
    print('Annotated %d mutations.\n' % len(mut_pept))

    # Remove duplicates due to the same nonsense mutation on peptides
    # produced by different missed cleavages.
    print('Removing replicates...')
    uniqIDs = set()
    uniq_mut_pept = []
    for pep in mut_pept:
        # Unique peptide defined by seq + position + length.
        uniqID = '_'.join([pep[0], str(pep[1]), str(pep[2])])
        if uniqID not in uniqIDs:
            uniqIDs.add(uniqID)
            uniq_mut_pept.append(pep)

    len_profile = len(uniq_mut_pept)
    print('Removed %d mutant peptides due to nonsense mutation on different miss cleavage yielded peptides.' % (
        len(mut_pept) - len_profile))
    print('Annotated %d mutant peptides.' % len_profile)

    pep_profile.extend(uniq_mut_pept)
    print('Finally, annotated %d peptides.' % len(pep_profile))

    if mut_only == 1:
        print("Export only mutant peptides...")
        # Element 7 is compared against the 'REF' marker; presumably it is
        # the mutation label of the peptide record -- confirm against
        # cfunc.pep_to_SeqRecord.
        pep_profile = [pep for pep in pep_profile if pep[7] != 'REF']

    records = [cfunc.pep_to_SeqRecord(pep) for pep in pep_profile]
    now = strftime("%Y%m%d-%H%M%S", localtime())
    # os.path.join gives the same path as plain concatenation when
    # output_dir already ends with a separator, and a correct one when it
    # does not.
    out_db_file = os.path.join(output_dir, "mutPeptideDB_" + now + ".fa")
    SeqIO.write(records, out_db_file, 'fasta')

    return