def generate_multi_mut_peptide(self):
    """Fill ``self.mut_pept`` with every combination of candidate variants.

    ``self.peptVarDict`` maps a key to a list of candidate variant objects.
    For each non-empty subset of its keys, one candidate is chosen per key
    (the Cartesian product of the selected candidate lists).  Every such
    choice is ordered by ascending ``pos``, each member is re-expressed
    through ``variant.shift_var(var1, self.start_pos)`` and wrapped in a
    ``variant`` built with ``self.pepID``; the resulting list is passed to
    ``self.multi_mutate``.  The outcome is stored in ``self.mut_pept`` under
    the members' ``var1`` labels joined with ``'_'``.

    NOTE(review): the number of combinations grows exponentially with the
    number of keys in ``self.peptVarDict``.
    """
    var_dict = self.peptVarDict

    # Enumerate subsets of size 1 .. len(var_dict) and expand each subset
    # into all one-candidate-per-key selections.
    selections = []
    for subset_size in range(1, len(var_dict) + 1):
        for key_group in itertools.combinations(var_dict.keys(), subset_size):
            pools = [var_dict[name] for name in key_group]
            selections.extend(itertools.product(*pools))

    self.mut_pept = {}
    for selection in selections:
        # Ascending position order determines both the label order and the
        # order in which variants reach multi_mutate.
        ordered = sorted(selection, key=lambda member: member.pos)
        labels = [member.var1 for member in ordered]
        shifted_labels = [
            variant.shift_var(member.var1, self.start_pos)
            for member in ordered
        ]
        local_variants = [
            variant(self.pepID, shifted) for shifted in shifted_labels
        ]
        self.mut_pept['_'.join(labels)] = self.multi_mutate(local_variants)
def generate_single_mut_peptide(self):
    """Fill ``self.mut_pept`` with one mutated sequence per peptide variant.

    ``self.mut_pept`` is reset to an empty dict, then for every entry of
    ``self.peptVariants`` its ``var1`` label is shifted with
    ``variant.shift_var(var1, self.start_pos)``, wrapped in a ``variant``
    built with ``self.pepID`` and applied to ``self.seq`` through
    ``protein.mutate``.  Results are keyed by the original (unshifted)
    ``var1`` label.
    """
    self.mut_pept = {}
    for pept_var in self.peptVariants:
        label = pept_var.var1
        # presumably shift_var converts the label to peptide-relative
        # coordinates -- confirm against variant.shift_var
        local_variant = variant(
            self.pepID, variant.shift_var(label, self.start_pos))
        self.mut_pept[label] = protein.mutate(self.seq, local_variant)
def find_variants(self, variants, refSeq):
    """Collect into ``self.peptVariants`` the variants that hit this peptide.

    Parameters:
        variants: mapping indexed by ``self.transcript``; the value is
            iterated and each item is passed to ``variant(...)``.
        refSeq: forwarded unchanged to ``get_seq`` of every variant built.

    A variant is kept when its ``pos`` lies in the inclusive range
    ``[self.start_pos, self.start_pos + self.length - 1]`` and its
    ``is_RKP()`` returns a falsy value.  Raises ``KeyError`` (for a plain
    dict) when ``self.transcript`` is missing from ``variants``; in that
    case ``self.peptVariants`` is left untouched.
    """
    transcript_variants = variants[self.transcript]
    self.peptVariants = []
    for raw_variant in transcript_variants:
        candidate = variant(self.transcript, raw_variant)
        candidate.get_seq(refSeq)
        # Skip variants located outside this peptide's span.
        if not (self.start_pos <= candidate.pos
                <= self.start_pos + self.length - 1):
            continue
        # presumably is_RKP flags variants involving R/K/P residues, which
        # are handled elsewhere -- confirm against variant.is_RKP
        if candidate.is_RKP():
            continue
        self.peptVariants.append(candidate)
def build_protDB(refSeq_file, variants_file, output_dir, mut_only):
    """Write a FASTA file of reference proteins and their single mutants.

    Parameters:
        refSeq_file: passed to ``cfunc.read_uniprotFa``; the result is used
            as a mapping whose values expose ``'header'`` and ``'seq'``.
        variants_file: passed to ``cfunc.read_cmi_var``; the result is used
            as a mapping from key to an iterable of variant descriptors.
        output_dir: string prefix for the output path.  It is concatenated
            directly with the file name, so it must already end with a path
            separator.
        mut_only: when equal to ``1`` only entries whose ``mutation``
            attribute differs from ``'REF'`` are exported.

    For every key present in both inputs, one reference ``protein`` is
    created plus one additional ``protein`` per variant with that variant
    applied via ``add_mutation``.  The output file is named
    ``mutProteinDB_<YYYYmmdd-HHMMSS>.fa``.  Returns ``None``.
    """
    variants_read = cfunc.read_cmi_var(variants_file)
    refSeq = cfunc.read_uniprotFa(refSeq_file)

    # Keep only variants that have a reference sequence.  Membership is
    # tested on the mapping itself: under Python 2 ``refSeq.keys()`` builds
    # a fresh list on every iteration, making the filter O(n * m).
    variants = {}
    for key, var in variants_read.items():
        if key in refSeq:
            variants[key] = var

    # Annotate proteins
    print('Annotating proteins and mutants...')
    prot_profile = []
    len_protein = 0
    len_mutant = 0
    for key in variants:
        entry = refSeq[key]
        header, seq = entry['header'], entry['seq']

        reference = protein(header, seq)
        prot_profile.append(reference)
        len_protein += 1

        # Variants are looked up by the transcript id the protein object
        # reports, not by the loop key.
        for var in variants[reference.transcript]:
            mutant = protein(header, seq)
            mutant.add_mutation(variant(reference.transcript, var))
            prot_profile.append(mutant)
            len_mutant += 1

    print('Finished %d reference proteins.' % len_protein)
    print('Finished %d mutant proteins.' % len_mutant)
    # The list holds protein entries (reference + mutant), so report them
    # as proteins rather than peptides.
    print('Annotated %d proteins.\n' % len(prot_profile))

    if mut_only == 1:
        print("Export only mutant proteins...")
        prot_profile = [
            prot for prot in prot_profile if prot.mutation != 'REF'
        ]

    records = [cfunc.protein_to_SeqRecord(prot) for prot in prot_profile]

    now = strftime("%Y%m%d-%H%M%S", localtime())
    out_db_file = output_dir + "mutProteinDB_" + now + ".fa"
    SeqIO.write(records, out_db_file, 'fasta')
    return
def build_pepDB(refSeq_file, variants_file, output_dir, mut_only, max_miss,
                min_len, max_len):
    """Write a FASTA file of tryptic reference peptides and mutant peptides.

    Parameters:
        refSeq_file: passed to ``cfunc.read_uniprotFa``; the result is used
            as a mapping whose values expose ``'header'`` and ``'seq'``.
        variants_file: passed to ``cfunc.read_cmi_var``; the result is used
            as a mapping from key to an iterable of variant descriptors.
        output_dir: string prefix for the output path.  It is concatenated
            directly with the file name, so it must already end with a path
            separator.
        mut_only: when equal to ``1`` only peptide records whose field 7
            differs from ``'REF'`` are exported.
        max_miss: forwarded to ``add_trypsin_profile`` and
            ``annotate_pepvar`` as ``max_miss``.
        min_len, max_len: forwarded to ``exclude_len`` on proteins and
            peptides.

    NOTE(review): the original source carried commented-out values
    ``max_miss = 2``, ``min_len = 6``, ``max_len = 144``; they look like
    the author's usual settings -- confirm before relying on them.

    The output file is named ``mutPeptideDB_<YYYYmmdd-HHMMSS>.fa``.
    Returns ``None``.
    """
    variants_read = cfunc.read_cmi_var(variants_file)
    refSeq = cfunc.read_uniprotFa(refSeq_file)

    # Keep only variants that have a reference sequence (no
    # longest-transcript filtering is applied here).  Membership is tested
    # on the mapping itself: under Python 2 ``refSeq.keys()`` builds a fresh
    # list on every iteration, making the filter O(n * m).
    variants = {}
    for key, var in variants_read.items():
        if key in refSeq:
            variants[key] = var

    # Stage 1: trypsin peptide profiles per protein, including the effect of
    # variants for which is_RKP() is true.
    print('Annotating reference peptides...')
    pep_profile = []
    len_protein = 0
    for key in variants:
        entry = refSeq[key]
        protein1 = protein(entry['header'], entry['seq'])
        protein1.add_trypsin_profile(max_miss=max_miss)
        protein1.annotate_pep()

        for var in variants[protein1.transcript]:
            variant1 = variant(protein1.transcript, var)
            variant1.get_seq(refSeq)
            # check whether mutation will affect trypsin digestion
            if variant1.is_RKP():
                protein1.annotate_pepvar(variant1, max_miss=max_miss)

        protein1.exclude_len(min_len=min_len, max_len=max_len)
        pep_profile.extend(protein1.trypsin_profile)
        len_protein += 1
        if len_protein % 100 == 0:
            print('Finished %d reference proteins.' % len_protein)
            print('Annotated %d reference peptides.' % len(pep_profile))

    # Counts are read from the list itself so the summary also works when
    # no protein was processed (a loop-local counter would be unbound).
    print('\nFinished %d reference proteins.' % len_protein)
    print('Annotated %d reference peptides.\n' % len(pep_profile))

    # Stage 2: single-variant peptides derived from each 'REF' peptide.
    print('Annotating mutant peptides...')
    mut_pept = []
    len_peptide = 0
    for pep in pep_profile:
        peptide1 = peptide(pep)
        # Only reference peptides are expanded; entries already carrying a
        # mutation (presumably from annotate_pepvar above) are skipped.
        if peptide1.mutation != 'REF':
            continue
        peptide1.find_variants(variants, refSeq)
        peptide1.generate_single_mut_peptide()
        peptide1.annotate()
        peptide1.exclude_len(min_len=min_len, max_len=max_len)
        mut_pept.extend(peptide1.annotated_profile)
        len_peptide += 1
        if len_peptide % 10000 == 0:
            print('Finished %d reference peptides.' % len_peptide)
            print('Annotated %d mutations.' % len(mut_pept))

    # Report the actual number of mutant records rather than a value left
    # over from stage 1 when no reference peptide was expanded.
    print('\nFinished %d reference peptides.' % len_peptide)
    print('Annotated %d mutations.\n' % len(mut_pept))

    # Stage 3: drop duplicates.  Records are identified by fields 0, 1 and 2
    # (per the original comment: sequence, position, length); the first
    # occurrence wins.
    print('Removing replicates...')
    seen_ids = set()
    uniq_mut_pept = []
    for pep in mut_pept:
        uniqID = '_'.join([pep[0], str(pep[1]), str(pep[2])])
        if uniqID not in seen_ids:
            seen_ids.add(uniqID)
            uniq_mut_pept.append(pep)

    len_profile = len(uniq_mut_pept)
    print('Removed %d mutant peptides due to nonsense mutation on different miss cleavage yielded peptides.' % (
        len(mut_pept) - len_profile))
    print('Annotated %d mutant peptides.' % len_profile)

    pep_profile.extend(uniq_mut_pept)
    print('Finally, annotated %d peptides.' % len(pep_profile))

    if mut_only == 1:
        print("Export only mutant peptides...")
        pep_profile = [pep for pep in pep_profile if pep[7] != 'REF']

    records = [cfunc.pep_to_SeqRecord(pep) for pep in pep_profile]

    now = strftime("%Y%m%d-%H%M%S", localtime())
    out_db_file = output_dir + "mutPeptideDB_" + now + ".fa"
    SeqIO.write(records, out_db_file, 'fasta')
    return