def test(self, positive_file, negative_file, sequence_position=10):
    """Evaluate ``self.clf`` on windows cut from a positive and a negative file.

    Lines containing ``">"`` are skipped. A remaining line is used only when
    the character at ``sequence_position`` equals ``self.amino_acid``;
    negative lines containing ``"X"`` or ``"U"`` are skipped as well. Each
    accepted line is windowed with ``windower``, wrapped in
    ``ProteinAnalysis`` and converted to a feature vector by ``featurify``.
    Positive samples are labelled 1, negative samples 0. The samples are
    shuffled, predicted with ``self.clf.predict`` and passed to ``report``.

    Args:
        positive_file: Path of the file holding positive sequences.
        negative_file: Path of the file holding negative sequences.
        sequence_position: Index of the residue of interest within each line
            (10 for the original author's test files).

    Raises:
        ValueError: If neither file yields a usable sample.
    """
    window_length = 2 * self.window + 1

    def collect(path, label, banned=""):
        """Return ``(features, label)`` pairs for the usable lines of *path*."""
        samples = []
        with open(path) as handle:
            for line in handle:
                if ">" in line:
                    continue
                # Lines shorter than the requested position (e.g. blank
                # lines) used to raise IndexError; they are skipped now.
                if len(line) <= sequence_position:
                    continue
                if line[sequence_position] != self.amino_acid:
                    continue
                if any(symbol in line for symbol in banned):
                    continue
                temp_window = ProteinAnalysis(
                    windower(line, sequence_position, self.window).strip("\t"))
                samples.append((featurify(temp_window, window_length), label))
        return samples

    # Only the negative file is filtered for "X"/"U", as in the original.
    labelled = collect(positive_file, 1) + collect(negative_file, 0, banned="XU")
    if not labelled:
        raise ValueError("no usable sequences found in %r or %r"
                         % (positive_file, negative_file))

    random.shuffle(labelled)
    test_features, test_labels = zip(*labelled)
    test_results = self.clf.predict(test_features)
    report(results=test_results, answers=test_labels, classy=self.clf)
def analyzeCleaves(self):
    """Append candidate digest peptides to ``self.dpeps``.

    For every cleavage site in ``self.sites`` and every number of missed
    cleavages from 0 to ``self.misses``, the peptide between the current
    site and the site reached after skipping that many sites is cut out of
    ``self.peptide``. Fragments accepted by ``self.checkLenWeight`` are
    stored as
    ``[fragment, length, molecular weight, missed cleavages, left flank,
    right flank, "start-end"]``.
    For the first site the leading fragment (start of the peptide up to the
    site) is recorded as well.
    """
    for site_idx in range(len(self.sites)):
        cut = self.sites[site_idx]
        for missed in range(self.misses + 1):
            left = self.peptide[:cut + 1]
            try:
                next_cut = self.sites[site_idx + missed + 1]
            except IndexError:
                # No further cleavage site: the fragment runs to the end of
                # the peptide and no more missed cleavages can exist.
                at_end = True
                right = ''
                fragment = self.peptide[cut + 1:]
            else:
                at_end = False
                right = self.peptide[next_cut + 1:]
                fragment = self.peptide[cut + 1:next_cut + 1]

            if site_idx == 0:
                # The leading fragment only exists before the first site.
                left = self.peptide[:self.sites[site_idx + missed] + 1]
                if self.checkLenWeight(left):
                    self.dpeps.append([
                        left,
                        len(left),
                        ProteinAnalysis(str(left)).molecular_weight(),
                        missed,
                        '',
                        fragment + right,
                        str(1) + '-' + str(len(left)),
                    ])

            if self.checkLenWeight(fragment):
                self.dpeps.append([
                    fragment,
                    len(fragment),
                    ProteinAnalysis(str(fragment)).molecular_weight(),
                    missed,
                    left,
                    right,
                    str(cut + 2) + '-' + str(cut + len(fragment) + 1),
                ])

            if at_end:
                break
def binaryFeatureTable(PosSeqFiles, NegSeqFiles):
    """Build an amino-acid composition table for two groups of FASTA files.

    Every record read by ``readfasta`` contributes one row holding the
    values of ``ProteinAnalysis.get_amino_acids_percent()`` plus three
    extra columns: ``Class`` (1 for ``PosSeqFiles``, 0 for ``NegSeqFiles``),
    ``Length`` (sequence length) and ``ID`` (record id).

    Args:
        PosSeqFiles: Iterable of files for the positive group.
        NegSeqFiles: Iterable of files for the negative group.

    Returns:
        pandas.DataFrame with one row per record, positives first.
    """
    seqDicts = []
    for sequenceClass, seqFiles in ((1, PosSeqFiles), (0, NegSeqFiles)):
        for seqFile in seqFiles:
            for rec in readfasta(seqFile):
                # Copy so the extra columns are not written into the dict
                # object handed out by ProteinAnalysis.
                seqDict = dict(
                    ProteinAnalysis(str(rec.seq)).get_amino_acids_percent())
                seqDict['Class'] = sequenceClass
                seqDict['Length'] = len(rec.seq)
                seqDict['ID'] = rec.id
                seqDicts.append(seqDict)
    return pd.DataFrame(seqDicts)
def _toPeptide(sequence, molecule, genetic_code=1, to_stop=True): ''' Private function - Takes a sequence (DNA/RNA/amino acid) and process it according to return a ProteinAnalysis object. @param sequence String: Nucleotide (DNA/RNA) or amino acid sequence. @param molecule String: Defines the type of molecule. Three options are allowed: 'peptide' for amino acid sequences, 'DNA' for DNA sequences (requires transcription and translation), and 'RNA' for RNA sequence (requires translation). @param genetic_code Integer: Genetic code number to be used for translation. Default = 1 (Standard Code). For more information, see <https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi> @param to_stop Boolean: Flag to stop translation when first stop codon is encountered. Default = True. @return: Bio.SeqUtils.ProtParam.ProteinAnalysis object ''' if molecule.lower() == 'peptide': peptide = ProteinAnalysis(sequence) elif molecule.lower() == 'rna': rna = str(sequence) rna = Seq(rna, generic_rna) peptide = rna.translate(genetic_code, to_stop=to_stop) peptide = ProteinAnalysis(str(peptide)) elif molecule.lower() == 'dna': dna = str(sequence) dna = Seq(dna, generic_dna) rna = dna.transcribe() peptide = rna.translate(genetic_code, to_stop=to_stop) peptide = ProteinAnalysis(str(peptide)) return peptide
def gravy(sequence):
    """Return the GRAVY value of *sequence*, ignoring 'X' and '*'.

    The original guard ``if 'X' or '*' in sequence`` was always true (the
    non-empty string ``'X'`` is truthy), so the characters were stripped
    unconditionally and the ``else`` branch was unreachable. Stripping is a
    no-op when neither character is present, so the result is unchanged.

    Args:
        sequence: Amino-acid sequence as a string.

    Returns:
        The value of ``ProteinAnalysis(...).gravy()`` for the cleaned
        sequence.
    """
    cleaned = sequence.replace('X', '').replace('*', '')
    return ProteinAnalysis(cleaned).gravy()
def test_molecular_weight(self):
    """Test Lassopeptide.molecular_weight

    The expected values are the average (non-monoisotopic) weights of the
    full core and of the core without the ``c_cut`` tail, each reduced by
    18.02.
    """
    core = "MAGICHATTIP"
    lasso = Lassopeptide(23, 42, 17, 51, "lead", core)
    lasso.c_cut = "TIP"

    expected_full = ProteinAnalysis(
        core, monoisotopic=False).molecular_weight() - 18.02
    expected_cut = ProteinAnalysis(
        "MAGICHAT", monoisotopic=False).molecular_weight() - 18.02

    self.assertAlmostEqual(expected_full, lasso.molecular_weight)
    self.assertAlmostEqual(expected_cut, lasso.cut_weight)
def _create_features_biopython(data, local=20, indiv_keys=False):
    """Add ProteinAnalysis-based feature columns to *data* in place.

    For each sequence in ``data['seq']`` the features are computed three
    times: on the whole sequence (plain key), on the first ``local``
    residues (``_localfirst`` suffix) and on the last ``local`` residues
    (``_locallast`` suffix). Sequences are first passed through
    ``replace_wild_first`` and ``replace_selenocysteine``. Afterwards
    ``_make_numpy(data)`` is called.

    Args:
        data: Mapping with a ``'seq'`` entry; feature lists are stored in it.
        local: Number of residues used for the local windows.
        indiv_keys: If true, each column of the three
            ``secondary_structure_fraction`` entries is stored under its own
            ``..._<i>`` key and the combined entries are deleted.
    """
    from Bio.SeqUtils.ProtParam import ProteinAnalysis

    # 'gravy{}' was disabled in the original and stays disabled.
    extractors = {
        'molecular_weight{}': lambda pa: pa.molecular_weight(),
        'iso_point{}': lambda pa: pa.isoelectric_point(),
        'aromaticity{}': lambda pa: pa.aromaticity(),
        'instability_index{}': lambda pa: pa.instability_index(),
        'flexibility{}': lambda pa: flexibility_index(pa.flexibility()),
        'secondary_structure_fraction{}':
            lambda pa: pa.secondary_structure_fraction(),
    }
    suffixes = ('', '_localfirst', '_locallast')

    for template in extractors:
        for suffix in suffixes:
            data[template.format(suffix)] = []

    for raw_seq in data['seq']:
        cleaned = replace_selenocysteine(replace_wild_first(raw_seq))
        # Whole sequence, then its head and tail windows.
        regions = (cleaned, cleaned[:local], cleaned[-local:])
        for suffix, region in zip(suffixes, regions):
            analysis = ProteinAnalysis(region)
            for template, extract in extractors.items():
                data[template.format(suffix)].append(extract(analysis))

    _make_numpy(data)

    if indiv_keys:
        base = 'secondary_structure_fraction'
        n_fractions = data[base].shape[1]
        for column in range(n_fractions):
            for suffix in suffixes:
                split_key = base + suffix + '_{}'.format(column)
                data[split_key] = data[base + suffix][:, column]
        for suffix in suffixes:
            del data[base + suffix]
def generate_array_of_interest(enrichment_sequences_array, array_property):
    """Compute one ProteinAnalysis property for every sequence.

    Args:
        enrichment_sequences_array: Indexable collection of sequences.
        array_property: ``'aromaticity'`` or ``'flexibility'`` (compared
            after ``str()`` conversion).

    Returns:
        List with one value per sequence; an empty list when
        ``array_property`` is neither of the two supported names.
    """
    requested = str(array_property)
    method_name = requested if requested in ('aromaticity', 'flexibility') else None

    values = []
    for position in range(len(enrichment_sequences_array)):
        entry = enrichment_sequences_array[position]
        if method_name is not None:
            analysis = ProteinAnalysis(str(entry))
            values.append(getattr(analysis, method_name)())
    return values
def calculate_pI_from_file(file, output_dir, cutoff_pi, out_CSV_pi):
    """Calculate the pI of all sequences in a FASTA file.

    For every record the unmodified pI is computed and written with
    ``output_pI_row`` (modifier ``'0'``). When that pI is not below
    ``cutoff_pi`` the target residue from ``define_seq_modifications()['1']``
    is replaced, the pI is recomputed and a second row is written with
    modifier ``'1'``. The category column is ``'0'`` for pI below the
    cutoff and ``'1'`` otherwise. A timing summary is printed at the end.
    """
    modifications = define_seq_modifications()
    sequences_done = 0
    started = time.time()

    with open(file, "r") as handle:
        for record in SeqIO.parse(handle, "fasta", alphabet=IUPAC.protein):
            # Metadata is encoded in the '|'-separated description.
            meta = get_record_meta(record.description.split("|"))
            acc_code, organism, EC_code, species, note = meta

            # Unmodified pI.
            native_pi = ProteinAnalysis(''.join(record.seq)).isoelectric_point()
            sequences_done += 1
            native_category = '0' if native_pi < cutoff_pi else '1'
            output_pI_row(output_dir, out_CSV_pi, file, acc_code, organism,
                          EC_code, species, note, native_pi, '0',
                          native_category)

            if native_category != '1':
                continue

            # pI at or above the cutoff: test the residue modification.
            modifier = '1'
            targ = convert_to_one_letter_code_sing(
                modifications[modifier]['target_res'])
            replacement = convert_to_one_letter_code_sing(
                modifications[modifier]['replace_res'])
            mod_seq = ''.join(record.seq).replace(targ, replacement)
            modified_pi = ProteinAnalysis(mod_seq).isoelectric_point()
            sequences_done += 1
            modified_category = '0' if modified_pi < cutoff_pi else '1'
            output_pI_row(output_dir, out_CSV_pi, file, acc_code, organism,
                          EC_code, species, note, modified_pi, modifier,
                          modified_category)

    elapsed = '{0:.2f}'.format(time.time() - started)
    print('--- finished %s sequences in %s seconds ---'
          % (sequences_done, elapsed))
def aa_analysis(df, property):
    """Score both alleles of each ``Amino_acids`` entry with a CIDER metric.

    Rows with a null ``Amino_acids`` value are dropped, the remaining
    values are split on ``'/'`` into ``AA1`` and ``AA2``, and each side is
    scored with ``SequenceParameters``. A sequence whose scoring fails gets
    the value 0.

    Args:
        df: DataFrame with an ``Amino_acids`` column of ``"ref/alt"`` text.
        property: ``"ncpr"`` (uses ``get_NCPR``) or ``"uversky_hydropathy"``
            (uses ``get_uversky_hydropathy``).

    Returns:
        A new DataFrame with columns ``AA1_Iso``, ``AA2_Iso`` and
        ``AA_Iso_Delta`` (AA2 minus AA1). For any other ``property`` the
        input ``df`` is returned unchanged.
    """
    if property == "ncpr":
        cider_method = "get_NCPR"
    elif property == "uversky_hydropathy":
        cider_method = "get_uversky_hydropathy"
    else:
        return df

    def score(sequence):
        """Return the requested metric, or 0 when it cannot be computed."""
        try:
            return getattr(SequenceParameters(str(sequence)), cider_method)()
        except Exception:
            # Best-effort as before, but no longer traps KeyboardInterrupt
            # or SystemExit the way the bare ``except:`` did.
            return 0

    # Copy the filtered frame so the new columns are not assigned on a slice.
    df = df[pd.notnull(df['Amino_acids'])].copy()
    df[["AA1", "AA2"]] = df['Amino_acids'].str.split('/', expand=True)
    df["AA1_Iso"] = [score(sequence) for sequence in df["AA1"]]
    df["AA2_Iso"] = [score(sequence) for sequence in df["AA2"]]
    df["AA_Iso_Delta"] = df["AA2_Iso"] - df["AA1_Iso"]
    return df[["AA1_Iso", "AA2_Iso", "AA_Iso_Delta"]]
def find_composition(df_original):
    """Append sequence-composition features to a copy of *df_original*.

    For every entry of the ``seq`` column the following are computed with
    ``ProteinAnalysis``: amino-acid percentages for the whole sequence, its
    first ``first_n`` and its last ``last_n`` residues; length; molecular
    weight; GRAVY; mean and standard deviation of the flexibility profile;
    the three secondary-structure fractions; isoelectric point and
    aromaticity.

    Rows are collected in a list and turned into a DataFrame once, instead
    of calling ``DataFrame.append`` per row (quadratic, and removed in
    pandas 2.0).

    Args:
        df_original: DataFrame with a ``seq`` column.

    Returns:
        DataFrame: the copy of the input concatenated column-wise with the
        feature columns.
    """
    df_copy = df_original.copy()

    column_names = []
    for ch in codes:
        column_names.extend(
            (ch + '_percent', ch + '_percent_first', ch + '_percent_last'))
    column_names.extend([
        'len', 'weight', 'gravy', 'flex_mean', 'flex_std',
        'ss_helix', 'ss_turn', 'ss_sheet', 'iep', 'aromaticity',
    ])

    rows = []
    for seq in tqdm(df_copy['seq']):
        sequence = str(seq)
        analysed = ProteinAnalysis(sequence)
        analysed_first = ProteinAnalysis(sequence[:first_n])
        analysed_last = ProteinAnalysis(sequence[-last_n:])

        row = {'len': analysed.length}
        (row['ss_helix'], row['ss_turn'],
         row['ss_sheet']) = analysed.secondary_structure_fraction()
        row['iep'] = analysed.isoelectric_point()

        for suffix, part in (('_percent', analysed),
                             ('_percent_first', analysed_first),
                             ('_percent_last', analysed_last)):
            for aa, percent in part.get_amino_acids_percent().items():
                row[aa + suffix] = percent

        row['weight'] = analysed.molecular_weight()
        row['gravy'] = analysed.gravy()
        row['aromaticity'] = analysed.aromaticity()
        flexibility = analysed.flexibility()  # computed once, used twice
        row['flex_mean'] = np.mean(flexibility)
        row['flex_std'] = np.std(flexibility)
        rows.append(row)

    features = pd.DataFrame(rows)
    # Declared columns first; any residue column not derived from ``codes``
    # follows, as it did when rows were appended to a pre-declared frame.
    extra_columns = [c for c in features.columns if c not in column_names]
    features = features.reindex(columns=column_names + extra_columns)

    # NOTE(review): concat aligns on the index; this presumably expects
    # df_original to carry a default RangeIndex - confirm with callers.
    return pd.concat([df_copy, features], axis=1)
def calculate_rxn_syst_pI(sequence, rxn_syst, cutoff_pi):
    """Calculate the pI of a sequence associated with a reaction system.

    The unmodified pI is stored on ``rxn_syst.pI``. If it is below
    ``cutoff_pi`` the system is flagged with ``seed_MOF = True``. Otherwise
    the residue replacement from ``define_seq_modifications()['1']`` is
    applied and the pI recomputed: when the modified pI is below the cutoff,
    ``seed_MOF`` is set to True, ``req_mod`` to ``'1'`` and ``pI`` to the
    modified value; if not, ``seed_MOF`` is set to False and the unmodified
    pI is kept.

    Returns:
        The same ``rxn_syst`` object, updated in place.
    """
    modifications = define_seq_modifications()
    native_pi = ProteinAnalysis(sequence).isoelectric_point()

    if native_pi < cutoff_pi:
        rxn_syst.seed_MOF = True
        rxn_syst.pI = native_pi
        return rxn_syst

    # Report the unmodified pI in case the modification does not help.
    rxn_syst.pI = native_pi
    modifier = '1'
    change = modifications[modifier]
    # One-letter codes of the residue to replace and of its replacement.
    targ = convert_to_one_letter_code_sing(change['target_res'])
    replacement = convert_to_one_letter_code_sing(change['replace_res'])
    mod_seq = ''.join(sequence).replace(targ, replacement)
    modified_pi = ProteinAnalysis(mod_seq).isoelectric_point()

    if modified_pi < cutoff_pi:
        rxn_syst.seed_MOF = True
        rxn_syst.req_mod = modifier
        rxn_syst.pI = modified_pi
    else:
        rxn_syst.seed_MOF = False
    return rxn_syst
def calc_isoelectric_point(self) -> float:
    """
    Compute the isoelectric point of ``self.get_seq()`` with Biopython's
    ProteinAnalysis
    (biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam-pysrc.html).
    :return: the sequence's isoelectric point
    """
    return ProteinAnalysis(self.get_seq()).isoelectric_point()
def _protein_parameters(self, sequence):
    """Calculates physicochemical properties for the amino acid sequence.

    The vector holds, in order: molecular weight, aromaticity, instability
    index, GRAVY, isoelectric point, the three secondary-structure
    fractions, the two molar extinction coefficients and the value of
    ``self._net_charge(sequence)``.

    Args:
      sequence: str, amino acid sequence.

    Returns:
      property_arr: np array, vector of properties.
    """
    analysis = ProteinAnalysis(sequence)
    scalars = [
        analysis.molecular_weight(),
        analysis.aromaticity(),
        analysis.instability_index(),
        analysis.gravy(),
        analysis.isoelectric_point(),
    ]
    fractions = analysis.secondary_structure_fraction()
    extinction = analysis.molar_extinction_coefficient()
    property_arr = scalars + [
        fractions[0], fractions[1], fractions[2],
        extinction[0], extinction[1],
        self._net_charge(sequence),
    ]
    return np.array(property_arr)
def get_protparam(row, func_name):
    """Return one ProteinAnalysis parameter for *row*, or NaN on failure.

    Args:
        row: Amino-acid sequence to analyse.
        func_name: Name of the zero-argument ``ProteinAnalysis`` method to
            call (for example ``"gravy"``).

    Returns:
        The method's return value, or ``np.nan`` when the analysis object
        cannot be built or the method raises.
    """
    try:
        # Construction is inside the try so an unusable row (e.g. a missing
        # value) yields NaN like any other failure instead of crashing.
        return getattr(ProteinAnalysis(row), func_name)()
    except Exception:
        # ``Exception`` rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        return np.nan
def calculate_residue_features(temp_dict, sequence):
    """Add composition-weighted scale averages for *sequence* to *temp_dict*.

    Each residue's share from ``get_amino_acids_percent()`` is multiplied
    by its value in the ``kd``, ``hw``, ``em`` and ``ja`` scales and summed.
    The four sums and the per-residue shares themselves are written into
    ``temp_dict`` in place; nothing is returned.
    """
    composition = ProteinAnalysis(sequence).get_amino_acids_percent()

    hydrophobicity = 0
    hydrophilicity = 0
    surface_fractional_probability = 0
    transfer_energy = 0
    for residue, share in composition.items():
        hydrophobicity += share * kd[residue]
        hydrophilicity += share * hw[residue]
        surface_fractional_probability += share * em[residue]
        transfer_energy += share * ja[residue]

    summary = {
        "Hydrophobicity": hydrophobicity,
        "Hydrophilicity": hydrophilicity,
        "Surface Fractional Probability": surface_fractional_probability,
        "I2S Transfer Energy Scale": transfer_energy,
    }
    # Residue shares are applied after the four sums, as before.
    summary.update(composition)
    temp_dict.update(summary)
def transform(self, X):
    """Encode each sequence in *X* as its amino-acid composition.

    Returns an array of shape ``(len(X), len(VALID_AMINO_ACIDS))`` whose
    entry ``[i, j]`` is the share of ``VALID_AMINO_ACIDS[j]`` reported by
    ``get_amino_acids_percent()`` for ``str(X[i])``, or 0.0 when that
    residue is missing from the result.
    """
    vec = np.zeros((len(X), len(VALID_AMINO_ACIDS)))
    for row in range(len(X)):
        shares = ProteinAnalysis(str(X[row])).get_amino_acids_percent()
        vec[row, :] = [shares.get(residue, 0.0)
                       for residue in VALID_AMINO_ACIDS]
    return vec
def physchem_props(data):
    """Calculate physicochemical properties for each tab-separated line.

    The second-to-last field of every line (the sequon, not the full
    sequence) is analysed with ``ProteinAnalysis``. Lines whose sequon is
    empty or contains ``"X"`` or ``"*"`` are skipped. Molecular weight,
    GRAVY, aromaticity, instability index and isoelectric point are
    appended to the line as five extra tab-separated fields.

    Args:
        data: Iterable of tab-separated text lines.

    Returns:
        List of output lines, starting with a header line.

    Exits the process with status 1 (after printing the offending fields)
    when a property cannot be computed.
    """
    header = "ID\tclass\tindex\tsequon\tsequence\tmol_weight\tgravy\taromaticity\tinstab_index\tiso_point\n"
    new_table = [header]
    for line in data:
        split_line = line.rstrip().split('\t')
        seq = split_line[-2]  # Sequon, not sequence
        if "X" in seq or '*' in seq or seq == '':
            continue  # Skip non-usable sequences
        try:
            a_seq = ProteinAnalysis(seq)
            results = [
                a_seq.molecular_weight(),
                a_seq.gravy(),
                a_seq.aromaticity(),
                a_seq.instability_index(),
                a_seq.isoelectric_point(),
            ]
        except Exception:
            # Was a bare ``except:``, which also turned Ctrl-C into exit(1).
            print(split_line)
            sys.exit(1)
        new_table.append(
            line.rstrip() + "\t{}\t{}\t{}\t{}\t{}\n".format(*results))
    return new_table
def feat_extract(sequences):
    """Build one feature dictionary per sequence.

    Each dictionary (a ``defaultdict(float)``) holds the sequence length,
    aromaticity, isoelectric point, the molecular weight (only when the
    sequence contains none of X, O, U, B), relative residue frequencies
    over the whole sequence, counts per property group of
    ``dic_properties``, and residue frequencies (in steps of 1/50) over the
    slices ``sequence[0:50]`` and ``sequence[-51:-1]``.

    Returns:
        List of feature dictionaries, in input order.
    """
    ambiguous_letters = 'XOUB'
    list_dict_feat = []
    for sequence in sequences:
        protein = ProteinAnalysis(sequence)
        feats = defaultdict(float)
        length = len(sequence)

        feats["sequence_length"] = length
        feats["aromaticty"] = protein.aromaticity()
        feats["isoeletric_point"] = protein.isoelectric_point()
        if not any(letter in sequence for letter in ambiguous_letters):
            feats["molecular_weight"] = protein.molecular_weight()

        for letter in sequence:
            feats["relative_fre_{}".format(letter)] += 1 / length
            for group_name in dic_properties:
                if letter in dic_properties[group_name]:
                    feats['freq_{}'.format(group_name)] += 1

        for letter in sequence[0:50]:
            feats["relative_fre_start{}".format(letter)] += 1 / 50
        # NOTE(review): this slice leaves out the final character - confirm
        # whether sequence[-50:] was intended.
        for letter in sequence[-51:-1]:
            feats["relative_fre_end{}".format(letter)] += 1 / 50

        list_dict_feat.append(feats)
    return list_dict_feat
def featureExtraction(train_df, test_df):
    """Replace peptide strings by ProteinAnalysis features.

    Column 0 of ``train_df`` is taken as the label and column 1 as the
    peptide; column 0 of ``test_df`` is the peptide. Both sets are stacked,
    and for each peptide the molecular weight, isoelectric point,
    aromaticity and instability index (stored as ``stability``) are
    computed. Peptides containing ``'X'`` or ``'Z'`` get -1 in all four
    columns. The peptide column is dropped afterwards.

    Returns:
        ``(train_features, test_features, Y)`` where ``Y`` is the label
        column of ``train_df``.
    """
    n_train = len(train_df)
    Y = train_df[0]
    peptides_only = train_df.drop(columns=0).rename(columns={1: 0})
    big = pd.concat([peptides_only, test_df], ignore_index=True)

    feature_columns = ('molecular_weight', 'isoelectric_point',
                       'aromaticity', 'stability')
    for column in feature_columns:
        big[column] = 0.0

    for i in range(len(big)):
        peptide = big.iloc[i, 0]
        if 'X' in peptide or 'Z' in peptide:
            # Invalid peptide: mark every feature with -1.
            values = (-1, -1, -1, -1)
        else:
            model = ProteinAnalysis(peptide)
            values = (
                model.molecular_weight(),
                model.isoelectric_point(),
                model.aromaticity(),
                model.instability_index(),
            )
        for column, value in zip(feature_columns, values):
            big.at[i, column] = value

    big = big.drop(columns=0)
    return big.iloc[:n_train, ], big.iloc[n_train:, ], Y
def getProps(f):
    """
    Load *f* with ``myPDB.loader`` and return a tuple of the molecular
    weight of its sequence, the maximum of its flexibility profile, and the
    sum of the loader's ``ASA`` values (computed with Biopython and numpy).
    """
    loaded = myPDB.loader(f)
    analysis = ProteinAnalysis(loaded.seq)
    weight = analysis.molecular_weight()
    peak_flexibility = np.max(analysis.flexibility())
    total_asa = np.sum(loaded.ASA)
    return weight, peak_flexibility, total_asa
def get_protein_features(seq):
    """Return feature names and values for a protein sequence.

    The sequence is first passed through ``correct``. The values are, in
    order: length, molecular weight, isoelectric point, count of D+E,
    count of K+R, two extinction coefficients (Y*1490 + W*5500, and the
    same plus C*125), instability index, hydrophobicity and the
    secondary-structure fractions.

    Returns:
        ``(names, values)`` - two lists.
    """
    seq = correct(seq)
    prot_analysis = ProteinAnalysis(seq)
    prot_weight = molecular_weight(seq)
    pI = prot_analysis.isoelectric_point()

    counts = prot_analysis.count_amino_acids()
    acidic = counts['D'] + counts['E']
    basic = counts['K'] + counts['R']
    ext_without_cys = counts['Y'] * 1490 + counts['W'] * 5500
    ext_with_cys = ext_without_cys + counts['C'] * 125

    instability_idx = instability_index(seq)
    gravy = hydrophobicity(seq)
    structure_fractions = list(prot_analysis.secondary_structure_fraction())

    names = [
        'length', 'weight', 'pI', 'neg_charged_residues',
        'pos_charged_residues', 'extinction_coeff1', 'extinction_coeff2',
        'instability_index', 'gravy', 'helix', 'turn', 'sheet'
    ]
    values = [
        len(seq), prot_weight, pI, acidic, basic,
        ext_without_cys, ext_with_cys, instability_idx, gravy,
    ]
    return names, values + structure_fractions
def prot_feats_seq(seq):
    """Return the concatenated, L2-normalised feature vector of *seq*.

    Three parts are normalised separately and joined: the shares of the 20
    standard amino acids, the output of ``twomerFromSeq`` and the output of
    ``threemerFromSeq``. ``molecular_weight()`` is called first only for
    its side effect of raising on sequences containing 'X', so callers can
    skip them.
    """
    alphabet = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
                'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    text = str(seq)
    analysis = ProteinAnalysis(text)
    analysis.molecular_weight()  # raises if 'X' is in the sequence
    shares = analysis.get_amino_acids_percent()

    def unit_row(values):
        """L2-normalise *values* as a single row and return that row."""
        as_row = np.atleast_2d(np.array(values))
        return normalize(as_row, norm='l2', copy=True, axis=1,
                         return_norm=False)[0]

    composition = unit_row([shares[residue] for residue in alphabet])
    dimers = unit_row(twomerFromSeq(text))
    trimers = unit_row(threemerFromSeq(text))
    return np.array(list(composition) + list(dimers) + list(trimers))
def get_secondary_structure(self):
    """Return the three secondary-structure fractions of ``self.sequence``.

    The values come from ``secondary_structure_fraction()`` and are
    returned as strings formatted with two decimals, in the order
    ``(helix, turn, sheet)`` used by this method's variable names.
    """
    fractions = ProteinAnalysis(self.sequence).secondary_structure_fraction()
    helix, turn, sheet = (
        "{0:0.2f}".format(fractions[position]) for position in range(3))
    return helix, turn, sheet
def prot_feats(filename):
    """Compute ``prot_feats_seq`` features for every usable FASTA record.

    Records for which the molecular weight or the feature vector cannot be
    computed (for example sequences containing 'X') are skipped.

    Args:
        filename: FASTA file to read with ``SeqIO.parse``.

    Returns:
        ``(XX, ids)`` - a numpy array with one feature row per kept record
        and the list of the corresponding record ids.
    """
    XX = []
    ids = []
    for rec in SeqIO.parse(filename, "fasta"):
        sequence = str(rec.seq)
        try:
            # Raises if 'X' is in the sequence; such records are skipped.
            ProteinAnalysis(sequence).molecular_weight()
            feats = list(prot_feats_seq(sequence))
        except Exception:
            # Was a bare ``except:``; KeyboardInterrupt and SystemExit now
            # propagate instead of being treated as a bad record.
            continue
        XX.append(feats)
        ids.append(rec.id)
    return np.array(XX), ids
def my_own_filtering(input_file, output_file, filt_gc=45, filt_arom=0.01):
    """Filter nucleotide FASTA records by GC content and aromaticity.

    A record is kept when ``SeqUtils.GC`` of its sequence is at least
    ``filt_gc`` and the aromaticity of its translation is at least
    ``filt_arom``. The translations of the kept records are written as
    FASTA and the kept fraction is printed.

    Fixes over the previous version: ``record.se`` (AttributeError as soon
    as a record passed the filter) is now ``record.seq``; the result is
    written to ``output_file`` instead of the hard-coded ``'my_fasta'``;
    the output file is closed even on error; an input without records
    prints 0.0 instead of raising ZeroDivisionError.

    Args:
        input_file: Path of the nucleotide FASTA file to read.
        output_file: Path of the protein FASTA file to write.
        filt_gc: Minimum GC value as returned by ``SeqUtils.GC``.
        filt_arom: Minimum aromaticity of the translated sequence.
    """
    translated = {}
    total = 0
    with open(input_file, "r") as content:
        for record in SeqIO.parse(content, "fasta"):
            total += 1
            calc_gc = SeqUtils.GC(record.seq)
            prot_seq = record.seq.translate()
            calc_arom = ProteinAnalysis(str(prot_seq)).aromaticity()
            if calc_gc >= filt_gc and calc_arom >= filt_arom:
                # Keep the translation so it is not computed a second time.
                translated[record.id] = prot_seq

    records = [SeqRecord(prot_seq, id=seq_id, description="")
               for seq_id, prot_seq in translated.items()]
    with open(output_file, 'w') as write_file:
        SeqIO.write(records, write_file, 'fasta')

    # Fraction of input records that passed both filters.
    print(len(records) / total if total else 0.0)
def protParam(seq):
    """Print a ProteinAnalysis summary of *seq* to standard output.

    Printed, in order: amino-acid counts, amino-acid percentages, molecular
    weight, GRAVY, isoelectric point, aromaticity and an extinction
    coefficient computed as W*5690 + Y*1280 + C*120. All values are
    computed before anything is printed, so a failing computation produces
    no partial output. Returns None.

    The previously computed but never used ``mgml`` value was removed.
    """
    params = ProteinAnalysis(seq)
    mw = params.molecular_weight()
    c_aa = params.count_amino_acids()
    p_aa = params.get_amino_acids_percent()
    gravy = params.gravy()
    aromaticity = params.aromaticity()
    isoelectric_point = params.isoelectric_point()
    ext_coeff = c_aa["W"] * 5690 + c_aa["Y"] * 1280 + c_aa["C"] * 120

    print("Amino acid count")
    pprint.pprint(c_aa)
    print("Amino acid percent")
    pprint.pprint(p_aa)
    print("Molecular weight")
    print("%f Da" % mw)
    print("Gravy")
    print(gravy)
    print("Isoelectric point")
    print(isoelectric_point)
    print("Aromaticity")
    print(aromaticity)
    print("Extinction coefficient: %d M-1cm-1 (Assuming reduced)" % ext_coeff)
    print("")
def set_monoisotopic_mass(self):
    """Compute and store ``self._monoisotopic_weight``.

    After ``self._set_number_bridges()`` has run, the monoisotopic
    molecular weight of ``self.core`` (with every 'X' removed) is
    increased by ``2 * self._num_bridges`` and reduced by 18.02.
    """
    self._set_number_bridges()
    bridge_mass = 2 * self._num_bridges
    # dehydration indicative of cyclization
    dehydration = 18.02
    stripped_core = self.core.replace('X', '')
    core_mass = ProteinAnalysis(
        stripped_core, monoisotopic=True).molecular_weight()
    self._monoisotopic_weight = core_mass + bridge_mass - dehydration
def protein_properties(seq):
    """Return a ProtProp tuple with some protein biochemical properties

    seq is a Bio.Seq.Seq or str representing protein sequence. The
    instability index and GRAVY are reported as None when computing them
    raises KeyError.
    """
    pa = ProteinAnalysis(seq)

    def or_none(compute):
        """Call *compute*; map a KeyError to None."""
        try:
            return compute()
        except KeyError:
            return None

    aa_counts = pa.count_amino_acids()
    arom = pa.aromaticity()
    isoelec = pa.isoelectric_point()
    instability = or_none(pa.instability_index)
    gravy = or_none(pa.gravy)

    return ProtProp(aa=str(seq),
                    gravy=gravy,
                    aromaticity=arom,
                    isoelectric_point=isoelec,
                    instability=instability,
                    aa_counts=aa_counts)
def sequence_vector(temp_window: str, window: int = 6, chemical=1):
    """
    Encode a sequence window as a list of integers, one per residue.

    The input is passed through ``clean`` and then ``windower`` (centred on
    the middle of the cleaned sequence, with ``window`` residues per wing).
    Each residue is mapped to an int; "X" maps to 0, and 0 is also used to
    pad the vector up to ``2 * window + 1`` entries.
    When ``chemical == 1`` the GRAVY value, the instability index and the
    aromaticity of the window are appended.
    """
    temp_window = clean(temp_window)
    temp_window = windower(sequence=temp_window,
                           position=int(len(temp_window)*.5),
                           wing_size=window)

    residue_codes = {"G": 1, "A": 2, "L": 3, "M": 4, "F": 5, "W": 6, "K": 7,
                     "Q": 8, "E": 9, "S": 10, "P": 11, "V": 12, "I": 13,
                     "C": 14, "Y": 15, "H": 16, "R": 17, "N": 18, "D": 19,
                     "T": 20, "X": 0}
    vec = [residue_codes[residue] for residue in temp_window]

    # Pad with zeros; a non-positive difference adds nothing.
    target_length = (window*2)+1
    vec.extend([0] * (target_length - len(vec)))

    # Hydrophobicity is optional
    if chemical == 1:
        analysis = ProteinAnalysis(temp_window)
        vec.extend([analysis.gravy(),
                    analysis.instability_index(),
                    analysis.aromaticity()])
    return vec