def biopython_protein_analysis(inseq):
    """Utilize Biopython's ProteinAnalysis module to return general sequence properties of an amino acid string.

    For full definitions see:
        http://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam.ProteinAnalysis-class.html

    Args:
        inseq: Amino acid sequence

    Returns:
        dict: Dictionary of sequence properties; every key carries a
        ``-biop`` suffix. Some definitions include:
        instability_index: Any value above 40 means the protein is unstable (has a short half life).
        secondary_structure_fraction: Percentage of protein in helix, turn or sheet
            (stored under the three ``percent_*_naive-biop`` keys).

    TODO:
        Finish definitions of dictionary

    """
    inseq = ssbio.protein.sequence.utils.cast_to_str(inseq)

    analysed_seq = ProteinAnalysis(inseq)

    info_dict = {}
    info_dict['amino_acids_content-biop'] = analysed_seq.count_amino_acids()
    info_dict['amino_acids_percent-biop'] = analysed_seq.get_amino_acids_percent()
    info_dict['length-biop'] = analysed_seq.length
    info_dict['monoisotopic-biop'] = analysed_seq.monoisotopic
    info_dict['molecular_weight-biop'] = analysed_seq.molecular_weight()
    info_dict['aromaticity-biop'] = analysed_seq.aromaticity()
    info_dict['instability_index-biop'] = analysed_seq.instability_index()
    # TODO: What is flexibility?
    info_dict['flexibility-biop'] = analysed_seq.flexibility()
    info_dict['isoelectric_point-biop'] = analysed_seq.isoelectric_point()

    # grand average of hydrophobicity
    info_dict['gravy-biop'] = analysed_seq.gravy()

    # Separated secondary_structure_fraction into each definition.
    # Computed once and indexed, instead of recomputing it per key.
    ss_fraction = analysed_seq.secondary_structure_fraction()
    info_dict['percent_helix_naive-biop'] = ss_fraction[0]
    info_dict['percent_turn_naive-biop'] = ss_fraction[1]
    info_dict['percent_strand_naive-biop'] = ss_fraction[2]

    return info_dict
def phyChemProps(seq):
    """Return ten physicochemical descriptors of a protein sequence.

    Args:
        seq: Amino acid sequence accepted by ``ProteinAnalysis``.

    Returns:
        list: ``[aromaticity, ss_fraction[0], ss_fraction[1], ss_fraction[2],
        gravy, instability_index, isoelectric_point, molecular_weight,
        extinction_coefficient[0], extinction_coefficient[1]]``.
    """
    analysis = ProteinAnalysis(seq)

    aromaticity = analysis.aromaticity()
    # Each tuple-returning method is evaluated once and then indexed.
    ss_fraction = analysis.secondary_structure_fraction()
    gravy = analysis.gravy()
    instability = analysis.instability_index()
    isoelectric = analysis.isoelectric_point()
    weight = analysis.molecular_weight()
    extinction = analysis.molar_extinction_coefficient()

    return [
        aromaticity,
        ss_fraction[0],
        ss_fraction[1],
        ss_fraction[2],
        gravy,
        instability,
        isoelectric,
        weight,
        extinction[0],
        extinction[1],
    ]
def calculate_physiochemical_features(temp_dict, sequence):
    """Add physicochemical features of ``sequence`` to ``temp_dict`` in place.

    Args:
        temp_dict: Mutable mapping that receives the feature entries.
        sequence: Amino acid sequence accepted by ``ProteinAnalysis``.

    Returns:
        None. ``temp_dict`` is updated first with the named scalar features
        and then with the per-residue percentages.
    """
    protein = ProteinAnalysis(sequence)

    charge = protein.charge_at_pH(7)
    instability = protein.instability_index()
    weight = protein.molecular_weight()
    aromatic_fraction = protein.aromaticity()

    # The extinction coefficient comes back as a pair; store its midpoint.
    coeff_low, coeff_high = protein.molar_extinction_coefficient()
    extinction_mean = (float(coeff_low) + float(coeff_high)) / 2

    # Grand Average Hydropathy - higher value = more hydrophobic
    hydropathy = protein.gravy()
    pi_value = protein.isoelectric_point()
    helix, turn, sheet = protein.secondary_structure_fraction()

    temp_dict.update({
        "Charge at pH7": charge,
        "Instability Index": instability,
        "Molecular Wt": weight,
        "Aromaticity": aromatic_fraction,
        "Molar Extinction Coeff": extinction_mean,
        "Gravy": hydropathy,
        "Isoelectric pt": pi_value,
        "Helix Fraction": helix,
        "Turn Fraction": turn,
        "Sheet Fraction": sheet,
    })

    # Added separately because get_amino_acids_percent() is already a dict.
    temp_dict.update(protein.get_amino_acids_percent())
def protAnalysis(self, content):
    """Compute protein descriptors and return them in three flattened views.

    Args:
        content: Raw sequence; normalised with ``Parsers.normalizeSequence``
            using ``self.sourceType`` before analysis.

    Returns:
        tuple: ``(result, flattened, flattenedFlexDic)`` where

        * ``result`` merges the scalar descriptors with the output of
          ``self.flatten('proteinSecstruc', <secondary structure fraction>)``;
        * ``flattened`` additionally merges
          ``self.flatten('proteinFlex', <flexibility>)``;
        * ``flattenedFlexDic`` additionally merges
          ``self.flatten(list(content), <flexibility>)``.

        In every merge the scalar/``result`` entries win on key collisions.
        NOTE(review): the exact key layout depends on ``self.flatten``,
        which is defined outside this block.
    """
    content = Parsers.normalizeSequence(content, self.sourceType)
    protein = ProteinAnalysis(content)

    result = {
        'proteinMWeight': protein.molecular_weight(),
        'proteinAroma': protein.aromaticity(),
        'proteinInstab': protein.instability_index(),
        'proteinIsoelec': protein.isoelectric_point(),
        'proteinGravy': protein.gravy(),
    }

    proteinStructure = protein.secondary_structure_fraction()
    protStruct = self.flatten('proteinSecstruc', proteinStructure)
    # Merge the structure entries in front of the scalar descriptors.
    result = {**protStruct, **result}

    flexibility = protein.flexibility()
    flexibFlat = self.flatten('proteinFlex', flexibility)
    flexibAmino = self.flatten(list(content), flexibility)

    flattened = {**flexibFlat, **result}
    flattenedFlexDic = {**flexibAmino, **result}

    return result, flattened, flattenedFlexDic
def prot_param_features(seq):
    """Build a feature dict for one sequence record.

    Args:
        seq: Record-like object whose ``.seq`` attribute holds the protein
            sequence (it is converted with ``str(seq.seq)``).

    Returns:
        dict: ``frac_<residue>`` percentages, ``aromaticity``,
        ``isoelectric``, ``mol_weight`` and the three ``struc_*`` secondary
        structure fractions. ``mol_weight`` is absent only when the weight
        cannot be computed even after substituting ambiguous residues.
    """
    features = {}

    pa = ProteinAnalysis(str(seq.seq))  # .replace('X','G').replace('B','A')

    # 1. Amino Acid Percent
    aa = pa.get_amino_acids_percent()
    features.update({"frac_{}".format(k): v for k, v in aa.items()})

    # 2. Aromaticity
    features["aromaticity"] = pa.aromaticity()

    # 3. Isoelectric Point
    features["isoelectric"] = pa.isoelectric_point()

    # 4. Molecular Weight
    try:
        features["mol_weight"] = pa.molecular_weight()
    except ValueError:
        # Ambiguous residues have no defined weight; retry with the
        # substituted sequence so the feature is actually produced.
        replaced = str(seq.seq).replace('X', 'G').replace('B', 'N')
        try:
            features["mol_weight"] = ProteinAnalysis(replaced).molecular_weight()
        except ValueError:
            # Still not computable (other non-standard letters): leave the
            # key out rather than fail the whole record.
            pass

    # 5. Flexibility
    # try:
    #     features["flexibility"] = np.mean(pa.flexibility())
    # except KeyError:
    #     replaced = str(seq.seq).replace('X', 'G').replace('B', 'N').replace('U','C')
    #     features["flexibility"] = np.mean(ProteinAnalysis(replaced).flexibility())

    # 6. Secondary Structure Fraction
    struc = ["struc_helix", "struc_turn", "struc_sheet"]
    ss = pa.secondary_structure_fraction()
    features.update(dict(zip(struc, ss)))

    return features
def _protein_parameters(self, sequence):
    """Compute a vector of physicochemical properties for a sequence.

    Args:
        sequence: str, amino acid sequence.

    Returns:
        property_arr: np array holding, in order: molecular weight,
            aromaticity, instability index, gravy, isoelectric point, the
            three secondary structure fractions, the two molar extinction
            coefficients, and ``self._net_charge(sequence)``.
    """
    protein = ProteinAnalysis(sequence)

    values = [
        protein.molecular_weight(),
        protein.aromaticity(),
        protein.instability_index(),
        protein.gravy(),
        protein.isoelectric_point(),
    ]

    fractions = protein.secondary_structure_fraction()
    values.extend((fractions[0], fractions[1], fractions[2]))

    extinction = protein.molar_extinction_coefficient()
    values.extend((extinction[0], extinction[1]))

    values.append(self._net_charge(sequence))
    return np.array(values)
def get_secondary_structure(self):
    """Return the secondary structure fractions of ``self.sequence`` as text.

    Returns:
        tuple: ``(helix, turn, sheet)``, each formatted as a string with two
        decimal places.
    """
    fractions = ProteinAnalysis(self.sequence).secondary_structure_fraction()
    helix = format(fractions[0], "0.2f")
    turn = format(fractions[1], "0.2f")
    sheet = format(fractions[2], "0.2f")
    return helix, turn, sheet
def get_sec_struct(self):
    """
    Calculates the fraction of amino acids which tend to be in helix, turn or sheet (3 value) from biopython

    :return: dictionary with the 3 value of helix, turn, sheet
             (keys ``SecStruct_helix``, ``SecStruct_turn``, ``SecStruct_sheet``)
    """
    analysed_seq = ProteinAnalysis(self.ProteinSequence)
    # Evaluate the fraction tuple once instead of once per key.
    fractions = analysed_seq.secondary_structure_fraction()
    return {
        'SecStruct_helix': fractions[0],  # helix
        'SecStruct_turn': fractions[1],  # turn
        'SecStruct_sheet': fractions[2],  # sheet
    }
def get_protein_features(seq):
    """Return parallel lists of feature names and values for a sequence.

    The sequence is first passed through ``correct``. Weight, instability
    index and gravy come from the module-level helpers ``molecular_weight``,
    ``instability_index`` and ``hydrophobicity``; the rest comes from
    ``ProteinAnalysis``.

    Args:
        seq: Protein sequence.

    Returns:
        tuple: ``(names, values)``, two lists of equal order. The extinction
        coefficients are ``1490*Y + 5500*W`` and the same plus ``125*C``.
    """
    seq = correct(seq)
    analysis = ProteinAnalysis(seq)

    weight = molecular_weight(seq)
    isoelectric = analysis.isoelectric_point()

    counts = analysis.count_amino_acids()
    negative = counts['D'] + counts['E']
    positive = counts['K'] + counts['R']
    coeff_without_cys = counts['Y'] * 1490 + counts['W'] * 5500
    coeff_with_cys = counts['Y'] * 1490 + counts['W'] * 5500 + counts['C'] * 125

    instability = instability_index(seq)
    hydropathy = hydrophobicity(seq)
    fractions = list(analysis.secondary_structure_fraction())

    names = [
        'length', 'weight', 'pI', 'neg_charged_residues',
        'pos_charged_residues', 'extinction_coeff1', 'extinction_coeff2',
        'instability_index', 'gravy', 'helix', 'turn', 'sheet'
    ]
    values = [
        len(seq), weight, isoelectric, negative, positive,
        coeff_without_cys, coeff_with_cys, instability, hydropathy,
    ]
    values.extend(fractions)
    return names, values
def protein_analysis():
    """Show a sequence form and store ProteinAnalysis results in the session.

    Users without ``session.username`` are redirected to ``account/log_in``.
    When the form is accepted, the cleaned, upper-cased sequence and its
    computed properties are written to ``session`` and the request is
    redirected to ``protein_analysis_output``.

    NOTE(review): ``session``, ``request``, ``redirect``, ``URL``, the HTML
    helpers and ``seqClean`` are supplied from outside this block (this looks
    like a web2py controller action - confirm).

    Returns:
        dict: ``{'form': form}`` for rendering when no redirect happens.
    """
    if session.username is None:
        redirect(URL(r=request, c='account', f='log_in'))

    from Bio.SeqUtils.ProtParam import ProteinAnalysis

    form = FORM(
        TABLE(
            TR("Amino acid sequence:  ",
               TEXTAREA(_type="text", _name="sequence",
                        requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))

    if form.accepts(request.vars, session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
def get_protein_analysis(aa):
    """Return a flat list of protein descriptors for the sequence ``aa``.

    Returns:
        list: molecular weight, aromaticity, instability index, isoelectric
        point and gravy, followed by the secondary structure fractions.
    """
    protein = ProteinAnalysis(aa)
    descriptors = [
        protein.molecular_weight(),
        protein.aromaticity(),
        protein.instability_index(),
        protein.isoelectric_point(),
        protein.gravy(),
    ]
    descriptors.extend(protein.secondary_structure_fraction())
    return descriptors
def secondary_structure(self, record):
    '''
    Input:
        - record: a SeqRecord
    Output:
        - the value returned by ProteinAnalysis.secondary_structure_fraction()
          for the record's sequence (a tuple of fractions)
    '''
    analysis = ProteinAnalysis(str(record.seq))
    fractions = analysis.secondary_structure_fraction()
    return fractions
def get_biopython_features(X):
    """Compute six ProteinAnalysis features per sequence.

    Args:
        X: Array-like of sequences exposing ``.shape[0]`` and iteration.

    Returns:
        np.ndarray of shape ``(X.shape[0], 6)``; each row holds molecular
        weight, instability index, isoelectric point and the three secondary
        structure fractions.
    """
    features = np.zeros((X.shape[0], 6))
    for row, sequence in enumerate(X):
        protein = ProteinAnalysis(sequence)
        row_values = [
            protein.molecular_weight(),
            protein.instability_index(),
            protein.isoelectric_point(),
        ]
        row_values.extend(protein.secondary_structure_fraction())
        features[row] = np.array(row_values)
    return features
def processSeq(seq):
    '''
    Build the feature vector for one protein sequence.

    Protein features found:
        - Sequence Length
        - Amino Acid Composition (global)
        - Amino Acid Composition (First 50/Last 50)
        - Isoelectric Point
        - Aromacity
        - Grand Average Hydropathy (Gravy)
        - Molecular Weight (global)
        - Molecular Weight (First 50/Last 50)
        - Secondary Structure Fraction

    Sequences of 50 residues or fewer reuse the global values for the
    head/tail features.

    Returns a list: ten scalar features, then the global, head and tail
    amino acid distributions, then the secondary structure fractions.
    '''
    analysis = ProteinAnalysis(seq)
    n_residues = len(seq)

    pi_value = analysis.isoelectric_point()
    hydropathy = calculateGravy(seq, 0, n_residues)
    aromatic = analysis.aromaticity()
    fractions = analysis.secondary_structure_fraction()

    weight_all = calculateMolecularWeight(seq, 0, n_residues)
    composition_all = getAAPercent(seq, 0, n_residues)
    flex_all = calculateFlexibility(seq, 0, n_residues)

    if n_residues <= 50:
        # Too short for separate 50-residue windows: fall back to globals.
        composition_head = composition_tail = composition_all
        weight_head = weight_tail = weight_all
        flex_head = flex_tail = flex_all
    else:
        tail_start = n_residues - 50
        composition_head = getAAPercent(seq, 0, 50)
        composition_tail = getAAPercent(seq, tail_start, n_residues)
        weight_head = calculateMolecularWeight(seq, 0, 50)
        weight_tail = calculateMolecularWeight(seq, tail_start, n_residues)
        flex_head = calculateFlexibility(seq, 0, 50)
        flex_tail = calculateFlexibility(seq, tail_start, n_residues)

    scalars = [
        n_residues, aromatic, pi_value, weight_all, weight_head,
        weight_tail, hydropathy, flex_all, flex_head, flex_tail,
    ]
    return (scalars + composition_all + composition_head + composition_tail
            + list(fractions))
def make_dataset(fasta):
    """Extract per-sequence features from a FASTA file and pickle them.

    The class label is derived from the file path: ``'tar'`` in the path
    gives target 0, otherwise ``'pop'`` gives target 1.

    Args:
        fasta: Path of the FASTA file to parse.

    Raises:
        ValueError: If the path contains neither ``'tar'`` nor ``'pop'``, so
            no target label can be assigned.

    Side effects:
        Prints the resulting DataFrame and writes it with
        ``df.to_pickle(name + '_set.pkl')``.
        NOTE(review): ``name`` is not defined in this function; it is assumed
        to be a module-level variable - confirm.
    """
    # assign whether it's from tardigrades 'tar' or poplars 'pop'
    if 'tar' in fasta:
        target = 0
    elif 'pop' in fasta:
        target = 1
    else:
        # Fail before parsing instead of hitting an unbound `target` later.
        raise ValueError(
            "cannot infer target from %r: expected 'tar' or 'pop' in the path"
            % (fasta,))

    # a list of dictionaries containing features for all sequences
    ls_features = []

    for record in SeqIO.parse(fasta, "fasta"):
        analysed_seq = ProteinAnalysis(str(record.seq))

        # the dictionary containing features for a single sequence
        dict_features = {}
        dict_features['length'] = len(record.seq)
        dict_features['mol_weight'] = analysed_seq.molecular_weight()
        # Use the aromaticity value here, not a second copy of the weight.
        dict_features['aromaticity'] = analysed_seq.aromaticity()
        # stored under 'stability' but holds the instability index
        dict_features['stability'] = analysed_seq.instability_index()
        dict_features['flexibility'] = analysed_seq.flexibility()
        dict_features['isoelectric'] = analysed_seq.isoelectric_point()

        # secondary structure fraction
        frac = analysed_seq.secondary_structure_fraction()
        dict_features['helix'] = frac[0]
        dict_features['turn'] = frac[1]
        dict_features['sheet'] = frac[2]

        # amino acid composition of the entire sequence, merged in last
        dict_features.update(analysed_seq.get_amino_acids_percent())

        ls_features.append(dict_features)

    df = pd.DataFrame(ls_features)
    df['target'] = target
    print(df)
    df.to_pickle(name + '_set.pkl')
def find_composition(df_original):
    """Append composition and physicochemical feature columns to a frame.

    Each value in ``df_original['seq']`` is analysed as a whole, over its
    first ``first_n`` residues and over its last ``last_n`` residues.

    NOTE(review): ``codes``, ``first_n``, ``last_n`` and ``tqdm`` come from
    module scope outside this block.

    Args:
        df_original: DataFrame with a ``seq`` column. It is not modified.

    Returns:
        DataFrame: a copy of ``df_original`` concatenated column-wise with
        the feature columns (per-residue ``*_percent``, ``*_percent_first``,
        ``*_percent_last`` for every code, followed by the scalar features).
    """
    df_copy = df_original.copy()

    column_names = []
    for ch in codes:
        column_names.extend(
            (ch + '_percent', ch + '_percent_first', ch + '_percent_last'))
    column_names.extend([
        'len', 'weight', 'gravy', 'flex_mean', 'flex_std',
        'ss_helix', 'ss_turn', 'ss_sheet', 'iep', 'aromaticity',
    ])

    # Collect plain dict rows and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every row.
    rows = []
    for seq in tqdm(df_copy['seq']):
        sequence = str(seq)
        analysed = ProteinAnalysis(sequence)
        analysed_first = ProteinAnalysis(sequence[:first_n])
        analysed_last = ProteinAnalysis(sequence[-last_n:])

        row = {}
        row['len'] = analysed.length
        row['ss_helix'], row['ss_turn'], row['ss_sheet'] = (
            analysed.secondary_structure_fraction())
        row['iep'] = analysed.isoelectric_point()

        # overall
        for aa, percent in analysed.get_amino_acids_percent().items():
            row[aa + '_percent'] = percent
        # first N
        for aa, percent in analysed_first.get_amino_acids_percent().items():
            row[aa + '_percent_first'] = percent
        # last N
        for aa, percent in analysed_last.get_amino_acids_percent().items():
            row[aa + '_percent_last'] = percent

        row['weight'] = analysed.molecular_weight()
        row['gravy'] = analysed.gravy()
        row['aromaticity'] = analysed.aromaticity()

        # Compute the flexibility profile once for both statistics.
        flexibility = analysed.flexibility()
        row['flex_mean'] = np.mean(flexibility)
        row['flex_std'] = np.std(flexibility)

        rows.append(row)

    df = pd.DataFrame(rows)
    # Declared columns first (present even with no rows), then any extra
    # keys that appeared in the rows, as appending row by row would give.
    extra_columns = [col for col in df.columns if col not in column_names]
    df = df.reindex(columns=column_names + extra_columns)

    return pd.concat([df_copy, df], axis=1)
def get_structure_perc(seq, structure="helix"):
    """Return one secondary structure fraction of ``seq``.

    Args:
        seq: Amino acid sequence accepted by ``ProteinAnalysis``.
        structure: ``"helix"`` or ``"turn"``; any other value selects the
            sheet fraction.

    Returns:
        The selected fraction.
    """
    helix, turn, sheets = ProteinAnalysis(seq).secondary_structure_fraction()
    if structure == "helix":
        return helix
    if structure == "turn":
        return turn
    return sheets
def bio_feat(record):
    """Return a numeric feature vector for one sequence record.

    Non-standard letters are handled before analysis: ``X`` is removed,
    ``U`` becomes ``C``, ``B`` becomes ``N`` and ``Z`` becomes ``Q``.

    Args:
        record: Record-like object whose ``.seq`` holds the protein sequence.

    Returns:
        np.ndarray: cleaned length, molecular weight, isoelectric point and
        instability index, followed by the secondary structure fractions,
        the amino acid counts and the amino acid percentages.
    """
    # Plain string operations give the same cleaned sequence as the former
    # MutableSeq round trip and do not depend on MutableSeq.toseq().
    clean_seq = (str(record.seq)
                 .replace("X", "")
                 .replace("U", "C")
                 .replace("B", "N")
                 .replace('Z', 'Q'))

    ### features
    seq_length = len(clean_seq)
    analysed_seq = ProteinAnalysis(clean_seq)
    molecular_weight = analysed_seq.molecular_weight()
    amino_percent = analysed_seq.get_amino_acids_percent().values()
    isoelectric_points = analysed_seq.isoelectric_point()
    count = analysed_seq.count_amino_acids().values()
    # aromaticity = analysed_seq.aromaticity()
    instability_index = analysed_seq.instability_index()
    # hydrophobicity = analysed_seq.protein_scale(ProtParamData.kd, 5, 0.4)
    secondary_structure_fraction = analysed_seq.secondary_structure_fraction()

    return np.array(
        [seq_length, molecular_weight, isoelectric_points, instability_index]
        + list(secondary_structure_fraction)
        + list(count)
        + list(amino_percent))
def protein_analysis():
    """Show a sequence form and store ProteinAnalysis results in the session.

    Users without ``session.username`` are redirected to the log-in page.
    When the form is accepted, the cleaned, upper-cased sequence and its
    computed properties are written to ``session`` and the request is
    redirected to ``protein_analysis_output``.

    NOTE(review): ``session``, ``request``, ``redirect``, ``URL``, the HTML
    helpers and ``seqClean`` are supplied from outside this block (this looks
    like a web2py controller action - confirm).

    Returns:
        dict: ``{'form': form}`` for rendering when no redirect happens.
    """
    if session.username is None:
        redirect(URL(r=request, f='../account/log_in'))

    from Bio.SeqUtils.ProtParam import ProteinAnalysis

    form = FORM(
        TABLE(
            TR("Amino acid sequence:  ",
               TEXTAREA(_type="text", _name="sequence",
                        requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))

    if form.accepts(request.vars, session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
def get_features(seq):
    """Compute global features of a protein sequence.

    Count- and entropy-based features use the raw sequence; the
    ``ProteinAnalysis`` features use the sequence with the undefined
    symbols (``X B Z ' O U``) removed.

    Parameters
    ----------
    seq : str
        protein sequence

    Return
    ----------
    dictionary:
        global features of the protein sequence, including one entry per
        amino acid count
    """
    undefined_symbols = ('X', 'B', 'Z', "'", 'O', 'U')
    n_residues = len(seq)

    features = {}
    features['undefined_count'] = sum(
        1 for residue in seq if residue in undefined_symbols)
    features['length'] = n_residues
    features['perc_undefined_count'] = (
        features['undefined_count'] / features['length'])
    features['entropy'] = entropy(seq)
    features['ideal_entropy'] = entropy_ideal(n_residues)
    features['perc_entropy'] = features['entropy'] / features['ideal_entropy']
    features['hydr_count'] = sum(
        1 for residue in seq if residue in hydrophobic_proteins)
    features['polar_count'] = sum(
        1 for residue in seq if residue in polar_proteins)
    features['buried'] = sum(
        buried[residue] for residue in seq if residue in hydrophobic_proteins)

    # ProteinAnalysis only sees the residues that are not undefined symbols.
    seq = ''.join(
        residue for residue in seq if residue not in undefined_symbols)
    protein = ProteinAnalysis(seq)
    features['gravy'] = protein.gravy()
    features['molecular_weight'] = protein.molecular_weight()
    features['aromaticity'] = protein.aromaticity()
    features['instability_index'] = protein.instability_index()
    features['isoelectric_point'] = protein.isoelectric_point()
    helix, turn, sheet = protein.secondary_structure_fraction()
    features['helix'] = helix
    features['turn'] = turn
    features['sheet'] = sheet
    features.update(protein.count_amino_acids())
    # features.update(protein.get_amino_acids_percent())
    return features
def GetFeatures(My_seq):
    """Collect ProteinAnalysis features of ``My_seq`` into a dict.

    Args:
        My_seq: Amino acid sequence accepted by ``ProteinAnalysis``.

    Returns:
        dict: scalar entries ``Molecular_weight``, ``Aromaticity``,
        ``Instability_index`` and ``Isoelectric_point``, plus the
        multi-valued ``Flexibility``, ``Second_structure_fraction``,
        ``Count_amino_acids`` and ``Amino_acids_percent``.
    """
    Features = {}
    # One analysis object is enough; a second, discarded one was dropped.
    analysed_seq = ProteinAnalysis(My_seq)

    # Single-valued features
    Features["Molecular_weight"] = analysed_seq.molecular_weight()
    Features["Aromaticity"] = analysed_seq.aromaticity()
    Features["Instability_index"] = analysed_seq.instability_index()
    Features["Isoelectric_point"] = analysed_seq.isoelectric_point()

    # Multi-valued features
    Features["Flexibility"] = analysed_seq.flexibility()  # list (original note: 580)
    Features["Second_structure_fraction"] = analysed_seq.secondary_structure_fraction()  # 3-tuple
    Features["Count_amino_acids"] = analysed_seq.count_amino_acids()  # dict (original note: 20)
    Features["Amino_acids_percent"] = analysed_seq.get_amino_acids_percent()  # dict (original note: 20)

    return Features
def seqs_to_features(self, seqs, no_seqs):
    """Extract the feature matrix from groups of sequences.

    Args:
        seqs: Iterable of iterables of sequences; they are chained and
            processed in order.
        no_seqs: Number of rows to allocate in the result.

    Returns:
        np.ndarray of shape ``(no_seqs, 32)``. Columns 0-22 hold the
        per-letter frequencies for ``ABCDEFGHIKLMNPQRSTUVWXY``; counted from
        the end, the last nine columns hold molecular weight, aromaticity,
        instability index, isoelectric point, three secondary structure
        fractions, sequence length and gravy.
    """
    alphabet = 'ABCDEFGHIKLMNPQRSTUVWXY'  # no JOZ
    X = np.zeros((no_seqs, 32))

    for row, s in enumerate(chain(*seqs)):  # iterate over all sequences
        # relative frequency of every alphabet letter
        for col, letter in enumerate(alphabet):
            X[row, col] = s.count(letter) / len(s)

        # other analysis, with X, B and U replaced by alanine
        cleaned = s.replace('X', 'A').replace('B', 'A').replace('U', 'A')
        analysis = ProteinAnalysis(cleaned)
        X[row, -1] = analysis.molecular_weight()
        X[row, -2] = analysis.aromaticity()
        X[row, -3] = analysis.instability_index()
        X[row, -4] = analysis.isoelectric_point()
        fractions = analysis.secondary_structure_fraction()
        X[row, -5] = fractions[0]
        X[row, -6] = fractions[1]
        X[row, -7] = fractions[2]
        X[row, -8] = len(s)
        X[row, -9] = analysis.gravy()  # mean hydrophobicity

    return X
def physchem_props(ara_d):
    """Calculate the physicochemical properties per protein in ara_d.

    Proteins whose sequence contains ``X`` are skipped. Proteins whose
    sequence contains ``*`` are skipped as well; their key is printed when
    their ``"pos"`` entry is not an empty list.

    Args:
        ara_d: Mapping of protein key to a dict with at least ``"sequence"``
            and ``"pos"`` entries. It is updated in place.

    Returns:
        The same ``ara_d`` object, where every analysed protein gained a
        ``"Properties"`` dict with keys ``mol_weight``, ``gravy``,
        ``aromaticity``, ``instab_index``, ``flexi``, ``iso_point`` and
        ``seq_struct``.
    """
    keys = [
        "mol_weight",
        "gravy",
        "aromaticity",
        "instab_index",
        "flexi",
        "iso_point",
        "seq_struct",
    ]
    for protein in ara_d:
        seq = ara_d[protein]["sequence"]
        if "X" in seq:
            continue  # Skip non-usable sequences, only negs
        if '*' in seq:
            if ara_d[protein]["pos"] != []:
                print(protein)
            continue

        a_seq = ProteinAnalysis(seq)
        results = [
            a_seq.molecular_weight(),
            a_seq.gravy(),
            a_seq.aromaticity(),
            a_seq.instability_index(),
            a_seq.flexibility(),
            a_seq.isoelectric_point(),
            a_seq.secondary_structure_fraction(),
        ]
        # Update ara_d with new physchem properties
        ara_d[protein]["Properties"] = dict(zip(keys, results))
    return ara_d
def add_protein_characteristics(df):
    """Return a copy of ``df`` with composition and property columns added.

    Ambiguous/rare letters in each ``df['sequence']`` value are substituted
    (B->D, Z->E, J->L, X->G, U->C, O->K) before analysis.

    Args:
        df: DataFrame with a ``sequence`` column. It is not modified.

    Returns:
        DataFrame: the copy, extended with one percentage column per amino
        acid followed by ``aromaticity``, ``helix``, ``turn``, ``sheet``,
        ``isoelectric_point`` and ``gravy``.
    """
    df = df.copy()

    aa_list = [
        'A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q', 'P',
        'S', 'R', 'T', 'W', 'V', 'Y'
    ]
    aa_dict = {aa: [] for aa in aa_list}
    property_names = ('aromaticity', 'helix', 'turn', 'sheet',
                      'isoelectric_point', 'gravy')
    # , 'flexibility', 'instability_index'
    prop_dict = {prop: [] for prop in property_names}

    for raw in df['sequence']:
        cleaned = (raw.replace('B', 'D').replace('Z', 'E').replace('J', 'L')
                   .replace('X', 'G').replace('U', 'C').replace('O', 'K'))
        pa = ProteinAnalysis(cleaned)

        prop_dict['aromaticity'].append(pa.aromaticity())
        prop_dict['isoelectric_point'].append(pa.isoelectric_point())
        prop_dict['gravy'].append(pa.gravy())
        # prop_dict['instability_index'].append(pa.instability_index())
        # prop_dict['flexibility'].append(np.mean(pa.flexibility()))

        for label, fraction in zip(('helix', 'turn', 'sheet'),
                                   pa.secondary_structure_fraction()):
            prop_dict[label].append(fraction)

        for residue, percent in pa.get_amino_acids_percent().items():
            aa_dict[residue].append(percent)

    for column, values in aa_dict.items():
        df[column] = values
    for column, values in prop_dict.items():
        df[column] = values

    return df
def biopython_proteinanalysis_seq(seq, scaling=False):
    """Return a flat dict of ProteinAnalysis descriptors for ``seq``.

    Args:
        seq: Amino acid sequence accepted by ``ProteinAnalysis``.
        scaling: Accepted but not used by this function.

    Returns:
        dict: flexibility min/max/std, gravy, instability index,
        isoelectric point, the two molar extinction coefficients, molecular
        weight, the three naive secondary structure percentages, one
        ``percent:<aa>`` entry per residue and one ``prop_res_<key>`` entry
        per group in the module-level ``property_residues`` mapping (the
        summed percentages of that group's residues).
    """
    analysis = ProteinAnalysis(seq)
    d = {}

    flexibility = np.array(analysis.flexibility())
    d['flex:min'] = flexibility.min()
    d['flex:max'] = flexibility.max()
    d['flex:std'] = flexibility.std()

    d['gravy'] = analysis.gravy()
    d['instability_index'] = analysis.instability_index()
    d['isoelectric_point'] = analysis.isoelectric_point()

    coeff_reduced, coeff_cysteines = analysis.molar_extinction_coefficient()
    d['molar_extinction_coefficient_reduced'] = coeff_reduced
    d['molar_extinction_coefficient_cysteines'] = coeff_cysteines

    d['molecular_weight'] = analysis.molecular_weight()

    helix, turn, strand = analysis.secondary_structure_fraction()
    d['percent_helix_naive'] = helix
    d['percent_turn_naive'] = turn
    d['percent_strand_naive'] = strand

    percentages = analysis.get_amino_acids_percent()
    for residue in sorted(percentages.keys()):
        d['percent:%s' % residue] = percentages[residue]

    for group, residues in property_residues.items():
        d['prop_res_%s' % group] = sum(
            percentages.get(residue, 0) for residue in residues)

    return d
def openfile():
    """Ask for a PDB file, print sequence properties and fill module globals.

    Steps, in order:
      1. Open a Tk file dialog and parse the chosen file with ``PDBParser``.
      2. Build peptides with ``PPBuilder`` and keep the sequence of the last
         peptide iterated as the working sequence ``m``.
      3. Print molecular weight, amino acid counts and secondary structure
         fraction of ``m``; compute flexibility and a ``protein_scale``
         profile (window 10, edge 1.0) from the ``kd`` table.
      4. Slide a 10-residue window over ``m`` and append the first
         secondary structure fraction of each window to ``sec``.
      5. Run freesasa on the hard-coded file ``"1e6j.pdb"`` for successive
         residue ranges, collect the selection areas in ``acc`` and store
         every second value in ``access`` and its base-10 log in ``logacc``.

    All results are communicated through the ``global`` names declared
    below; the function returns ``None``.

    NOTE(review): the source of this function was collapsed onto two lines,
    so the loop nesting below is reconstructed from data flow - confirm,
    especially the extent of the final ``for i in range(0, f)`` loop.
    """
    global prob, probab, te
    global my_seq
    global anti
    global structure, structure_id, filename
    global antigenicity, hydro, flex, sec
    global m, a, c, b, length, j, k
    global hydroph, flexi, access
    anti = []
    sec = []
    probab = []
    from tkinter import filedialog
    root = Tk()
    root.filename = filedialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdb files", "*.pdb"), ("pdb files", "*.pdb")))
    filename = root.filename
    print(filename)
    # The structure id is fixed regardless of which file was selected.
    structure_id = "1e6j"
    structure = PDBParser().get_structure(structure_id, root.filename)
    ppb = PPBuilder()
    # my_seq ends up holding the sequence of the last peptide only.
    for pp in ppb.build_peptides(structure):
        my_seq = pp.get_sequence()  # type: Seq
        print(my_seq)
    for model in structure:
        for chain in model:
            print(chain)
    sequence = list(my_seq)
    m = ''.join(sequence)
    print(m)
    length = len(m)  # type: int
    print("Sequence consist of", length, "Amino Acids")
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    analysed_seq = ProteinAnalysis(m)
    print("Molecular weight = ", analysed_seq.molecular_weight())
    print("Amino Acid Count = ", analysed_seq.count_amino_acids())
    print("Secondary structure fraction =",
          analysed_seq.secondary_structure_fraction())
    # Per-residue scale handed to protein_scale below.
    kd = {
        'A': 1.8,
        'R': -4.5,
        'N': -3.5,
        'D': -3.5,
        'C': 2.5,
        'Q': -3.5,
        'E': -3.5,
        'G': -0.4,
        'H': -3.2,
        'I': 4.5,
        'L': 3.8,
        'K': -3.9,
        'M': 1.9,
        'F': 2.8,
        'P': -1.6,
        'S': -0.8,
        'T': -0.7,
        'W': -0.9,
        'Y': -1.3,
        'V': 4.2
    }
    # c/flex/flexi hold the same flexibility profile and b/hydro/hydroph the
    # same scale profile; each is recomputed as a separate list.
    c = list(analysed_seq.flexibility())
    b = list(analysed_seq.protein_scale(kd, 10, 1.0))
    hydro = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flex = list(analysed_seq.flexibility())
    hydroph = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flexi = list(analysed_seq.flexibility())
    # i counts windows; j + 1 and k + 1 are the slice bounds of the window.
    i = 1
    j = -1  # type: int
    k = 9
    while i <= (length - 10):
        print("Sequence is = ", m[j + 1:k + 1])
        print("Flexibility value = ", c[j + 1])
        print("Hydrophilicity value = ", b[j + 1])
        ana_seq = ''.join(m[j + 1:k + 1])
        analyze_seq = ProteinAnalysis(ana_seq)
        # For Secondary structure Analysis
        print("Secondary structure fraction =",
              analyze_seq.secondary_structure_fraction())
        a = list(analyze_seq.secondary_structure_fraction())
        # Only the first fraction of the window is kept.
        a = a[0]
        sec.append(a)
        i += 1
        j += 1
        k += 1
    f = length
    r = 1
    y = 10
    global acc, logacc
    acc = []
    for i in range(0, f):
        # Build the freesasa selection "accessibility, resi <r>-<y>".
        str1 = "accessibility, resi "
        str2 = str(r) + "-" + str(y)
        saving = str1 + str2
        print(saving)
        r = r + 1
        y = y + 1
        # Rebinds the global `structure` to a freesasa structure read from
        # a hard-coded path, not from the file chosen in the dialog.
        structure = freesasa.Structure("1e6j.pdb")
        resulta = freesasa.calc(structure)
        area_classes = freesasa.classifyResults(resulta, structure)
        print("Total : %.2f A2" % resulta.totalArea())
        for key in area_classes:
            print(key, ": %.2f A2" % area_classes[key])
        resulta = freesasa.calc(
            structure,
            freesasa.Parameters({
                'algorithm': freesasa.LeeRichards,
                'n-slices': 10
            }))
        selections = freesasa.selectArea(('alanine, resn ala', saving),
                                         structure, resulta)
        for key in selections:
            print(key, ": %.2f A2" % selections[key])
            a = selections[key]
            acc.append(a)
    # Keep every second collected area.
    l = acc[0::2]
    access = l
    print(acc)
    print(l)
    logacc = [math.log(y, 10) for y in l]
    print(logacc)
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
from Bio import SeqIO

# Print the ProteinAnalysis report of every FASTA record in the sample file.
with open('../../samples/pdbaa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        protein = ProteinAnalysis(str(record.seq))
        # Each report is computed and printed in turn.
        reports = (
            protein.count_amino_acids,
            protein.get_amino_acids_percent,
            protein.molecular_weight,
            protein.aromaticity,
            protein.instability_index,
            protein.flexibility,
            protein.isoelectric_point,
            protein.secondary_structure_fraction,
        )
        for report in reports:
            print(report())
        print(protein.protein_scale(ProtParamData.kd, 9, .4))
aromaticity = []
instability_index = []
# flexibility = []
isoelectric_point = []
secondary_structure_fraction = []

# Both inputs get identical treatment: every value is stored next to the
# command-line argument that names its source (argv[1] for sequences_a,
# argv[2] for sequences_b). "X" residues are dropped before analysis.
for argv_index, records in ((1, sequences_a), (2, sequences_b)):
    for protein in records:
        analysed_seq = ProteinAnalysis(str(protein.seq).replace("X", ""))
        label = sys.argv[argv_index]
        gravy_index.append([label, analysed_seq.gravy()])
        aromaticity.append([label, analysed_seq.aromaticity()])
        instability_index.append([label, analysed_seq.instability_index()])
        # flexibility.append([label, analysed_seq.flexibility()])
        isoelectric_point.append([label, analysed_seq.isoelectric_point()])
        secondary_structure_fraction.append(
            [label, analysed_seq.secondary_structure_fraction()])

# Box plot showing gravy indexes
gravy_index = pd.DataFrame(gravy_index, columns=["Filename", "Gravy Index"])
def openfile():
    """Ask for a PDB file and print a per-window antigenicity index.

    The chosen file is parsed with ``PDBParser``; the sequence of the last
    peptide built by ``PPBuilder`` becomes the working sequence ``m``. For
    every 10-residue window the function prints the window, its
    flexibility, scale and secondary structure values, converts them to
    discrete scores and prints::

        antigenicity = 0.3 * hydrophilicity_score + 0.15 * 1
                       + 0.15 * flexibility_score + 0.2 * structure_score

    Hydrophilicity score bands (applied to the ``protein_scale`` value):
    ``> 0.5`` -> 2, ``(0, 0.5]`` -> 1, ``(-0.4, 0]`` -> -1, ``< -0.4`` -> -2,
    exactly ``-0.4`` -> 0. The bands are tested as real ranges so that all
    of them are reachable.

    Results are exposed through the globals ``my_seq``, ``m``, ``a``,
    ``b``, ``c``, ``tupleall`` and ``antigenicity``; the scored entries of
    ``b`` and ``c`` and the first entry of ``a`` are overwritten in place.
    Returns ``None``.
    """
    global my_seq
    global antigenicity
    global m, a, c, b
    global tupleall
    from tkinter import filedialog
    root = Tk()
    root.filename = filedialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdb files", "*.pdb"), ("pdb files", "*.pdb")))
    print(root.filename)
    # The structure id is fixed regardless of which file was selected.
    structure_id = "1e6j"
    structure = PDBParser().get_structure(structure_id, root.filename)
    ppb = PPBuilder()
    # my_seq ends up holding the sequence of the last peptide only.
    for pp in ppb.build_peptides(structure):
        my_seq = pp.get_sequence()  # type: Seq
        print(my_seq)
    for model in structure:
        for chain in model:
            print(chain)
    sequence = list(my_seq)
    m = ''.join(sequence)  # type: str
    print(m)
    length = len(m)  # type: int
    print(length)
    print("Sequence consist of", len(m), "Amino Acids")
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    analysed_seq = ProteinAnalysis(m)
    print("Molecular weight = ", analysed_seq.molecular_weight())
    print("Amino Acid Count = ", analysed_seq.count_amino_acids())
    print("Secondary structure fraction =",
          analysed_seq.secondary_structure_fraction())
    # Per-residue scale handed to protein_scale below.
    kd = {
        'A': 1.8,
        'R': -4.5,
        'N': -3.5,
        'D': -3.5,
        'C': 2.5,
        'Q': -3.5,
        'E': -3.5,
        'G': -0.4,
        'H': -3.2,
        'I': 4.5,
        'L': 3.8,
        'K': -3.9,
        'M': 1.9,
        'F': 2.8,
        'P': -1.6,
        'S': -0.8,
        'T': -0.7,
        'W': -0.9,
        'Y': -1.3,
        'V': 4.2
    }
    c = list(analysed_seq.flexibility())
    b = list(analysed_seq.protein_scale(kd, 10, 1.0))
    # i counts windows; j + 1 and k + 1 are the slice bounds of the window.
    i = 1
    j = -1  # type: int
    k = 9
    while i <= (length - 10):
        print("Sequence is = ", m[j + 1:k + 1])
        print("Flexibility value = ", c[j + 1])
        print("Hydrophilicity value = ", b[j + 1])
        ana_seq = ''.join(m[j + 1:k + 1])
        analyze_seq = ProteinAnalysis(ana_seq)
        # For Secondary structure Analysis (computed once per window)
        window_fractions = analyze_seq.secondary_structure_fraction()
        print("Secondary structure fraction =", window_fractions)
        a = list(window_fractions)
        # tupleall keeps a reference to `a`, whose first entry is rescored
        # just below.
        tupleall = (m[j + 1:k + 1], c[j + 1], b[j + 1], a)
        print(tupleall[0], tupleall[2], tupleall[1], tupleall[3])
        i = i + 1
        if a[0] >= a[1]:
            a[0] = 1
        else:
            a[0] = a[1]
        # For Hydrophilicity: each band is an actual range test, checked
        # from the highest band down.
        scale_value = b[j + 1]
        if scale_value > 0.5:
            b[j + 1] = 2
        elif scale_value > 0:
            b[j + 1] = 1
        elif scale_value > -0.4:
            b[j + 1] = -1
        elif scale_value < -0.4:
            b[j + 1] = -2
        else:
            b[j + 1] = 0
        # For Flexibility
        if c[j + 1] > 1.0:
            c[j + 1] = 1
        else:
            c[j + 1] = 0
        # For antigenicity Index
        antigenicity = (0.3 * b[j + 1] + 0.15 * 1 + 0.15 * c[j + 1]
                        + 0.2 * a[0])
        print("antigenicity", antigenicity)
        j += 1
        k += 1
# Extract the archive, then collect per-record features from every PDB file
# into the accumulator lists/dicts defined by the surrounding code.
zip_.extractall(tmpdir.path)
for test_pdb in tmpdir.path.glob("*.pdb"):
    for record in SeqIO.parse(test_pdb, "pdb-atom"):
        # Unknown residues are analysed as glycine.
        sequence = str(record.seq).replace('X', 'G')
        protein = ProteinAnalysis(str(sequence))
        p_len.append(len(sequence))
        mol_w.append(protein.molecular_weight())
        iso_p.append(protein.isoelectric_point())
        smell.append(protein.aromaticity())
        taste_factor.append(protein.gravy())
        insta_ind.append(protein.instability_index())
        char_at_acid.append(protein.charge_at_pH(1))
        char_at_neutral.append(protein.charge_at_pH(7))
        char_at_base.append(protein.charge_at_pH(14))

        # Secondary structure fractions: compute once, store each part.
        ss_fractions = protein.secondary_structure_fraction()
        helter_skeler.append(ss_fractions[0])
        turnip.append(ss_fractions[1])
        garfield.append(ss_fractions[2])

        # Counts and percentages are computed once per record; a value goes
        # to every accumulator key whose last character is the residue.
        residue_counts = protein.count_amino_acids()
        for x in amino_acids:
            n = residue_counts[x]
            for y in d_count.keys():
                if y[-1] == x:
                    d_count[y].append(n)

        residue_percents = protein.get_amino_acids_percent()
        for a in amino_acids:
            m = residue_percents[a]
            for b in d_perc.keys():
                if b[-1] == a:
                    d_perc[b].append(m)
    #areas = get_area_classes(test_pdb)
    #polar_area.append(areas[0])
    #apolar_area.append(areas[1])
aminoPercent = []
secstruct = []
hydrophob = []
hydrophil = []
surface = []
gravy = []
molweight = []
instidx = []
flex = []

for seq in sequences:
    X = ProteinAnalysis(str(seq))
    isoelectricPt.append(X.isoelectric_point())
    aromaticity.append(X.aromaticity())
    aminoPercent.append(X.get_amino_acids_percent())
    secstruct.append(X.secondary_structure_fraction())
    # These features throw Key & Value Errors due to non standard amino acids
    # (i.e. out of the 20 standard ones) e.g. X, U etc
    # Compute all of them before appending anything, so a failure part-way
    # through cannot leave some lists longer than others.
    try:
        computed = (
            X.gravy(),
            X.molecular_weight(),
            X.instability_index(),
            X.flexibility(),
            X.protein_scale(ProtParamData.kd, 9, 0.4),
            X.protein_scale(ProtParamData.hw, 9, 0.4),
            X.protein_scale(ProtParamData.em, 9, 0.4),
        )
    except (KeyError, ValueError):
        # One 0 placeholder per list keeps every list aligned with
        # `sequences`.
        computed = (0,) * 7
    feature_lists = (gravy, molweight, instidx, flex,
                     hydrophob, hydrophil, surface)
    for feature_list, value in zip(feature_lists, computed):
        feature_list.append(value)
"," + str(mol_w) + "," + str(ins) + "," + str(cnt) + "\n") else: with open(path_ + "\\data\\output\\svm_out.txt", "a+") as s: s.write("-1 " + ' '.join("{}:{}".format(k, v) for k, v in a.items()) + "\n") with open(pth + "weka_output.arff", "a+") as w: w.write(' '.join("{},".format(x) for x in list(aa_count.values())) + " loc\n") with open(pth + "tain_DL.csv", "a+") as DPL: DPL.write(''.join("{},".format(x) for x in list(aa_count.values())) + str(round(aromat, 3)) + "," + str(round(fraction[0], 3)) + "," + str(round(fraction[1], 3)) + "," + str(round(fraction[2], 3)) + "," + str(round(iso, 3)) + "," + str(mol_w) + "," + str(ins) + "," + "0" + "\n") for seq, cl in zip(seq_list, cls_list): # main loop to extract the features _ = ProteinAnalysis(seq) # Biopython protein analysis package aa_count = (_.count_amino_acids()) # amino acid count aromat, fraction, iso = _.aromaticity(), _.secondary_structure_fraction( ), _.isoelectric_point() try: mol_w, ins = ("%0.2f" % _.molecular_weight()), ("%0.2f" % _.instability_index()) except Exception: mol_w, ins = mol_w, ins # aromaticity, sec_strucure_fraction, iso_electric point , molecular weight, instability index format_output(aa_count, cl)