def find_composition(df_original):
    """Append composition and physico-chemical features for every sequence.

    Each entry of ``df_original['seq']`` is analysed three times with
    ``ProteinAnalysis``: the full sequence, its first ``first_n`` residues and
    its last ``last_n`` residues. ``codes``, ``first_n`` and ``last_n`` are
    module-level names defined outside this function.

    Args:
        df_original: DataFrame with a ``seq`` column. It is not modified.

    Returns:
        A copy of ``df_original`` with the feature columns concatenated on
        the right, row-aligned with the input.
    """
    df_copy = df_original.copy()

    column_names = []
    for ch in codes:
        column_names.extend(
            [ch + '_percent', ch + '_percent_first', ch + '_percent_last'])
    column_names.extend([
        'len', 'weight', 'gravy', 'flex_mean', 'flex_std',
        'ss_helix', 'ss_turn', 'ss_sheet', 'iep', 'aromaticity',
    ])

    # Collect one dict per sequence and build the frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for seq in tqdm(df_copy['seq']):
        sequence = str(seq)
        analysed = ProteinAnalysis(sequence)
        analysed_first = ProteinAnalysis(sequence[:first_n])
        analysed_last = ProteinAnalysis(sequence[-last_n:])

        row = {'len': analysed.length}
        (row['ss_helix'],
         row['ss_turn'],
         row['ss_sheet']) = analysed.secondary_structure_fraction()
        row['iep'] = analysed.isoelectric_point()

        # overall
        for aa, percent in analysed.get_amino_acids_percent().items():
            row[aa + '_percent'] = percent
        # first N
        for aa, percent in analysed_first.get_amino_acids_percent().items():
            row[aa + '_percent_first'] = percent
        # last N
        for aa, percent in analysed_last.get_amino_acids_percent().items():
            row[aa + '_percent_last'] = percent

        row['weight'] = analysed.molecular_weight()
        row['gravy'] = analysed.gravy()
        row['aromaticity'] = analysed.aromaticity()

        flexibility = analysed.flexibility()  # computed once, used twice
        row['flex_mean'] = np.mean(flexibility)
        row['flex_std'] = np.std(flexibility)
        rows.append(row)

    # Reuse the input index so the axis=1 concat pairs each feature row with
    # the sequence it came from, even when the input index is not 0..n-1.
    df = pd.DataFrame(rows, index=df_copy.index)
    # Declared columns first; any residue columns outside ``codes`` follow.
    extra_columns = [c for c in df.columns if c not in column_names]
    df = df.reindex(columns=column_names + extra_columns)

    return pd.concat([df_copy, df], axis=1)
def seq_properties(file_path):
    """Summarise a single-record FASTA protein by amino-acid class.

    Args:
        file_path [str]: Path of the FASTA file (read with ``SeqIO.read``)

    Returns:
        total_percent_dict [dict]: For each class label, the value that
            ``count_prop`` returns for the sequence's amino-acid fractions
            and the matching residue group.
    """
    fasta_record = SeqIO.read(file_path, 'fasta')
    composition = ProteinAnalysis(
        str(fasta_record.seq)).get_amino_acids_percent()

    # Dict entries are evaluated top to bottom, i.e. in the original order.
    return {
        "Acidic": count_prop(composition, acidic_aa),
        "Basic": count_prop(composition, basic_aa),
        "Hydroxilic": count_prop(composition, hydroxylic_aa),
        "Amidic": count_prop(composition, amidic_aa),
        "Aliphatic": count_prop(composition, aliphatic_aa),
        "Aromatic": count_prop(composition, aromatic_aa),
    }
def calculate_residue_features(temp_dict, sequence):
    """Add composition-weighted scale totals and residue fractions to a dict.

    Each residue fraction from ``get_amino_acids_percent()`` is multiplied by
    that residue's value in the ``kd``, ``hw``, ``em`` and ``ja`` scale
    tables (defined outside this function) and summed per scale.

    Args:
        temp_dict: Mapping updated in place with the four totals and then
            with the per-residue fractions.
        sequence: Amino-acid sequence passed to ``ProteinAnalysis``.
    """
    fractions = ProteinAnalysis(sequence).get_amino_acids_percent()

    kd_total = 0
    hw_total = 0
    em_total = 0
    ja_total = 0
    for residue, fraction in fractions.items():
        kd_total += fraction * kd[residue]
        hw_total += fraction * hw[residue]
        em_total += fraction * em[residue]
        ja_total += fraction * ja[residue]

    scale_features = {
        "Hydrophobicity": kd_total,
        "Hydrophilicity": hw_total,
        "Surface Fractional Probability": em_total,
        "I2S Transfer Energy Scale": ja_total,
    }
    temp_dict.update(scale_features)
    temp_dict.update(fractions)
def protParam(seq):
    """Print a ProtParam-style report for one protein sequence.

    Every value is computed before anything is printed, so an analysis
    error leaves the output empty.

    Args:
        seq: Amino-acid sequence passed to ``ProteinAnalysis``.
    """
    analysis = ProteinAnalysis(seq)
    weight = analysis.molecular_weight()
    counts = analysis.count_amino_acids()
    fractions = analysis.get_amino_acids_percent()
    hydropathy = analysis.gravy()
    aromatic_fraction = analysis.aromaticity()
    pi_value = analysis.isoelectric_point()

    # Weighted W / Y / C counts.
    extinction = counts["W"]*5690 + counts["Y"]*1280 + counts["C"]*120
    # NOTE(review): extinction per unit mass is computed but never reported.
    mgml = extinction * (1./weight)

    report = (
        ("Amino acid count", pprint.pprint, counts),
        ("Amino acid percent", pprint.pprint, fractions),
        ("Molecular weight", print, "%f Da"%weight),
        ("Gravy", print, hydropathy),
        ("Isoelectric point", print, pi_value),
        ("Aromaticity", print, aromatic_fraction),
    )
    for heading, emit, value in report:
        print(heading)
        emit(value)

    print("Extinction coefficient: %d M-1cm-1 (Assuming reduced)"%extinction)
    print("")
def parse_pro_sequence(self, p_seq, id=None, desc=None):
    """Record one protein sequence and its derived properties.

    Characters not found in ``proteins`` are dropped, the remainder is
    reverse-translated with the first codon listed for each residue in
    ``dna_codons``, and the GC content plus several ``ProteinAnalysis``
    values are computed. Results are appended to the instance's parallel
    lists only after every computation has succeeded, so a failure cannot
    leave the lists with different lengths.

    Args:
        p_seq: Iterable of residue characters.
        id: Optional identifier stored alongside the sequence.
        desc: Optional description stored alongside the sequence.

    Any exception is reported on stdout and swallowed (best effort).
    """
    try:
        p_seq = ''.join(pro for pro in p_seq if pro in proteins)

        # Map each residue to the first codon that encodes it, once per
        # call, instead of rebuilding key/value lists for every residue.
        first_codon = {}
        for codon, residue in dna_codons.items():
            first_codon.setdefault(residue, codon)

        # reverse translate protein to nucleotide sequence
        n_seq = ''.join(first_codon[pro] for pro in p_seq)

        # GC content
        gc_content = self.calculate_gc_content(n_seq)

        # protein analysis methods
        analysis = ProteinAnalysis(p_seq)
        amino_acid_percent = analysis.get_amino_acids_percent()
        molecular_weight = analysis.molecular_weight()
        instability_index = analysis.instability_index()
        aromaticity = analysis.aromaticity()

        # Everything succeeded: commit to all parallel lists together.
        self.id.append(id)
        self.description.append(desc)
        self.nucleotide_sequence.append(n_seq)
        self.protein_sequence.append(p_seq)
        self.gc_content.append(gc_content)
        self.amino_acid_dict.append(amino_acid_percent)
        self.molecular_weight.append(molecular_weight)
        self.instability_index.append(instability_index)
        self.aromaticity.append(aromaticity)
    except Exception as e:
        print('-'*80)
        print(f"Exception in parsing uploaded virus sequence: {e}")
        traceback.print_exc(file=sys.stdout)
        print('-'*80)
def analyze(seq, name):
    """Print the name, isoelectric point and residue fractions of a protein.

    Args:
        seq: Amino-acid sequence passed to ``ProteinAnalysis``.
        name: Label printed before the results.
    """
    protein = ProteinAnalysis(seq)
    print(name)
    # Each value is computed right before it is printed, as a plain
    # print-by-print script would do.
    for heading, compute in (
            ("pI: ", protein.isoelectric_point),
            ("AA percent: ", protein.get_amino_acids_percent)):
        print(heading)
        print(compute())
def protein_analysis():
    """Show the sequence form; on submit, store ProtParam results in session.

    A visitor whose session has no username is sent to the account log-in
    action. When the form is accepted, the cleaned, upper-cased sequence and
    the ``ProteinAnalysis`` results are saved in ``session`` and the request
    is redirected to ``protein_analysis_output``.

    Returns:
        dict: ``{'form': form}`` for the view.
    """
    # Identity test: ``== None`` goes through __eq__ and is not idiomatic.
    if session.username is None:
        redirect(URL(r=request, c='account', f='log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    form = FORM(
        TABLE(
            TR("Amino acid sequence: ",
               TEXTAREA(_type="text", _name="sequence",
                        requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))
    if form.accepts(request.vars, session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
def prot_param_features(seq):
    """Build a flat feature dict for one sequence record.

    Args:
        seq: Record whose ``.seq`` attribute holds the protein sequence.

    Returns:
        dict: ``frac_<residue>`` fractions, ``aromaticity``, ``isoelectric``,
        ``mol_weight`` (omitted only if the sequence cannot be weighed even
        after substituting X and B) and the three ``struc_*`` fractions.
    """
    features = {}
    raw_sequence = str(seq.seq)
    pa = ProteinAnalysis(raw_sequence)

    # 1. Amino Acid Percent
    aa = pa.get_amino_acids_percent()
    features.update({"frac_{}".format(k): v for k, v in aa.items()})

    # 2. Aromaticity
    features["aromaticity"] = pa.aromaticity()

    # 3. Isoelectric Point
    features["isoelectric"] = pa.isoelectric_point()

    # 4. Molecular Weight
    try:
        features["mol_weight"] = pa.molecular_weight()
    except ValueError:
        # Ambiguous residues cannot be weighed. Retry with X->G and B->N.
        # The substituted string used to be built and then thrown away.
        replaced = raw_sequence.replace('X', 'G').replace('B', 'N')
        try:
            features["mol_weight"] = ProteinAnalysis(
                replaced).molecular_weight()
        except ValueError:
            # Still not weighable (other unsupported letters): leave the
            # feature out, as this function has always done in that case.
            pass

    # 5. Secondary Structure Fraction
    struc = ["struc_helix", "struc_turn", "struc_sheet"]
    ss = pa.secondary_structure_fraction()
    features.update(dict(zip(struc, ss)))

    return features
def transform(self, X):
    """Encode sequences as rows of amino-acid fractions.

    Args:
        X: Indexable collection of sequences; ``X[i]`` is converted with
            ``str`` before analysis.

    Returns:
        Array of shape ``(len(X), len(VALID_AMINO_ACIDS))``; a residue
        missing from the analysis result contributes 0.0.
    """
    n_rows = len(X)
    vec = np.zeros((n_rows, len(VALID_AMINO_ACIDS)))
    for row in range(n_rows):
        fractions = ProteinAnalysis(str(X[row])).get_amino_acids_percent()
        for col, residue in enumerate(VALID_AMINO_ACIDS):
            vec[row, col] = fractions.get(residue, 0.0)
    return vec
def prot_feats_seq(seq):
    """Concatenate L2-normalised 1-mer, 2-mer and 3-mer features.

    Args:
        seq: Protein sequence (converted with ``str``).

    Returns:
        1-D array: normalised fractions of the 20 standard residues followed
        by the normalised outputs of ``twomerFromSeq`` and
        ``threemerFromSeq``.
    """
    aa = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
          'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    text = str(seq)

    def unit_row(values):
        # L2-normalise one feature vector and hand back the single row.
        matrix = np.atleast_2d(np.array(values))
        return normalize(matrix, norm='l2', copy=True, axis=1,
                         return_norm=False)[0]

    analysis = ProteinAnalysis(text)
    # throws an error if 'X' in sequence. we skip such sequences
    analysis.molecular_weight()
    fractions = analysis.get_amino_acids_percent()

    f = []
    f.extend(unit_row([fractions[a] for a in aa]))
    f.extend(unit_row(twomerFromSeq(text)))
    f.extend(unit_row(threemerFromSeq(text)))
    return np.array(f)
def normal_charge_properties(self):
    """Print the pooled composition of motif regions passing all filters.

    Reads the tab-separated training file ``self.train_fpi``, keeps rows
    with ``y == 0`` and, for each sequence, pools the in-motif sequence
    when every check below passes. The amino-acid fractions of the pooled
    string are printed.
    """
    table = pd.read_csv(self.train_fpi, sep='\t', index_col=0)
    table = table[table['y'] == 0]

    kept_regions = []
    for seq in list(table['Sequence']):
        ms = motif_seq.LcSeq(seq, self.k, self.lca, 'lca')
        in_seq, out_seq = ms.seq_in_motif()
        in_kmer, out_kmer = ms.overlapping_kmer_in_motif()
        if len(in_kmer) <= 20:
            continue

        ka = kappa.KappaKmers(out_kmer, out_seq)
        delta = ka.deltaForm()
        if not (ka.NCPR() > -0.1 and ka.NCPR() < 0.1):
            continue
        if not (delta < 0.1):
            continue

        score = norm_score.NormScore().lc_norm_score([seq])[0]
        if not (score > 20):
            continue

        if ka.FCR() < 0.2:
            kept_regions.append(in_seq)

    pooled = ''.join(kept_regions)
    composition = ProteinAnalysis(pooled).get_amino_acids_percent()
    print(composition)
def protParam(seq):
    """Print a ProtParam-style report for one protein sequence.

    Every value is computed before anything is printed, so an analysis
    error leaves the output empty.

    Args:
        seq: Amino-acid sequence passed to ``ProteinAnalysis``.
    """
    analysis = ProteinAnalysis(seq)
    weight = analysis.molecular_weight()
    counts = analysis.count_amino_acids()
    fractions = analysis.get_amino_acids_percent()
    hydropathy = analysis.gravy()
    aromatic_fraction = analysis.aromaticity()
    pi_value = analysis.isoelectric_point()

    # Weighted W / Y / C counts.
    extinction = counts["W"] * 5690 + counts["Y"] * 1280 + counts["C"] * 120
    # NOTE(review): extinction per unit mass is computed but never reported.
    mgml = extinction * (1. / weight)

    report = (
        ("Amino acid count", pprint.pprint, counts),
        ("Amino acid percent", pprint.pprint, fractions),
        ("Molecular weight", print, "%f Da" % weight),
        ("Gravy", print, hydropathy),
        ("Isoelectric point", print, pi_value),
        ("Aromaticity", print, aromatic_fraction),
    )
    for heading, emit, value in report:
        print(heading)
        emit(value)

    print("Extinction coefficient: %d M-1cm-1 (Assuming reduced)" % extinction)
    print("")
def calculate_physiochemical_features(temp_dict, sequence):
    """Add whole-sequence physico-chemical descriptors to a dict.

    Args:
        temp_dict: Mapping updated in place, first with the named
            descriptors and then with the per-residue fractions.
        sequence: Amino-acid sequence passed to ``ProteinAnalysis``.
    """
    protein = ProteinAnalysis(sequence)

    charge = protein.charge_at_pH(7)
    instability = protein.instability_index()
    weight = protein.molecular_weight()
    aromatic_fraction = protein.aromaticity()

    # The analysis yields a pair of coefficients; store their midpoint.
    coeff_low, coeff_high = protein.molar_extinction_coefficient()
    extinction = (float(coeff_low) + float(coeff_high)) / 2

    # Grand Average Hydropathy - Higher value = More Hydrophobic
    hydropathy = protein.gravy()
    pi_value = protein.isoelectric_point()
    helix, turn, sheet = protein.secondary_structure_fraction()

    temp_dict.update({
        "Charge at pH7": charge,
        "Instability Index": instability,
        "Molecular Wt": weight,
        "Aromaticity": aromatic_fraction,
        "Molar Extinction Coeff": extinction,
        "Gravy": hydropathy,
        "Isoelectric pt": pi_value,
        "Helix Fraction": helix,
        "Turn Fraction": turn,
        "Sheet Fraction": sheet,
    })

    # Added separately: get_amino_acids_percent() already returns a dict.
    temp_dict.update(protein.get_amino_acids_percent())
def percentages_from_proteins(path):
    """Collect amino-acid fractions and annotations from a GenBank file.

    One entry is produced per feature of every record. The input file is
    closed when parsing ends, including on error.

    Args:
        path: Path of the GenBank file.

    Returns:
        tuple: ``(percents, names_list, sources_list, desc_list, taxo_list,
        keyw_list, taxid_list, sequence_list)`` where ``percents`` is an
        ``(entries, 20)`` array with columns in alphabetical one-letter order
        and ``sequence_list`` holds the per-entry fraction dicts.
    """
    names_list = []
    sequence_list = []
    sources_list = []
    desc_list = []
    taxo_list = []
    keyw_list = []
    taxid_list = []

    with open(path) as handle:
        for record in parse(handle, "genbank"):
            cdsnum = 0
            # NOTE(review): the feature is only counted; the sequence
            # analysed is the whole record's, once per feature. Confirm
            # this is intended.
            for feat in record.features:
                analysed_seq = ProteinAnalysis(str(record.seq))
                sequence_list.append(analysed_seq.get_amino_acids_percent())
                names_list.append(str(record.name) + "_CDS#" + str(cdsnum))
                sources_list.append(record.annotations['source'])
                keyw_list.append(record.annotations['keywords'])
                taxo_list.append(record.annotations['taxonomy'])
                desc_list.append(record.description)
                taxid_list.append(record.annotations["organism"])
                cdsnum += 1

    # List of dictionaries to the numpy array
    aas = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
           'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    percents = np.zeros((len(sequence_list), len(aas)))
    for i, percdict in enumerate(sequence_list):
        for an, residue in enumerate(aas):
            percents[i, an] = percdict[residue]

    return (percents, names_list, sources_list, desc_list, taxo_list,
            keyw_list, taxid_list, sequence_list)
def get_aa_bins(self, seq):
    """Return residue fractions of ``seq`` in the fixed order below.

    Args:
        seq: Amino-acid sequence passed to ``ProteinAnalysis``.

    Returns:
        list: One fraction per letter of ``'SGEQAPDTNKRLHVYFIMCW'``.
    """
    fractions = ProteinAnalysis(seq).get_amino_acids_percent()
    return [fractions[residue] for residue in 'SGEQAPDTNKRLHVYFIMCW']
def get_aa_percentage_vectors(X):
    """Turn each sequence in ``X`` into a row of 20 residue fractions.

    Args:
        X: Array-like of sequences with a ``shape`` attribute.

    Returns:
        Array of shape ``(X.shape[0], 20)``; columns follow the module-level
        ``aas`` ordering.
    """
    n_sequences = X.shape[0]
    res = np.zeros((n_sequences, 20))
    for row, sequence in enumerate(X):
        fractions = pd.Series(
            ProteinAnalysis(sequence).get_amino_acids_percent())
        # Selecting by ``aas`` pins the column order for every row.
        ordered = fractions[aas]
        res[row] = ordered.values
    return res
def protein_stats(proteome_file):
    """Print residue fractions of a whole proteome, sorted by residue.

    All records of the FASTA file are concatenated and analysed as one
    sequence.

    Args:
        proteome_file: FASTA file given to ``SeqIO.parse``.
    """
    proteome = "".join(
        str(record.seq) for record in SeqIO.parse(proteome_file, "fasta"))
    composition = ProteinAnalysis(str(proteome)).get_amino_acids_percent()
    for residue in sorted(composition):
        print(residue, composition[residue])
def amino_acid_composition_last50(self, record):
    '''
    Input:
        - record: a SeqRecord
    Output:
        - dictionary: result of get_amino_acids_percent() for at most the
          final 50 residues of the record's sequence
    '''
    tail = str(record.seq)[-50:]
    return ProteinAnalysis(tail).get_amino_acids_percent()
def amino_acid_composition(self, record):
    '''
    Input:
        - record: a SeqRecord
    Output:
        - dictionary: result of get_amino_acids_percent() for the record's
          whole sequence
    '''
    full_sequence = str(record.seq)
    return ProteinAnalysis(full_sequence).get_amino_acids_percent()
def _one_organism(self, fasta_in):
    """Return the composition of all lower-case residues in one FASTA file.

    Args:
        fasta_in: File name, resolved relative to ``self.seg_dpi``.

    Returns:
        dict: ``get_amino_acids_percent()`` of the pooled lower-case
        characters from every record.
    """
    fasta_path = os.path.join(self.seg_dpi, fasta_in)
    lowercase_residues = []
    with open(fasta_path, 'r') as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            lowercase_residues.extend(
                aa for aa in str(record.seq) if aa.islower())
    pooled = ''.join(lowercase_residues)
    return ProteinAnalysis(pooled).get_amino_acids_percent()
def aa_frequency(outfile):
    """Print residue counts and fractions for a whole FASTA file.

    All records are concatenated and analysed as one sequence. The file is
    opened in a ``with`` block so the handle is closed when parsing ends,
    and the pieces are joined once rather than concatenated per record.

    Args:
        outfile: Path of the FASTA file.
    """
    with open(outfile) as handle:
        all_seq = "".join(
            str(record.seq) for record in SeqIO.parse(handle, 'fasta'))

    y = ProteinAnalysis(all_seq)
    print("all_seq_n", y.count_amino_acids())
    print("all_seq_%", y.get_amino_acids_percent())
def make_dataset(fasta):
    """Build a per-sequence feature table from a FASTA file and pickle it.

    The class label comes from the file path: ``'tar'`` gives 0 and
    ``'pop'`` gives 1 (``'tar'`` wins if both occur).

    Args:
        fasta: Path of the FASTA file.

    Raises:
        ValueError: If the path contains neither ``'tar'`` nor ``'pop'``, so
            no label can be assigned.
    """
    # assign whether it's from tardigrades 'tar' or poplars 'pop'
    if 'tar' in fasta:
        target = 0
    elif 'pop' in fasta:
        target = 1
    else:
        # Fail before parsing rather than with an unbound ``target`` later.
        raise ValueError(
            "cannot infer target from %r: expected 'tar' or 'pop' in the "
            "file name" % (fasta,))

    # a list of dictionaries containing features for all sequences
    ls_features = []
    for record in SeqIO.parse(fasta, "fasta"):
        analysed_seq = ProteinAnalysis(str(record.seq))
        frac = analysed_seq.secondary_structure_fraction()

        # the dictionary containing features for a single sequence
        dict_features = {
            'length': len(record.seq),
            'mol_weight': analysed_seq.molecular_weight(),
            # was mistakenly filled with molecular_weight()
            'aromaticity': analysed_seq.aromaticity(),
            'stability': analysed_seq.instability_index(),
            'flexibility': analysed_seq.flexibility(),
            'isoelectric': analysed_seq.isoelectric_point(),
            'helix': frac[0],
            'turn': frac[1],
            'sheet': frac[2],
        }
        # AAC composition of the entire sequence
        dict_features.update(analysed_seq.get_amino_acids_percent())
        ls_features.append(dict_features)

    df = pd.DataFrame(ls_features)
    df['target'] = target
    print(df)
    # NOTE(review): ``name`` is not defined in this function; it must be a
    # module-level name -- confirm.
    df.to_pickle(name + '_set.pkl')
def write_aa_comp(self):
    """Write per-sequence amino-acid fractions of the training set to TSV.

    Reads ``self.train_fp`` (tab-separated), computes the fraction of every
    residue in ``self.aas`` for each row labelled 0 or 1, and writes the
    table to ``self.comp_fp`` with columns ``Protein ID``, ``y`` and one
    column per residue.

    Rows are processed in file order so each composition stays on the same
    line as its ``Protein ID`` and ``y``. Previously all ``y == 0`` rows
    were appended before all ``y == 1`` rows, which misaligned the table
    unless the file was already sorted that way. Rows with any other label
    are left out.
    """
    cols = ['Protein ID', 'y'] + list(self.aas)
    df_train = pd.read_csv(self.train_fp, sep='\t', index_col=0)
    labelled = df_train[df_train['y'].isin([0, 1])]

    df_dict = {aa: [] for aa in self.aas}
    for sequence in labelled['Sequence']:
        fractions = ProteinAnalysis(sequence).get_amino_acids_percent()
        for aa in self.aas:
            df_dict[aa].append(fractions[aa])
    df_dict['y'] = list(labelled['y'])
    df_dict['Protein ID'] = list(labelled['Protein ID'])

    df = pd.DataFrame(df_dict, columns=cols)
    df.to_csv(self.comp_fp, sep='\t')
def biopython_protein_analysis(inseq):
    """Collect general sequence properties from Biopython's ProteinAnalysis.

    For full definitions see:
        http://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam.ProteinAnalysis-class.html

    Args:
        inseq: Amino acid sequence (cast to ``str`` with ssbio's
            ``cast_to_str``)

    Returns:
        dict: Sequence properties, every key suffixed with ``-biop``. Some
        definitions:
            instability_index: Any value above 40 means the protein is
                unstable (has a short half life).
            percent_helix/turn/strand_naive: The three entries, in order, of
                ``secondary_structure_fraction()``.

    TODO: Finish definitions of dictionary
    """
    sequence = ssbio.protein.sequence.utils.cast_to_str(inseq)
    analysis = ProteinAnalysis(sequence)

    # Dict entries are evaluated top to bottom.
    info_dict = {
        'amino_acids_content-biop': analysis.count_amino_acids(),
        'amino_acids_percent-biop': analysis.get_amino_acids_percent(),
        'length-biop': analysis.length,
        'monoisotopic-biop': analysis.monoisotopic,
        'molecular_weight-biop': analysis.molecular_weight(),
        'aromaticity-biop': analysis.aromaticity(),
        'instability_index-biop': analysis.instability_index(),
        # TODO: What is flexibility?
        'flexibility-biop': analysis.flexibility(),
        'isoelectric_point-biop': analysis.isoelectric_point(),
        # grand average of hydrophobicity
        'gravy-biop': analysis.gravy(),
    }

    # secondary_structure_fraction is split into one key per entry.
    structure = analysis.secondary_structure_fraction()
    info_dict['percent_helix_naive-biop'] = structure[0]
    info_dict['percent_turn_naive-biop'] = structure[1]
    info_dict['percent_strand_naive-biop'] = structure[2]

    return info_dict
def plot_comp(self):
    """Draw overlaid bar charts of pooled composition for both classes.

    Sequences from ``self.train_fp`` are pooled per label (``y == 1``
    plotted as 'PDB', ``y == 0`` as 'BC') and the fraction of every residue
    in ``self.aas`` is drawn on the current matplotlib axes.
    """
    df_train = pd.read_csv(self.train_fp, sep='\t', index_col=0)
    residue_labels = list(self.aas)
    positions = range(len(self.aas))

    def pooled_fractions(label):
        # Concatenate every sequence carrying ``label`` and analyse once.
        pooled = ''.join(list(df_train[df_train['y'] == label]['Sequence']))
        return ProteinAnalysis(pooled).get_amino_acids_percent()

    pdb_fractions = pooled_fractions(1)
    bc_fractions = pooled_fractions(0)
    pdb_bins = [pdb_fractions[aa] for aa in residue_labels]
    cb_bins = [bc_fractions[aa] for aa in residue_labels]

    plt.bar(positions, pdb_bins, color='darkblue', alpha=0.7,
            label='PDB', align='center')
    plt.bar(positions, cb_bins, color='orangered', alpha=0.7,
            label='BC', align='center')
    plt.xticks(positions, residue_labels)
    plt.xlim([-1, len(self.aas)])
    plt.legend()
    plt.xlabel('Amino Acids', size=12)
    plt.ylabel('Relative Fraction', size=12)
def solubility_rules(self):
    """
    Apply five solubility rules based on the recommendations of
    http://bioserv.rpbs.univ-paris-diderot.fr/services/SolyPep/

    Effect:
    self.solubility_rules_failed is incremented once for every rule the
    sequence breaks; nothing is returned.
    """
    hydro_residues = ['V', 'I', 'L', 'M', 'F', 'W', 'C']
    charged_residues = ['H', 'R', 'K', 'D', 'E']
    sequence = self.sequence

    # Rule N1. Number of hydrophobic or charged residues, compared against
    # 45% of the peptide length.
    hydro_or_charged = sum(
        1 for aa in sequence
        if aa in hydro_residues or aa in charged_residues)
    if hydro_or_charged > float(self.length_peptide) * 0.45:
        self.solubility_rules_failed += 1

    # Rule N2. Computed peptide charge
    charge_threshold = 1
    self.compute_peptide_charges()
    if self.netCharge > charge_threshold:
        self.solubility_rules_failed += 1

    # Rule N3. Glycine or Proline content in the sequence
    gly_pro = sum(1 for aa in sequence if aa == "G" or aa == "P")
    if gly_pro > 1:
        self.solubility_rules_failed += 1

    # Rule N4. First or last amino acid charged
    first_charged = sequence[0] in charged_residues
    last_charged = sequence[-1] in charged_residues
    if first_charged or last_charged:
        self.solubility_rules_failed += 1

    # Rule N5. Any single amino acid making up too much of the sequence.
    # NOTE(review): the rule was described as "more than 25%", but the
    # threshold applied is a fraction >= 0.3 -- confirm which is intended.
    aa_content = ProteinAnalysis(sequence).get_amino_acids_percent()
    if any(aa_content[aa] >= 0.3 for aa in aa_content):
        self.solubility_rules_failed += 1
def aminoAcidComposition(path):
    """Average the residue composition over all proteins in a pickle file.

    The pickle must hold a dict whose values are protein sequences. The
    file is opened in a ``with`` block so the handle is always closed.

    Args:
        path: Path of the pickle file.

    Returns:
        dict: residue -> mean of ``100 * fraction`` over all proteins
        (empty when the pickled dict is empty).
    """
    # SECURITY: unpickling executes arbitrary code from the file; only use
    # this on trusted inputs.
    with open(path, "rb") as handle:
        dicoProt = pk.load(handle)

    n_proteins = float(len(dicoProt))
    hist = dict()
    for sequence in dicoProt.values():
        fractions = ProteinAnalysis(str(sequence)).get_amino_acids_percent()
        for residue, fraction in fractions.items():
            share = 100 * fraction / n_proteins
            if residue in hist:
                hist[residue] = hist[residue] + share
            else:
                hist[residue] = share
    return hist
def __init__(self, sequence):
    """Compute and store every descriptor of ``sequence`` on the instance.

    Values come from three places: Biopython's ``ProteinAnalysis``, helper
    functions defined elsewhere in this file (``calculate_*``), and R code
    evaluated through ``r(...)``. The R calls rely on statement order: the
    package is loaded and the R variable ``sequence`` is assigned before it
    is used.
    """
    self.sequence = sequence
    self.sequence_length = len(sequence)
    analysis = ProteinAnalysis(sequence)
    self.amino_acid_percents = analysis.get_amino_acids_percent()
    self.amino_acids_composition = calculate_amino_acids_composition(sequence)
    self.aromaticity = analysis.aromaticity()
    self.instability = analysis.instability_index()
    self.flexibility = calculate_flexibility(sequence)
    # Display name -> per-residue scale table, handed to
    # calculate_protein_scales together with the analysis object.
    protein_scale_parameters = [{'name': 'Hydrophilicity', 'dictionary': hw},
                                {'name': 'Surface accessibility', 'dictionary': em},
                                {'name': 'Janin Interior to surface transfer energy scale', 'dictionary': ja},
                                {'name': 'Bulkiness', 'dictionary': bulkiness},
                                {'name': 'Polarity', 'dictionary': polarity},
                                {'name': 'Buried residues', 'dictionary': buried_residues},
                                {'name': 'Average area buried', 'dictionary': average_area_buried},
                                {'name': 'Retention time', 'dictionary': retention_time}]
    self.protein_scales = calculate_protein_scales(analysis, protein_scale_parameters)
    self.isoelectric_point = analysis.isoelectric_point()
    self.secondary_structure_fraction = calculate_secondary_structure_fraction(analysis)
    self.molecular_weight = analysis.molecular_weight()
    # Stores the GRAVY value returned by the analysis.
    self.kyte_plot = analysis.gravy()
    self.pefing = calculate_pefing(sequence)
    # next parameters are calculated using R.Peptides
    r('require(Peptides)')
    # NOTE(review): the sequence is pasted unescaped into R source; assumes
    # it contains no double quotes or backslashes -- confirm with callers.
    r('sequence = "{0}"'.format(sequence))
    self.aliphatic_index = r('aindex(sequence)')[0]
    self.boman_index = r('boman(sequence)')[0]
    self.charges = calculate_charges(sequence, 1.0, 14.0, 0.5, 'Lehninger')
    # NOTE(review): this evaluates the R expression `seq(sequence)`, which
    # does not look like a hydrophobicity call -- confirm the intended R
    # function.
    self.hydrophobicity = r('seq(sequence)')[0]
    # Named rotation angles passed to the hydrophobic-moment and
    # peptide-type helpers.
    angles = [{'name': 'Alpha-helix', 'angle': -47},
              {'name': '3-10-helix', 'angle': -26},
              {'name': 'Pi-helix', 'angle': -80},
              {'name': 'Omega', 'angle': 180},
              {'name': 'Antiparallel beta-sheet', 'angle': 135},
              {'name': 'Parallel beta-sheet', 'angle': 113}]
    # The extra helix type is only considered when P and G together exceed
    # a fraction of 0.3.
    if self.amino_acid_percents['P'] + self.amino_acid_percents['G'] > 0.3:
        angles.append({'name': 'Polygly-polypro helix', 'angle': 153})
    self.hydrophobic_moments = calculate_hydrophobic_moments(sequence, angles)
    self.kidera_factors = calculate_kidera_factors(sequence)
    self.peptide_types = calculate_peptide_types(sequence, angles)
def calculate_properties_from_sequence(self):
    """
    Compute sequence-level properties and store them on the instance.

    Reads:
    self.sequence - amino acid sequence of the peptide

    Sets:
    self.avg_hydro - Eisenberg hydrophobicity summed over the sequence
    self.aromaticity, self.aa_percent, self.instability_index,
    self.isoelectric_point - ProtParam values from ProteinAnalysis
    """
    # Hydrophobicity -> Eisenberg scale
    eisenberg_scale = {
        'A': 0.620, 'R': -2.530, 'N': -0.780, 'D': -0.900, 'C': 0.290,
        'Q': -0.850, 'E': -0.740, 'G': 0.480, 'H': -0.400, 'Y': 0.260,
        'I': 1.380, 'L': 1.060, 'K': -1.500, 'M': 0.640, 'F': 1.190,
        'P': 0.120, 'S': -0.180, 'T': -0.050, 'W': 0.810, 'V': 1.080,
    }
    # NOTE(review): despite the attribute name this is the sum, not the
    # mean, of the per-residue values -- confirm what callers expect.
    per_residue = [eisenberg_scale[residue] for residue in self.sequence]
    self.avg_hydro = sum(per_residue)

    # ProtParam properties
    analysis = ProteinAnalysis(self.sequence)
    self.aromaticity = analysis.aromaticity()
    self.aa_percent = analysis.get_amino_acids_percent()
    self.instability_index = analysis.instability_index()
    self.isoelectric_point = analysis.isoelectric_point()
def amino_acid_analysis(self):
    """
    Adds fraction of amino acid residues (defined in RESIDUES), the sequence
    length and per-sequence ProtParam values to the data frame.

    ``IEP`` is filled for every row. ``molecular_weight`` is skipped for
    sequences containing X or B, and ``gravy`` for sequences containing
    U, X or B.
    """
    lengths = self.df["sequence"].str.len()
    for res in RESIDUES:
        self.df["fraction_" + res] = (
            self.df["sequence"].str.count(res) / lengths
        )
    self.df["length"] = lengths

    for index, row in tqdm(self.df.iterrows(), total=self.df.shape[0]):
        seq = row["sequence"]
        seqanalysis = ProteinAnalysis(seq)
        # The per-row residue composition used to be computed here and then
        # discarded; the fraction columns above already provide it.
        self.df.loc[index, "IEP"] = seqanalysis.isoelectric_point()

        has_x_or_b = "X" in seq or "B" in seq
        if not has_x_or_b:
            self.df.loc[index, "molecular_weight"] = (
                seqanalysis.molecular_weight())
            if "U" not in seq:
                self.df.loc[index, "gravy"] = seqanalysis.gravy()
# Residue clean-up applied before analysis: drop X, and map U->C, B->N, Z->Q.
_BIO_FEAT_CLEANUP = str.maketrans({"X": None, "U": "C", "B": "N", "Z": "Q"})


def bio_feat(record):
    """Return a numeric feature vector for one protein record.

    The sequence is cleaned with plain string operations. The earlier
    round-trip through ``MutableSeq(...).toseq()`` yielded the same string
    but depends on a method that newer Biopython releases no longer ship.

    Args:
        record: Object whose ``.seq`` holds the protein sequence.

    Returns:
        1-D array: length, molecular weight, isoelectric point, instability
        index, the secondary-structure fractions, then the residue counts
        and residue fractions in the order the analysis dicts yield them.
    """
    clean_seq = str(record.seq).translate(_BIO_FEAT_CLEANUP)

    ### features
    seq_length = len(clean_seq)
    analysed_seq = ProteinAnalysis(clean_seq)
    molecular_weight = analysed_seq.molecular_weight()
    amino_percent = analysed_seq.get_amino_acids_percent().values()
    isoelectric_points = analysed_seq.isoelectric_point()
    count = analysed_seq.count_amino_acids().values()
    instability_index = analysed_seq.instability_index()
    secondary_structure_fraction = analysed_seq.secondary_structure_fraction()

    return np.array(
        [seq_length, molecular_weight, isoelectric_points, instability_index]
        + list(secondary_structure_fraction)
        + list(count)
        + list(amino_percent))
def is_string_aminoacids1(item):
    """Tell whether ``item`` is a one-letter amino-acid string.

    Only exact ``str`` objects qualify. A string starting with
    ``'aminoacids1:'`` is accepted outright. Otherwise the string must not
    be recognised by ``is_string_aminoacids3`` and the fractions reported
    by ``ProteinAnalysis`` must sum to more than 0.99.

    Returns:
        bool
    """
    if type(item) is not str:
        return False

    if item.startswith('aminoacids1:'):
        return True

    from ..string_aminoacids3 import is_string_aminoacids3
    if is_string_aminoacids3(item):
        return False

    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    fractions = ProteinAnalysis(item).get_amino_acids_percent()
    return sum(fractions.values()) > 0.99
def protein_analysis():
    """Show the sequence form; on submit, store ProtParam results in session.

    A visitor whose session has no username is sent to the account log-in
    action. When the form is accepted, the cleaned, upper-cased sequence and
    the ``ProteinAnalysis`` results are saved in ``session`` and the request
    is redirected to ``protein_analysis_output``.

    Returns:
        dict: ``{'form': form}`` for the view.
    """
    # Identity test instead of ``== None``; the log-in URL names the
    # controller explicitly rather than climbing out with '../'.
    if session.username is None:
        redirect(URL(r=request, c='account', f='log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    form = FORM(
        TABLE(
            TR("Amino acid sequence: ",
               TEXTAREA(_type="text", _name="sequence",
                        requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))
    if form.accepts(request.vars, session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
class amp:
    """Stores all data of a peptide and scans it for AMP-like stretches.

    Construct with a two-item sequence ``[name, sequence]``. The analysis
    methods store their results as instance attributes (``net``, ``hpf``,
    ``result``, ``thrRes`` ...).
    """

    def __init__(self, readed):
        self.seq = readed[1]
        self.length = len(readed[1])
        self.name = readed[0]

    def netcharge(self):
        """Set ``net`` (K/R/H count minus D/E count) and ``posRe`` (K/R/H count)."""
        self.pos = 'KRH'
        self.neg = 'DE'
        self.net = 0
        self.posRe = 0  # number of positive residues, needed for searching
        for i in self.seq:
            if i in self.pos:
                self.net += 1
                self.posRe += 1
            if i in self.neg:
                self.net -= 1

    def hphobFract(self):
        """Set ``hpn`` (hydrophobic residue count) and ``hpf`` (its fraction).

        An empty peptide gets a fraction of 0.0 instead of a division error.
        """
        hph = 'ACFGILMPV'
        self.hpf = 0.
        for i in self.seq:
            if i in hph:
                self.hpf += 1
        self.hpn = self.hpf
        self.hpf = self.hpf / self.length if self.length else 0.0

    def analyzeAMP(self):
        """Return a dict with charge, length, hydrophobic fraction and composition."""
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        self.netcharge()
        self.hphobFract()
        self.pepParam = ProteinAnalysis(self.seq)
        self.data = {'charge': self.net,
                     'length': self.length,
                     'hydrophobic': self.hpf,
                     'aminoacids': self.pepParam.get_amino_acids_percent()}
        return self.data

    def _profileSubPep(self, baseCompose):
        """Fill charge, hydrophobicity and composition ratios for ``self.subPep``."""
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        import numpy as np
        self.subPep.netcharge()
        self.subPep.hphobFract()
        self.pepParam = ProteinAnalysis(self.subPep.seq)
        self.aaPerc = self.pepParam.get_amino_acids_percent()
        self.subPepComp = [self.aaPerc[aminame]
                           for aminame in ['C', 'R', 'W', 'H', 'K', 'D', 'E']]
        self.subPepChanges = [k[1]/k[0]
                              for k in zip(baseCompose, self.subPepComp)]
        self.upSubAvg = np.average(self.subPepChanges[:-2])
        self.downSubAvg = np.average(self.subPepChanges[-2:])

    def detectAMP(self):
        """Slide a window over the peptide and report the first long hit.

        Thresholds are read from the ``[Parameters]`` section of
        ``config.ini`` in the working directory.

        Returns:
            str: ``'found peptide of length N'`` or ``'nothing found'``.
        """
        import re
        import numpy as np
        # Python 3 ships the module as ``configparser``; its ConfigParser
        # class is what Python 2 called SafeConfigParser.
        try:
            from configparser import ConfigParser as _Parser
        except ImportError:
            from ConfigParser import SafeConfigParser as _Parser

        parser = _Parser()
        parser.read('config.ini')
        # floating window and search for values
        lowNet = parser.getfloat('Parameters', 'lowNet')  # 0
        midNet = parser.getfloat('Parameters', 'midNet')  # 2
        highNet = parser.getfloat('Parameters', 'highNet')  # 6
        lowHpf = parser.getfloat('Parameters', 'lowHpf')  # 0.5
        highHpf = parser.getfloat('Parameters', 'highHpf')  # 0.9
        lowCompCoeff = parser.getfloat('Parameters', 'lowCompCoeff')  # 0.85
        highCompCoeff = parser.getfloat('Parameters', 'highCompCoeff')  # 1.5
        baseWind = parser.getint('Parameters', 'baseWind')  # 15
        thresh = parser.getint('Parameters', 'thresh')  # 6
        minLen = parser.getint('Parameters', 'minLen')  # 10

        #              C     R     W      H     K     D     E
        baseCompose = [0.01, 0.06, 0.005, 0.02, 0.06, 0.05, 0.07]
        ampCompose = [0.06, 0.09, 0.01, 0.02, 0.1, 0.02, 0.03]
        changes = [i[1]/i[0] for i in zip(baseCompose, ampCompose)]
        upAvg = np.average(changes[:-2])

        self.result = [0 for i in self.seq]
        if self.length > baseWind*2:
            for i in range(self.length-baseWind):
                self.subPep = amp(['subPep', self.seq[i:i+baseWind]])
                self._profileSubPep(baseCompose)
                sub = self.subPep
                charge_ok = (lowNet < sub.net < highNet
                             and sub.hpf > lowHpf)
                window_hit = (
                    ((charge_ok
                      or midNet < sub.net
                      or sub.hpf > highHpf)
                     and self.upSubAvg > lowCompCoeff*upAvg)
                    or self.upSubAvg > highCompCoeff*upAvg)
                if window_hit:
                    for aa in range(i, i+baseWind):
                        self.result[aa] += 1
        else:
            self.subPep = self
            self._profileSubPep(baseCompose)
            sub = self.subPep
            charge_ok = (lowNet < sub.net < highNet
                         and sub.hpf > lowHpf)
            # NOTE(review): this short-peptide test differs from the window
            # test above (hpf is compared with lowCompCoeff*upAvg and the
            # composition check is mandatory) -- kept as is; confirm intent.
            whole_hit = (
                (charge_ok
                 or midNet < sub.net
                 or sub.hpf > lowCompCoeff*upAvg)
                and self.upSubAvg > highCompCoeff*upAvg)
            if whole_hit:
                self.result = [i+1 for i in self.result]

        self.thrRes = [1 if val > thresh else 0 for val in self.result]
        self.strRes = ''.join([str(i) for i in self.thrRes])
        # '0+' rather than '0*': since Python 3.7 re.split also splits on
        # empty matches, which would cut every run of 1s into single
        # characters. On Python 2 both patterns split identically.
        self.matches = re.split('0+', self.strRes)
        self.matches = [match for match in self.matches
                        if len(match) > minLen]
        if len(self.matches) > 0:
            return 'found peptide of length ' + str(len(self.matches[0]))
        else:
            return 'nothing found'

    def plotPred(self):
        """Plot the thresholded prediction to ``testy.pdf``, detecting first if needed."""
        import matplotlib.pylab as pl
        try:
            self.result[0]
        except (AttributeError, IndexError):
            # No usable prediction stored yet.
            self.detectAMP()
        pl.plot(self.thrRes, '.-')
        pl.savefig('testy.pdf')
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
import sys
import json

# Command-line helper: argv[1] is a JSON object with a "Sequence" string and
# an "Options" collection naming the values to compute. The results are
# written to stdout as one JSON object.
inp = json.loads(sys.argv[1])
seq = inp["Sequence"]
X = ProteinAnalysis(seq)
data = dict()

if "MW" in inp["Options"]:
    data["MW"] = X.molecular_weight()

if "EC280" in inp["Options"]:
    aa_count = X.count_amino_acids()
    if "hasDisulfide" in inp["Options"]:
        # With disulfides, each C adds 62.5 on top of the Y and W terms.
        data["EC280"] = (1490 * aa_count["Y"] + 5500 * aa_count["W"]
                         + 62.5 * aa_count["C"])
    else:
        data["EC280"] = 1490 * aa_count["Y"] + 5500 * aa_count["W"]

if "PI" in inp["Options"]:
    data["PI"] = X.isoelectric_point()

if "AACont" in inp["Options"]:
    ratios = X.get_amino_acids_percent()
    # Fractions scaled by 100.
    data["AACont"] = {aa: ratios[aa] * 100. for aa in ratios}

# Function-call form: valid on Python 2 and 3 (the bare print statement is
# a SyntaxError on Python 3).
print(json.dumps(data))
protein_name = get_protein_name(line) protein_names_and_segments[protein_name] = get_segments(line) protein_names_and_sequences[protein_name] = '' else: sequence = protein_names_and_sequences.get(protein_name) sequence += line.strip('\n' and '\r' and '\r\n') protein_names_and_sequences[protein_name] = sequence for key in protein_names_and_segments.keys(): for segment in protein_names_and_segments.get(key): segment_sequence = protein_names_and_sequences.get(key)[segment[0] - 1:segment[1]] x += segment_sequence y = ProteinAnalysis(str(x)) z = y.get_amino_acids_percent() # visual for command line print 'parsing ' + FILE_INPUT + '\n' # build the output file as CSV with open('percent_AA_per_seg_OUTPUT.csv', 'wb') as f: w = csv.writer(f) w.writerows(z.items()) # opens the ouput file file = '/Users/simonkeng/senior-research-project/percent_AA_per_seg_OUTPUT.csv' open_file(file)
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
from Bio import SeqIO

# Print a fixed series of ProtParam values for every record of the sample
# FASTA file, one value per line.
with open('../../samples/pdbaa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        protein = ProteinAnalysis(str(record.seq))
        # Each value is computed right before it is printed, so a failing
        # analysis still leaves the earlier lines on stdout.
        reports = (
            protein.count_amino_acids,
            protein.get_amino_acids_percent,
            protein.molecular_weight,
            protein.aromaticity,
            protein.instability_index,
            protein.flexibility,
            protein.isoelectric_point,
            protein.secondary_structure_fraction,
            lambda: protein.protein_scale(ProtParamData.kd, 9, .4),
        )
        for compute in reports:
            print(compute())