def protAnalysis(self, content):
    """Compute ProteinAnalysis descriptors for a sequence.

    ``content`` is first normalized with ``Parsers.normalizeSequence`` using
    ``self.sourceType``; every descriptor is computed on the normalized
    sequence.

    Returns:
        A 3-tuple ``(result, flattened, flattenedFlexDic)``:

        * ``result`` -- the five scalar descriptors merged with the output
          of ``self.flatten('proteinSecstruc', ...)`` for the secondary
          structure fractions.
        * ``flattened`` -- ``result`` merged with
          ``self.flatten('proteinFlex', flexibility)``.
        * ``flattenedFlexDic`` -- ``result`` merged with
          ``self.flatten(list(content), flexibility)``.

        In every merge the keys already present in ``result`` win on
        collision.
    """
    content = Parsers.normalizeSequence(content, self.sourceType)
    protein = ProteinAnalysis(content)

    result = {
        'proteinMWeight': protein.molecular_weight(),
        'proteinAroma': protein.aromaticity(),
        'proteinInstab': protein.instability_index(),
        'proteinIsoelec': protein.isoelectric_point(),
        'proteinGravy': protein.gravy(),
    }

    # Merge the flattened secondary-structure fractions into the result.
    protStruct = self.flatten('proteinSecstruc',
                              protein.secondary_structure_fraction())
    result = {**protStruct, **result}

    # Two views of the flexibility profile: one keyed by a fixed prefix,
    # one built from the residue list of the normalized sequence.
    flexibility = protein.flexibility()
    flexibFlat = self.flatten('proteinFlex', flexibility)
    flexibAmino = self.flatten(list(content), flexibility)

    flattened = {**flexibFlat, **result}
    flattenedFlexDic = {**flexibAmino, **result}
    return result, flattened, flattenedFlexDic
def sequence_vector(temp_window: str, window: int = 6, chemical=1):
    """Encode a sequence window as a fixed-length vector.

    The sequence is passed through ``clean`` and then ``windower`` (centred
    on the middle of the cleaned sequence, ``window`` residues per wing).
    Each residue becomes an integer code; 0 stands for non-standard amino
    acids and is also used to pad short windows up to ``2 * window + 1``
    entries.

    Args:
        temp_window: Amino acid sequence.
        window: Wing size handed to ``windower``.
        chemical: When equal to 1, GRAVY, instability index and aromaticity
            of the window are appended after the residue codes.

    Returns:
        list: Residue codes, optionally followed by the three descriptors.
    """
    temp_window = clean(temp_window)
    temp_window = windower(sequence=temp_window,
                           position=int(len(temp_window) * .5),
                           wing_size=window)

    aa = {"G": 1, "A": 2, "L": 3, "M": 4, "F": 5, "W": 6, "K": 7, "Q": 8,
          "E": 9, "S": 10, "P": 11, "V": 12, "I": 13, "C": 14, "Y": 15,
          "H": 16, "R": 17, "N": 18, "D": 19, "T": 20, "X": 0}

    # Any residue missing from the table is encoded as 0, as the docstring
    # promises, instead of raising KeyError.
    vec = [aa.get(residue, 0) for residue in temp_window]

    # Pad with zeros up to the full window length (no-op when already full).
    vec.extend([0] * ((window * 2) + 1 - len(vec)))

    # Hydrophobicity and the other descriptors are optional.
    if chemical == 1:
        s = ProteinAnalysis(temp_window)
        vec.append(s.gravy())
        vec.append(s.instability_index())
        vec.append(s.aromaticity())
    return vec
def protein_properties(seq):
    """Collect biochemical properties of a protein into a ``ProtProp``.

    ``seq`` is a Bio.Seq.Seq or str holding the protein sequence.  The
    instability index and GRAVY are reported as ``None`` when their
    computation raises ``KeyError``.
    """
    analysis = ProteinAnalysis(seq)

    def _none_on_key_error(compute):
        # Run one descriptor and map a KeyError to None.
        try:
            return compute()
        except KeyError:
            return None

    counts = analysis.count_amino_acids()
    aromaticity = analysis.aromaticity()
    pi_value = analysis.isoelectric_point()
    instability = _none_on_key_error(analysis.instability_index)
    gravy = _none_on_key_error(analysis.gravy)

    return ProtProp(
        aa=str(seq),
        gravy=gravy,
        aromaticity=aromaticity,
        isoelectric_point=pi_value,
        instability=instability,
        aa_counts=counts,
    )
def featureExtraction(train_df, test_df):
    """Replace peptide strings by numeric ProteinAnalysis descriptors.

    ``train_df`` carries the labels in column 0 and the peptides in column
    1; ``test_df`` carries the peptides in column 0.  Both frames are
    stacked, four descriptor columns are computed per peptide, and the
    peptide column is dropped.  Peptides containing 'X' or 'Z' get -1 in
    every descriptor column.

    Returns:
        (train_features, test_features, Y) where Y is the original label
        column of ``train_df``.
    """
    descriptor_columns = ('molecular_weight', 'isoelectric_point',
                          'aromaticity', 'stability')

    n_train = len(train_df)
    Y = train_df[0]
    train_df = train_df.drop(columns=0).rename(columns={1: 0})
    big = pd.concat([train_df, test_df], ignore_index=True)

    for column in descriptor_columns:
        big[column] = 0.0

    for row in range(len(big)):
        peptide = big.iloc[row, 0]
        if 'X' in peptide or 'Z' in peptide:
            # Invalid peptide: flag every descriptor with -1.
            values = (-1, -1, -1, -1)
        else:
            model = ProteinAnalysis(peptide)
            values = (
                model.molecular_weight(),
                model.isoelectric_point(),
                model.aromaticity(),
                model.instability_index(),
            )
        for column, value in zip(descriptor_columns, values):
            big.at[row, column] = value

    big = big.drop(columns=0)
    # The first n_train rows belong to the training set, the rest to test.
    return big.iloc[:n_train], big.iloc[n_train:], Y
def _protein_parameters(self, sequence):
    """Build the physicochemical property vector of a sequence.

    Args:
        sequence: str, amino acid sequence.

    Returns:
        property_arr: np array with, in order: molecular weight,
            aromaticity, instability index, GRAVY, isoelectric point, the
            three secondary structure fractions, the two molar extinction
            coefficients and ``self._net_charge(sequence)``.
    """
    analysis = ProteinAnalysis(sequence)

    property_arr = [
        analysis.molecular_weight(),
        analysis.aromaticity(),
        analysis.instability_index(),
        analysis.gravy(),
        analysis.isoelectric_point(),
    ]

    secondary = analysis.secondary_structure_fraction()
    property_arr.extend([secondary[0], secondary[1], secondary[2]])

    extinction = analysis.molar_extinction_coefficient()
    property_arr.extend([extinction[0], extinction[1]])

    property_arr.append(self._net_charge(sequence))
    return np.array(property_arr)
def physchem_props(data):
    """Append physicochemical properties of each sequon to its row.

    Args:
        data: Iterable of tab-separated lines; the second-to-last field of
            each line is the sequon the properties are computed for.

    Returns:
        list: A header line followed by one line per usable input row, each
        extended with molecular weight, GRAVY, aromaticity, instability
        index and isoelectric point.  Rows whose sequon is empty or contains
        'X' or '*' are dropped.

    If a property cannot be computed, the offending row is printed and the
    process exits with status 1.
    """
    header = "ID\tclass\tindex\tsequon\tsequence\tmol_weight\tgravy\taromaticity\tinstab_index\tiso_point\n"
    new_table = [header]
    for line in data:
        stripped = line.rstrip()
        split_line = stripped.split('\t')
        seq = split_line[-2]  # Sequon, not sequence
        if "X" in seq or '*' in seq or seq == '':
            continue  # Skip non-usable sequences, only negs
        try:
            a_seq = ProteinAnalysis(seq)
            results = [
                a_seq.molecular_weight(),
                a_seq.gravy(),
                a_seq.aromaticity(),
                a_seq.instability_index(),
                a_seq.isoelectric_point(),
            ]
        except Exception:
            # Was a bare ``except:``, which also intercepted
            # KeyboardInterrupt and SystemExit.
            print(split_line)
            sys.exit(1)
        new_table.append(
            stripped + "\t{}\t{}\t{}\t{}\t{}\n".format(*results))
    return new_table
def get_biopython_features(X):
    """Return a (len(X), 6) array of ProteinAnalysis features.

    Each row holds molecular weight, instability index, isoelectric point
    and the values of ``secondary_structure_fraction()`` for the
    corresponding sequence in ``X``.
    """
    features = np.zeros((X.shape[0], 6))
    for row, sequence in enumerate(X):
        analysis = ProteinAnalysis(sequence)
        values = [
            analysis.molecular_weight(),
            analysis.instability_index(),
            analysis.isoelectric_point(),
        ]
        values.extend(analysis.secondary_structure_fraction())
        features[row] = np.array(values)
    return features
def protein_analysis():
    """Controller: ask for a protein sequence and analyse it.

    Redirects to the account log-in action when ``session.username`` is
    unset.  When the form is accepted, the cleaned, upper-cased sequence and
    its ProteinAnalysis results are stored in ``session`` and the request is
    redirected to ``protein_analysis_output``.

    Returns:
        dict: ``{'form': form}`` for rendering when nothing was submitted.
    """
    # Identity test is the correct way to compare against None.
    if session.username is None:
        redirect(URL(r=request, c='account', f='log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    form = FORM(
        TABLE(
            TR(
                "Amino acid sequence: ",
                TEXTAREA(_type="text",
                         _name="sequence",
                         requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))
    if form.accepts(request.vars, session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
def parse_pro_sequence(self, p_seq, id=None, desc=None):
    """Record a protein sequence and its derived properties.

    Residues not contained in ``proteins`` are dropped.  The protein is
    reverse translated using, for every amino acid, the first codon in
    ``dna_codons`` that maps to it.  GC content and ProteinAnalysis
    results are then computed.

    All values are computed before anything is stored, so a failure no
    longer leaves ``self.id`` / ``self.description`` one entry longer than
    the other per-sequence lists.  Any exception is reported on stdout and
    swallowed.

    Keyword arguments:
    p_seq -- protein string sequence
    id -- id obtained from FASTA file record (default None)
    desc -- description obtained from FASTA file record (default None)
    """
    try:
        p_seq = ''.join([pro for pro in p_seq if pro in proteins])

        # Build the amino acid -> first matching codon table once instead
        # of rebuilding key/value lists for every residue.
        first_codon = {}
        for codon, amino_acid in dna_codons.items():
            first_codon.setdefault(amino_acid, codon)

        # reverse translate protein to nucleotide sequence
        n_seq = ''.join([first_codon[pro] for pro in p_seq])

        # GC content
        gc_content = self.calculate_gc_content(n_seq)

        # protein analysis methods
        analysis = ProteinAnalysis(p_seq)
        amino_acid_percent = analysis.get_amino_acids_percent()
        molecular_weight = analysis.molecular_weight()
        instability_index = analysis.instability_index()
        aromaticity = analysis.aromaticity()

        # Everything succeeded: extend all parallel lists together.
        self.id.append(id)
        self.description.append(desc)
        self.nucleotide_sequence.append(n_seq)
        self.protein_sequence.append(p_seq)
        self.gc_content.append(gc_content)
        self.amino_acid_dict.append(amino_acid_percent)
        self.molecular_weight.append(molecular_weight)
        self.instability_index.append(instability_index)
        self.aromaticity.append(aromaticity)
    except Exception as e:
        print('-'*80)
        print(f"Exception in parsing uploaded virus sequence: {e}")
        traceback.print_exc(file=sys.stdout)
        print('-'*80)
def calculate_physiochemical_features(temp_dict, sequence):
    """Add physicochemical descriptors of ``sequence`` to ``temp_dict``.

    ``temp_dict`` is updated in place with ten named descriptors and then
    with the per-residue percentages from ``get_amino_acids_percent()``.
    The molar extinction coefficient is stored as the mean of the two
    values returned by ProteinAnalysis.  Nothing is returned.
    """
    analyzed_seq = ProteinAnalysis(sequence)

    features = {
        "Charge at pH7": analyzed_seq.charge_at_pH(7),
        "Instability Index": analyzed_seq.instability_index(),
        "Molecular Wt": analyzed_seq.molecular_weight(),
        "Aromaticity": analyzed_seq.aromaticity(),
    }

    low, high = analyzed_seq.molar_extinction_coefficient()
    features["Molar Extinction Coeff"] = (float(low) + float(high)) / 2

    # Grand Average Hydropathy - higher value = more hydrophobic
    features["Gravy"] = analyzed_seq.gravy()
    features["Isoelectric pt"] = analyzed_seq.isoelectric_point()

    helix, turn, sheet = analyzed_seq.secondary_structure_fraction()
    features["Helix Fraction"] = helix
    features["Turn Fraction"] = turn
    features["Sheet Fraction"] = sheet

    temp_dict.update(features)

    # Added separately because get_amino_acids_percent() already returns a
    # dictionary of its own.
    temp_dict.update(analyzed_seq.get_amino_acids_percent())
def protparm(cudir, filename, name):
    """Classify the first FASTA record of a file by instability index.

    The file is read from ``cudir + "/" + name + "/" + filename``.

    Returns:
        tuple: ``(ii, stab, stab_coff)`` for the first record, where ``ii``
        is the instability index rounded to two decimals, ``stab`` is
        ``"stable"`` when ``ii < 40`` and ``"unstable"`` otherwise, and
        ``stab_coff`` is 1 or 0 accordingly.  ``None`` is returned when the
        file contains no records.
    """
    fasta_path = cudir + "/" + name + "/" + filename
    # The context manager closes the handle, which was previously leaked.
    with open(fasta_path) as handle:
        for fasta in SeqIO.parse(handle, "fasta"):
            # Compute the index once instead of once per use.
            ii = round(ProteinAnalysis(str(fasta.seq)).instability_index(), 2)
            if ii < 40:
                return ii, "stable", 1
            return ii, "unstable", 0
    return None
def get_protein_analysis(aa):
    """Return ProteinAnalysis descriptors of ``aa`` as a flat list.

    Order: molecular weight, aromaticity, instability index, isoelectric
    point, GRAVY, followed by the secondary structure fractions.
    """
    pa = ProteinAnalysis(aa)
    analyze = [
        pa.molecular_weight(),
        pa.aromaticity(),
        pa.instability_index(),
        pa.isoelectric_point(),
        pa.gravy(),
    ]
    analyze.extend(pa.secondary_structure_fraction())
    return analyze
def get_instability_index(self):
    """
    Compute the Biopython instability index of ``self.ProteinSequence``.

    :return: dictionary with the single key 'Instability_index'
    """
    instability = ProteinAnalysis(self.ProteinSequence).instability_index()
    return {'Instability_index': instability}
def properties(toxin_faa, antitoxin_faa, out):
    """Write pI, weight and instability for the two genes of every locus.

    Both FASTA files are parsed; each record is assigned to a locus via
    ``getNameAndPosition`` (records without a start position are skipped).
    Sequences that still contain '*' or 'X' after stripping terminal '*'
    get 0 for all three properties as a missing-data flag.  Loci are then
    ordered with ``orderPairs`` and only loci with exactly two genes are
    written.

    Args:
        toxin_faa: Path to the toxin protein FASTA file.
        antitoxin_faa: Path to the antitoxin protein FASTA file.
        out: Output prefix; the table goes to ``<out>.properties.txt``.

    Returns:
        str: Path of the written file.
    """
    # Build a dictionary of {locus:[{properties:values},{properties:values}]}
    from collections import defaultdict
    from Bio import SeqIO

    loci = defaultdict(list)
    for f in [toxin_faa, antitoxin_faa]:
        # Parse FASTA files.  Plain 'r' replaces 'rU', which Python 3.11+
        # rejects.
        with open(f, 'r') as handle:
            for record in SeqIO.parse(handle, 'fasta'):
                locus, start = getNameAndPosition(record)
                if not start:
                    continue
                aaseq = str(record.seq).strip("*")
                # Omit sequences with missing positions or premature stops
                # give them 0 as flag for missing data instead
                if "*" not in aaseq and "X" not in aaseq:
                    data = ProteinAnalysis(aaseq)
                    loci[locus].append({
                        'start': start,
                        'pI': data.isoelectric_point(),
                        'weight': data.molecular_weight(),
                        'instability': data.instability_index()
                    })
                else:
                    loci[locus].append({
                        'start': start,
                        'pI': 0,
                        'weight': 0,
                        'instability': 0
                    })

    # Order genes in a locus positionally
    loci = orderPairs(loci)

    # Write to output file
    outfile = ".".join([out, "properties", "txt"])
    with open(outfile, 'w') as o:
        header = "\t".join([
            "locus", "gene1_pI", "gene2_pI", "gene1_weight", "gene2_weight",
            "gene1_instability", "gene2_instability"
        ])
        o.write("#" + header.upper() + "\n")
        # items() exists on both Python 2 and 3; iteritems() only on 2.
        for locus, gene in loci.items():
            if len(gene) != 2:
                continue
            line = map(str, [
                locus, gene[0]['pI'], gene[1]['pI'], gene[0]['weight'],
                gene[1]['weight'], gene[0]['instability'],
                gene[1]['instability']
            ])
            o.write("\t".join(line) + "\n")
    return outfile
def pept_counter(self):
    """Store the instability index of ``self.pept`` in ``self.inst``.

    Nothing happens when ``self.pept`` is empty.  A non-positive index is
    replaced by 100.
    """
    if not self.pept:
        return
    instability = ProteinAnalysis(self.pept).instability_index()
    self.inst = instability if instability > 0 else 100
def make_dataset(fasta):
    """Build a feature table from a FASTA file and pickle it.

    The target label is derived from the file path: 0 when it contains
    'tar', otherwise 1 when it contains 'pop'.  One row of features is
    computed per record; the table is printed and written to
    ``name + '_set.pkl'``.

    Args:
        fasta: Path to the FASTA file.

    Raises:
        ValueError: If the path contains neither 'tar' nor 'pop', so no
            label can be assigned.
    """
    # assign whether it's from tardigrades 'tar' or poplars 'pop'
    if 'tar' in fasta:
        target = 0
    elif 'pop' in fasta:
        target = 1
    else:
        # Previously ``target`` stayed unbound and the function failed with
        # a NameError only after every record had been processed.
        raise ValueError(
            "cannot infer target from %r: expected 'tar' or 'pop'" % (fasta,))

    # a list of dictionaries containing features for all sequences
    ls_features = []
    for record in SeqIO.parse(fasta, "fasta"):
        analysed_seq = ProteinAnalysis(str(record.seq))
        frac = analysed_seq.secondary_structure_fraction()

        # the dictionary containing features for a single sequence
        dict_features = {
            'length': len(record.seq),
            'mol_weight': analysed_seq.molecular_weight(),
            # Was molecular_weight() by mistake.
            'aromaticity': analysed_seq.aromaticity(),
            'stability': analysed_seq.instability_index(),
            'flexibility': analysed_seq.flexibility(),
            'isoelectric': analysed_seq.isoelectric_point(),
            'helix': frac[0],
            'turn': frac[1],
            'sheet': frac[2],
        }
        # add the amino acid composition of the entire sequence
        dict_features.update(analysed_seq.get_amino_acids_percent())
        ls_features.append(dict_features)

    df = pd.DataFrame(ls_features)
    df['target'] = target
    print(df)
    # NOTE(review): ``name`` is not defined in this function; it must be
    # provided at module level by the surrounding file.
    df.to_pickle(name + '_set.pkl')
def biopython_protein_analysis(inseq):
    """Utilize Biopython's ProteinAnalysis module to return general sequence properties of an amino acid string.

    For full definitions see: http://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam.ProteinAnalysis-class.html

    Args:
        inseq: Amino acid sequence

    Returns:
        dict: Dictionary of sequence properties. Some definitions include:
        instability_index: Any value above 40 means the protein is unstable (has a short half life).
        secondary_structure_fraction: Percentage of protein in helix, turn or sheet

    TODO:
        Finish definitions of dictionary

    """
    inseq = ssbio.protein.sequence.utils.cast_to_str(inseq)
    analysed_seq = ProteinAnalysis(inseq)

    info_dict = {}
    info_dict['amino_acids_content-biop'] = analysed_seq.count_amino_acids()
    info_dict['amino_acids_percent-biop'] = analysed_seq.get_amino_acids_percent()
    info_dict['length-biop'] = analysed_seq.length
    info_dict['monoisotopic-biop'] = analysed_seq.monoisotopic
    info_dict['molecular_weight-biop'] = analysed_seq.molecular_weight()
    info_dict['aromaticity-biop'] = analysed_seq.aromaticity()
    info_dict['instability_index-biop'] = analysed_seq.instability_index()
    # TODO: What is flexibility?
    info_dict['flexibility-biop'] = analysed_seq.flexibility()
    info_dict['isoelectric_point-biop'] = analysed_seq.isoelectric_point()

    # grand average of hydrophobicity
    info_dict['gravy-biop'] = analysed_seq.gravy()

    # The secondary structure fractions are stored under one key each.
    # They are computed once here rather than once per key.
    fractions = analysed_seq.secondary_structure_fraction()
    info_dict['percent_helix_naive-biop'] = fractions[0]
    info_dict['percent_turn_naive-biop'] = fractions[1]
    info_dict['percent_strand_naive-biop'] = fractions[2]

    return info_dict
def phyChemProps(seq):
    """Return ten ProteinAnalysis descriptors of ``seq`` as a list.

    Order: aromaticity, the three secondary structure fractions, GRAVY,
    instability index, isoelectric point, molecular weight and the two
    molar extinction coefficients.
    """
    X = ProteinAnalysis(seq)
    # Multi-valued results are computed once and indexed, instead of being
    # recomputed for every element.
    aromaticity = X.aromaticity()
    fractions = X.secondary_structure_fraction()
    gravy = X.gravy()
    instability = X.instability_index()
    isoelectric = X.isoelectric_point()
    weight = X.molecular_weight()
    extinction = X.molar_extinction_coefficient()
    return [
        aromaticity,
        fractions[0],
        fractions[1],
        fractions[2],
        gravy,
        instability,
        isoelectric,
        weight,
        extinction[0],
        extinction[1],
    ]
def analysis(listofaas, outlist):
    """Append [molecular weight, instability index, pI] rows to ``outlist``.

    Sequences for which ProteinAnalysis raises ``ValueError`` or
    ``KeyError`` are skipped; ``outlist`` only receives complete rows.

    Args:
        listofaas: Iterable of amino acid sequences.
        outlist: List that is extended in place, one row per sequence.

    Returns:
        int: Number of skipped sequences.  (The counter used to be reset on
        every iteration and was never reported.)
    """
    skipped = 0
    for prot in listofaas:
        try:
            p = ProteinAnalysis(prot)
            row = [
                p.molecular_weight(),
                p.instability_index(),
                p.isoelectric_point(),
            ]
        except (ValueError, KeyError):
            skipped += 1
        else:
            outlist.append(row)
    return skipped
def properties(toxin_faa,antitoxin_faa,out):
    """Write pI, weight and instability for the two genes of every locus.

    Both FASTA files are parsed; each record is assigned to a locus via
    ``getNameAndPosition`` (records without a start position are skipped).
    Sequences that still contain '*' or 'X' after stripping terminal '*'
    get 0 for all three properties as a missing-data flag.  Loci are then
    ordered with ``orderPairs`` and only loci with exactly two genes are
    written.

    Args:
        toxin_faa: Path to the toxin protein FASTA file.
        antitoxin_faa: Path to the antitoxin protein FASTA file.
        out: Output prefix; the table goes to ``<out>.properties.txt``.

    Returns:
        str: Path of the written file.
    """
    # Build a dictionary of {locus:[{properties:values},{properties:values}]}
    from collections import defaultdict
    from Bio import SeqIO

    loci = defaultdict(list)
    for f in [toxin_faa,antitoxin_faa]:
        # Parse FASTA files.  Plain 'r' replaces 'rU', which Python 3.11+
        # rejects.
        with open(f,'r') as handle:
            for record in SeqIO.parse(handle,'fasta'):
                locus,start = getNameAndPosition(record)
                if not start:
                    continue
                aaseq = str(record.seq).strip("*")
                # Omit sequences with missing positions or premature stops
                # give them 0 as flag for missing data instead
                if "*" not in aaseq and "X" not in aaseq:
                    data = ProteinAnalysis(aaseq)
                    loci[locus].append({
                        'start': start,
                        'pI': data.isoelectric_point(),
                        'weight': data.molecular_weight(),
                        'instability': data.instability_index()
                    })
                else:
                    loci[locus].append({
                        'start': start,
                        'pI': 0,
                        'weight': 0,
                        'instability': 0
                    })

    # Order genes in a locus positionally
    loci = orderPairs(loci)

    # Write to output file
    outfile = ".".join([out,"properties","txt"])
    with open(outfile,'w') as o:
        header = "\t".join(["locus",
                            "gene1_pI","gene2_pI",
                            "gene1_weight","gene2_weight",
                            "gene1_instability","gene2_instability"
                            ])
        o.write("#"+ header.upper() + "\n")
        # items() exists on both Python 2 and 3; iteritems() only on 2.
        for locus, gene in loci.items():
            if len(gene) != 2:
                continue
            line = map(str, [
                locus,gene[0]['pI'],gene[1]['pI'],
                gene[0]['weight'],gene[1]['weight'],
                gene[0]['instability'],gene[1]['instability']
            ])
            o.write("\t".join(line)+"\n")
    return outfile
def __init__(self, sequence):
    """Compute and store a set of descriptors for a peptide sequence.

    Descriptors come from three sources: Biopython's ``ProteinAnalysis``,
    module-level ``calculate_*`` helpers, and R code evaluated through
    ``r(...)``.

    Args:
        sequence: Amino acid sequence of the peptide.
    """
    self.sequence = sequence
    self.sequence_length = len(sequence)
    analysis = ProteinAnalysis(sequence)
    self.amino_acid_percents = analysis.get_amino_acids_percent()
    self.amino_acids_composition = calculate_amino_acids_composition(sequence)
    self.aromaticity = analysis.aromaticity()
    self.instability = analysis.instability_index()
    self.flexibility = calculate_flexibility(sequence)
    # Display name plus scale dictionary for every protein scale profile.
    protein_scale_parameters = [{'name': 'Hydrophilicity', 'dictionary': hw},
                                {'name': 'Surface accessibility', 'dictionary': em},
                                {'name': 'Janin Interior to surface transfer energy scale', 'dictionary': ja},
                                {'name': 'Bulkiness', 'dictionary': bulkiness},
                                {'name': 'Polarity', 'dictionary': polarity},
                                {'name': 'Buried residues', 'dictionary': buried_residues},
                                {'name': 'Average area buried', 'dictionary': average_area_buried},
                                {'name': 'Retention time', 'dictionary': retention_time}]
    self.protein_scales = calculate_protein_scales(analysis, protein_scale_parameters)
    self.isoelectric_point = analysis.isoelectric_point()
    self.secondary_structure_fraction = calculate_secondary_structure_fraction(analysis)
    self.molecular_weight = analysis.molecular_weight()
    # GRAVY value, stored under the name kyte_plot.
    self.kyte_plot = analysis.gravy()
    self.pefing = calculate_pefing(sequence)

    # next parameters are calculated using R.Peptides
    r('require(Peptides)')
    # NOTE(review): the sequence is interpolated into R source unescaped; a
    # double quote in ``sequence`` would break the statement -- confirm
    # inputs are validated upstream.
    r('sequence = "{0}"'.format(sequence))
    self.aliphatic_index = r('aindex(sequence)')[0]
    self.boman_index = r('boman(sequence)')[0]
    self.charges = calculate_charges(sequence, 1.0, 14.0, 0.5, 'Lehninger')
    # NOTE(review): this evaluates R's ``seq(sequence)``; presumably a
    # hydrophobicity function was intended -- TODO confirm.
    self.hydrophobicity = r('seq(sequence)')[0]

    # Named rotation angles passed to the hydrophobic moment and peptide
    # type helpers.
    angles = [{'name': 'Alpha-helix', 'angle': -47},
              {'name': '3-10-helix', 'angle': -26},
              {'name': 'Pi-helix', 'angle': -80},
              {'name': 'Omega', 'angle': 180},
              {'name': 'Antiparallel beta-sheet', 'angle': 135},
              {'name': 'Parallel beta-sheet', 'angle': 113}]
    # The polygly-polypro entry is added only when P and G together exceed
    # 0.3 of the amino acid percentages.
    if self.amino_acid_percents['P'] + self.amino_acid_percents['G'] > 0.3:
        angles.append({'name': 'Polygly-polypro helix', 'angle': 153})
    self.hydrophobic_moments = calculate_hydrophobic_moments(sequence, angles)
    self.kidera_factors = calculate_kidera_factors(sequence)
    self.peptide_types = calculate_peptide_types(sequence, angles)
def calculate_properties_from_sequence(self):
    """
    Calculate sequence-based properties of the peptide and store them on
    the instance.

    Reads:
    self.sequence - amino acid sequence of the peptide

    Sets:
    self.avg_hydro - average Eisenberg hydrophobicity per residue
    self.aromaticity, self.aa_percent, self.instability_index,
    self.isoelectric_point - Biopython ProtParam parameters

    Raises KeyError if the sequence contains a residue that is missing from
    the Eisenberg table.
    """
    # Hydrophobicity -> Eisenberg scale
    hydrophobicity = {
        'A': 0.620,
        'R': -2.530,
        'N': -0.780,
        'D': -0.900,
        'C': 0.290,
        'Q': -0.850,
        'E': -0.740,
        'G': 0.480,
        'H': -0.400,
        'Y': 0.260,
        'I': 1.380,
        'L': 1.060,
        'K': -1.500,
        'M': 0.640,
        'F': 1.190,
        'P': 0.120,
        'S': -0.180,
        'T': -0.050,
        'W': 0.810,
        'V': 1.080
    }
    residue_scores = [hydrophobicity[resi] for resi in self.sequence]
    # The attribute is documented as an average, but the plain sum was
    # stored; divide by the number of residues.  An empty sequence keeps
    # the previous value of 0.
    if residue_scores:
        self.avg_hydro = sum(residue_scores) / len(residue_scores)
    else:
        self.avg_hydro = 0

    # ProtParam properties
    prot_parameters = ProteinAnalysis(self.sequence)
    self.aromaticity = prot_parameters.aromaticity()
    self.aa_percent = prot_parameters.get_amino_acids_percent()
    self.instability_index = prot_parameters.instability_index()
    self.isoelectric_point = prot_parameters.isoelectric_point()
def bio_feat(record):
    """Return a numeric feature vector for a sequence record.

    Before analysis the sequence is cleaned: 'X' is removed, 'U' becomes
    'C', 'B' becomes 'N' and 'Z' becomes 'Q'.

    Returns:
        np.ndarray: sequence length, molecular weight, isoelectric point,
        instability index, the secondary structure fractions, the amino
        acid counts and the amino acid percentages, in that order.
    """
    # Plain string operations replace the MutableSeq(...).toseq() round
    # trip, which depended on an API removed from current Biopython and
    # produced the same text.
    clean_seq = (str(record.seq)
                 .replace("X", "")
                 .replace("U", "C")
                 .replace("B", "N")
                 .replace('Z', 'Q'))

    ### features
    seq_length = len(clean_seq)
    analysed_seq = ProteinAnalysis(clean_seq)
    molecular_weight = analysed_seq.molecular_weight()
    amino_percent = analysed_seq.get_amino_acids_percent().values()
    isoelectric_points = analysed_seq.isoelectric_point()
    count = analysed_seq.count_amino_acids().values()
    instability_index = analysed_seq.instability_index()
    secondary_structure_fraction = analysed_seq.secondary_structure_fraction()

    return np.array(
        [seq_length, molecular_weight, isoelectric_points, instability_index]
        + list(secondary_structure_fraction)
        + list(count)
        + list(amino_percent))
def biochemical_properties(sequence: str) -> Dict[str, Any]:
    """Return a dictionary of biochemical properties of ``sequence``.

    Keys: 'Isoelectric point', 'Molecular weight', 'Aromaticity',
    'Instability index', 'GRAVY', 'H-bonding percent', 'Melting temp' and
    'LCC'.
    """
    # Define objects used for calculations
    analysis_object = ProteinAnalysis(sequence)
    # The two objects below are constructed but not read afterwards.
    descriptor_object = PyPro.GetProDes(sequence)
    sequence_object = Seq(sequence)

    # TODO(Ahmed): Verify that all these calculations are actually returning reasonable values
    # For example, it says the percent composition of every amino acid is zero when I run
    # calculate_biochem_properties.biochemical_properties('qwertyipasdfghklcvnm')
    props = {}
    props['Isoelectric point'] = analysis_object.isoelectric_point()
    props['Molecular weight'] = analysis_object.molecular_weight()  # Daltons? Amu? g/mol?
    props['Aromaticity'] = analysis_object.aromaticity()
    props['Instability index'] = analysis_object.instability_index()
    props['GRAVY'] = analysis_object.gravy()
    props['H-bonding percent'] = h_bonding_percent(sequence)
    props['Melting temp'] = melting_temp(sequence)
    props['LCC'] = lcc.lcc_simp(sequence)
    return props
def protein_analysis():
    """Controller: ask for a protein sequence and analyse it.

    Redirects to the account log-in action when ``session.username`` is
    unset.  When the form is accepted, the cleaned, upper-cased sequence and
    its ProteinAnalysis results are stored in ``session`` and the request is
    redirected to ``protein_analysis_output``.

    Returns:
        dict: ``{'form': form}`` for rendering when nothing was submitted.
    """
    # Identity test is the correct way to compare against None.
    if session.username is None:
        redirect(URL(r=request,f='../account/log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    form = FORM(TABLE(
            TR("Amino acid sequence: ",
               TEXTAREA(_type="text",
                        _name="sequence",
                        requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))
    if form.accepts(request.vars,session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
def get_features(seq):
    """Compute global features of a protein sequence.

    Parameters
    ----------
    seq : str
        protein sequence

    Return
    ----------
    dictionary:
        global features of the protein sequence.  Counts, length and
        entropy use the full sequence; the ProteinAnalysis descriptors use
        the sequence with the undefined symbols removed.
    """
    undefined = ('X', 'B', 'Z', "'", 'O', 'U')

    features = {}
    features['undefined_count'] = sum(1 for x in seq if x in undefined)
    features['length'] = len(seq)
    features['perc_undefined_count'] = (
        features['undefined_count'] / features['length'])
    features['entropy'] = entropy(seq)
    features['ideal_entropy'] = entropy_ideal(len(seq))
    features['perc_entropy'] = features['entropy'] / features['ideal_entropy']
    features['hydr_count'] = len([x for x in seq if x in hydrophobic_proteins])
    features['polar_count'] = len([x for x in seq if x in polar_proteins])
    features['buried'] = sum(
        buried[x] for x in seq if x in hydrophobic_proteins)

    # Drop undefined symbols before handing the sequence to ProteinAnalysis.
    defined_seq = ''.join(x for x in seq if x not in undefined)
    protein = ProteinAnalysis(defined_seq)
    features['gravy'] = protein.gravy()
    features['molecular_weight'] = protein.molecular_weight()
    features['aromaticity'] = protein.aromaticity()
    features['instability_index'] = protein.instability_index()
    features['isoelectric_point'] = protein.isoelectric_point()
    helix, turn, sheet = protein.secondary_structure_fraction()
    features['helix'] = helix
    features['turn'] = turn
    features['sheet'] = sheet
    features.update(protein.count_amino_acids())
    return features
def seqs_to_features(self, seqs, no_seqs):
    """Turn groups of sequences into a (no_seqs, 32) feature matrix.

    ``seqs`` is an iterable of sequence collections that are chained
    together; row ``i`` belongs to the ``i``-th chained sequence.  The
    first 23 columns hold relative letter frequencies, the last nine hold
    (left to right) GRAVY, length, the three secondary structure fractions
    in reverse order, isoelectric point, instability index, aromaticity
    and molecular weight.  For the ProteinAnalysis columns 'X', 'B' and
    'U' are replaced by 'A'.
    """
    alphabet = 'ABCDEFGHIKLMNPQRSTUVWXY'  # no JOZ
    X = np.zeros((no_seqs, 32))

    for row, sequence in enumerate(chain(*seqs)):
        # relative frequency of every alphabet letter
        length = len(sequence)
        for col, letter in enumerate(alphabet):
            X[row, col] = sequence.count(letter) / length

        # other analysis, on the sequence with X/B/U replaced by A
        substituted = (sequence.replace('X', 'A')
                       .replace('B', 'A')
                       .replace('U', 'A'))
        analysis = ProteinAnalysis(substituted)
        weight = analysis.molecular_weight()
        aromaticity = analysis.aromaticity()
        instability = analysis.instability_index()
        isoelectric = analysis.isoelectric_point()
        fractions = analysis.secondary_structure_fraction()
        gravy = analysis.gravy()  # mean hydrophobicity

        # Fill the nine trailing columns in one assignment.
        X[row, -9:] = [
            gravy,
            length,
            fractions[2],
            fractions[1],
            fractions[0],
            isoelectric,
            instability,
            aromaticity,
            weight,
        ]
    return X
def GetFeatures(My_seq):
    """Return a dictionary of ProteinAnalysis features of ``My_seq``.

    Single-valued entries: 'Molecular_weight', 'Aromaticity',
    'Instability_index', 'Isoelectric_point'.  Multi-valued entries:
    'Flexibility', 'Second_structure_fraction', 'Count_amino_acids',
    'Amino_acids_percent'.
    """
    # A second ProteinAnalysis object used to be built and thrown away.
    analysed_seq = ProteinAnalysis(My_seq)

    Features = {}
    # Single-valued features
    Features["Molecular_weight"] = analysed_seq.molecular_weight()
    Features["Aromaticity"] = analysed_seq.aromaticity()
    Features["Instability_index"] = analysed_seq.instability_index()
    Features["Isoelectric_point"] = analysed_seq.isoelectric_point()

    # Multi-valued features
    Features["Flexibility"] = analysed_seq.flexibility()  # list
    Features["Second_structure_fraction"] = analysed_seq.secondary_structure_fraction()  # 3-tuple
    Features["Count_amino_acids"] = analysed_seq.count_amino_acids()  # dict, 20 entries
    Features["Amino_acids_percent"] = analysed_seq.get_amino_acids_percent()  # dict, 20 entries

    return Features
def physchem_props(ara_d):
    """Calculate the physicochemical properties per protein in ara_d.

    For each protein the ``"sequence"`` entry is analysed and the results
    are stored under ``"Properties"``.  Sequences containing 'X' or '*' are
    skipped; for a skipped '*' sequence whose ``"pos"`` entry is not an
    empty list the protein key is printed.

    Args:
        ara_d: dict mapping protein id to a dict with at least the keys
            ``"sequence"`` and ``"pos"``.

    Returns:
        dict: The same ``ara_d`` object, updated in place.
    """
    keys = [
        "mol_weight",
        "gravy",
        "aromaticity",
        "instab_index",
        "flexi",
        "iso_point",
        "seq_struct",
    ]
    for protein in ara_d:
        seq = ara_d[protein]["sequence"]
        if "X" in seq:
            continue  # Skip non-usable sequences, only negs
        if '*' in seq:
            if ara_d[protein]["pos"] != []:
                print(protein)
            continue

        a_seq = ProteinAnalysis(seq)
        results = [
            a_seq.molecular_weight(),
            a_seq.gravy(),
            a_seq.aromaticity(),
            a_seq.instability_index(),
            a_seq.flexibility(),
            a_seq.isoelectric_point(),
            a_seq.secondary_structure_fraction(),
        ]
        # Update ara_d with new physchem properties
        ara_d[protein]["Properties"] = dict(zip(keys, results))
    return ara_d
def parse_nuc_sequence(self, n_seq, id=None, desc=None):
    """ Parses valid RNA sequence, translates nucleotides, calculates GC
    content and other methods available from ProteinAnalysis() in
    BioPython module.

    All values are computed before anything is stored, so a failure no
    longer leaves ``self.id`` / ``self.description`` /
    ``self.nucleotide_sequence`` longer than the other per-sequence lists.
    Any exception is reported on stdout and swallowed.

    Keyword arguments:
    n_seq -- valid string sequence
    id -- id obtained from FASTA file record (default None)
    desc -- description obtained from FASTA file record (default None)
    """
    try:
        # translate nucleotide string sequence
        p_seq = self.translate_nucleotides(n_seq)

        # GC content
        gc_content = self.calculate_gc_content(n_seq)

        # protein analysis methods
        analysis = ProteinAnalysis(p_seq)
        amino_acid_percent = analysis.get_amino_acids_percent()
        molecular_weight = analysis.molecular_weight()
        instability_index = analysis.instability_index()
        aromaticity = analysis.aromaticity()

        # Everything succeeded: extend all parallel lists together.
        self.id.append(id)
        self.description.append(desc)
        self.nucleotide_sequence.append(n_seq)
        self.protein_sequence.append(p_seq)
        self.gc_content.append(gc_content)
        self.amino_acid_dict.append(amino_acid_percent)
        self.molecular_weight.append(molecular_weight)
        self.instability_index.append(instability_index)
        self.aromaticity.append(aromaticity)
    except Exception as e:
        print('-'*80)
        print(f"Exception in parsing uploaded virus sequence: {e}")
        traceback.print_exc(file=sys.stdout)
        print('-'*80)
def biopython_proteinanalysis_seq(seq, scaling=False):
    """Return a flat dictionary of ProteinAnalysis features for ``seq``.

    Contains min/max/std of the flexibility profile, GRAVY, instability
    index, isoelectric point, both molar extinction coefficients,
    molecular weight, the secondary structure fractions, one
    ``percent:<aa>`` entry per amino acid and one ``prop_res_<name>``
    entry per group in ``property_residues`` (sum of the member
    percentages).  ``scaling`` is accepted but not used here.
    """
    res = ProteinAnalysis(seq)
    d = {}

    flex = np.array(res.flexibility())
    d['flex:min'] = flex.min()
    d['flex:max'] = flex.max()
    d['flex:std'] = flex.std()

    d['gravy'] = res.gravy()
    d['instability_index'] = res.instability_index()
    d['isoelectric_point'] = res.isoelectric_point()

    reduced, cysteines = res.molar_extinction_coefficient()
    d['molar_extinction_coefficient_reduced'] = reduced
    d['molar_extinction_coefficient_cysteines'] = cysteines

    d['molecular_weight'] = res.molecular_weight()

    helix, turn, strand = res.secondary_structure_fraction()
    d['percent_helix_naive'] = helix
    d['percent_turn_naive'] = turn
    d['percent_strand_naive'] = strand

    aap = res.get_amino_acids_percent()
    for aa in sorted(aap):
        d['percent:%s' % aa] = aap[aa]

    # Residues missing from the percentages contribute 0 to a group.
    for key, value in property_residues.items():
        d['prop_res_%s' % key] = sum(aap.get(x, 0) for x in value)
    return d
# Per-sequence feature accumulation.  The other accumulator lists used
# below (isoelectricPt, aromaticity, aminoPercent, secstruct, gravy,
# molweight, hydrophob, hydrophil, surface) and ``sequences`` are not
# created in this fragment.
instidx=[]
flex=[]
for seq in sequences:
    X=ProteinAnalysis(str(seq))
    isoelectricPt.append(X.isoelectric_point())
    aromaticity.append(X.aromaticity())
    aminoPercent.append(X.get_amino_acids_percent())
    secstruct.append(X.secondary_structure_fraction())
    # These features throw Key & Value Errors due to non standard amino acids
    # (i.e. out of the 20 standard ones) e.g. X, U etc
    # NOTE(review): if a call after the first append raises, the except
    # branch appends placeholders on top of the values already appended,
    # so the lists can get out of step -- confirm this cannot happen.
    try:
        gravy.append(X.gravy())
        molweight.append(X.molecular_weight())
        instidx.append(X.instability_index())
        flex.append(X.flexibility())
        hydrophob.append(X.protein_scale(ProtParamData.kd, 9, 0.4))
        hydrophil.append(X.protein_scale(ProtParamData.hw, 9, 0.4))
        surface.append(X.protein_scale(ProtParamData.em, 9, 0.4))
    except (KeyError,ValueError):
        # Placeholder values for sequences that cannot be analysed.
        gravy.append(0)
        molweight.append(0)
        instidx.append(0)
        flex.append([0,0])
        hydrophob.append([0,0])
        hydrophil.append([0,0])
        surface.append([0,0])
isoelectricPt_df = pd.DataFrame(isoelectricPt,columns=['isoelectricPt'])
print('done') with temppathlib.TemporaryDirectory() as tmpdir: # unzip the file with all the test PDBs with zipfile.ZipFile(args.infile, "r") as zip_: zip_.extractall(tmpdir.path) for test_pdb in tmpdir.path.glob("*.pdb"): for record in SeqIO.parse(test_pdb, "pdb-atom"): sequence = str(record.seq).replace('X', 'G') protein = ProteinAnalysis(str(sequence)) p_len.append(len(sequence)) mol_w.append(protein.molecular_weight()) iso_p.append(protein.isoelectric_point()) smell.append(protein.aromaticity()) taste_factor.append(protein.gravy()) insta_ind.append(protein.instability_index()) char_at_acid.append(protein.charge_at_pH(1)) char_at_neutral.append(protein.charge_at_pH(7)) char_at_base.append(protein.charge_at_pH(14)) helter_skeler.append(protein.secondary_structure_fraction()[0]) turnip.append(protein.secondary_structure_fraction()[1]) garfield.append(protein.secondary_structure_fraction()[2]) for x in amino_acids: n = protein.count_amino_acids()[x] for y in d_count.keys(): if y[-1] == x: d_count[y].append(n) for a in amino_acids: m = protein.get_amino_acids_percent()[a] for b in d_perc.keys(): if b[-1] == a:
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
from Bio import SeqIO

# Print a series of ProteinAnalysis results for every record of the sample
# FASTA file, one result per line.
with open('../../samples/pdbaa') as fh:
    for rec in SeqIO.parse(fh, 'fasta'):
        myprot = ProteinAnalysis(str(rec.seq))
        reports = (
            myprot.count_amino_acids,
            myprot.get_amino_acids_percent,
            myprot.molecular_weight,
            myprot.aromaticity,
            myprot.instability_index,
            myprot.flexibility,
            myprot.isoelectric_point,
            myprot.secondary_structure_fraction,
        )
        for report in reports:
            print(report())
        # Profile on the ProtParamData.kd scale, window 9, edge weight .4
        print(myprot.protein_scale(ProtParamData.kd, 9, .4))
#!/usr/bin/env python
import sys
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Read FASTA records from stdin and write one tab-separated row of
# ProteinAnalysis properties per record to stdout.
#
# The seventh column holds the aromaticity, so the header names it that way;
# it was previously labelled "monoisotpoic", which did not describe the
# value written.
sys.stdout.write("ID\tMW\tIP\tgravy\tlength\tinstability\taromaticity\tSequence\n")
for record in SeqIO.parse(sys.stdin, "fasta"):
    a = ProteinAnalysis(str(record.seq))

    properties = [
        record.id,
        a.molecular_weight(),
        a.isoelectric_point(),
        a.gravy(),
        a.length,
        a.instability_index(),
        a.aromaticity(),
        # always last column to make the output more readable
        a.sequence,
    ]
    sys.stdout.write('\t'.join(map(str, properties)) + "\n")
def main(databasePassword, schemaProteins, tableProteinInfo, tableStability):
    """Rebuild the stability table from the stored protein sequences.

    For every (UPAccession, Sequence) row selected from ``tableProteinInfo``
    this computes

    * the N-terminal half life, looked up from the first residue, and
    * the instability index reported by ``ProteinAnalysis``,

    then truncates ``tableStability`` and inserts one
    ``(accession, halfLife, instabilityIndex)`` tuple per protein.  A value of
    -1 marks a quantity that could not be determined.

    :param databasePassword: password handed to ``mysql.openConnection``.
    :param schemaProteins: schema handed to ``mysql.openConnection``.
    :param tableProteinInfo: table the accessions and sequences are read from.
    :param tableStability: table that is truncated and refilled.
    """

    # Define N-terminus half life values (explanation http://en.wikipedia.org/wiki/N-end_rule and the ProtParam tool).
    halfLife = {
        'A': 4.4, 'C': 1.2, 'D': 1.1, 'E': 1.0, 'F': 1.1,
        'G': 30.0, 'H': 3.5, 'I': 20.0, 'K': 1.3, 'L': 5.5,
        'M': 30.0, 'N': 1.4, 'P': 20.0, 'Q': 0.8, 'R': 1.0,
        'S': 1.9, 'T': 7.2, 'V': 100.0, 'W': 2.8, 'Y': 2.8,
    }

    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    try:
        # Extract all the sequences stored in the database.
        cursor = mysql.tableSELECT(cursor, 'UPAccession, Sequence', tableProteinInfo)
        results = cursor.fetchall()

        # Calculate the half life and instability index for each protein.
        stabilityTuples = []
        for row in results:
            accession = row[0]
            sequence = row[1]

            # -1 when the N-terminal is not an amino acid with an associated
            # half-life value (e.g. X, B, etc.).  Slicing instead of indexing
            # also yields -1 for an empty sequence rather than an IndexError.
            protHalfLife = halfLife.get(sequence[:1], -1)

            analysedSeq = ProteinAnalysis(sequence)
            try:
                instabilityIndex = analysedSeq.instability_index()
            except (KeyError, ZeroDivisionError):
                # KeyError: a residue missing from the dipeptide weight table.
                # ZeroDivisionError: an empty sequence (division by its length).
                instabilityIndex = -1
                # Single pre-formatted argument so the output is the same
                # whether print is a statement or a function.
                print('\tContains invalid aa code:  %s' % (accession,))

            stabilityTuples.append((accession, protHalfLife, instabilityIndex))

        # NOTE(review): the table name is concatenated into the statement;
        # callers must pass a trusted identifier.
        cursor.execute('TRUNCATE TABLE ' + tableStability)

        # With no proteins there is nothing to insert (and no first tuple to
        # size the placeholder list from), so the table is simply left empty.
        if stabilityTuples:
            values = '(' + ','.join(['%s'] * len(stabilityTuples[0])) + ')'
            mysql.tableINSERT(cursor, tableStability, values, stabilityTuples)
    finally:
        # Release the connection even when one of the steps above fails.
        mysql.closeConnection(conn, cursor)

# Legacy hand-rolled instability index, kept commented out for reference;
# main() relies on ProteinAnalysis.instability_index() instead.
#def instability_index(prot, sequence):
#
#    # A two dimensional dictionary for calculating the instability index.
#    # Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990).
#    # It is based on dipeptide values therefore the value for the dipeptide DG is DIWV['D']['G'].
# DIWV = {'A': {'A': 1.0, 'C': 44.94, 'E': 1.0, 'D': -7.49, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': -7.49, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0, # 'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0}, # 'C': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 20.26, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 33.60, # 'K': 1.0, 'M': 33.60, 'L': 20.26, 'N': 1.0, # 'Q': -6.54, 'P': 20.26, 'S': 1.0, 'R': 1.0, # 'T': 33.60, 'W': 24.68, 'V': -6.54, 'Y': 1.0}, # 'E': {'A': 1.0, 'C': 44.94, 'E': 33.60, 'D': 20.26, # 'G': 1.0, 'F': 1.0, 'I': 20.26, 'H': -6.54, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': 1.0, # 'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0}, # 'D': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, # 'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0, # 'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 1.0, 'P': 1.0, 'S': 20.26, 'R': -6.54, # 'T': -14.03, 'W': 1.0, 'V': 1.0, 'Y': 1.0}, # 'F': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 13.34, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0, # 'K': -14.03, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0, # 'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 33.601}, # 'I': {'A': 1.0, 'C': 1.0, 'E': 44.94, 'D': 1.0, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 13.34, # 'K': -7.49, 'M': 1.0, 'L': 20.26, 'N': 1.0, # 'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0, # 'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0}, # 'G': {'A': -7.49, 'C': 1.0, 'E': -6.54, 'D': 1.0, # 'G': 13.34, 'F': 1.0, 'I': -7.49, 'H': 1.0, # 'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': -7.49, # 'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0, # 'T': -7.49, 'W': 13.34, 'V': 1.0, 'Y': -7.49}, # 'H': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, # 'G': -9.37, 'F': -9.37, 'I': 44.94, 'H': 1.0, # 'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 24.68, # 'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0, # 'T': -6.54, 'W': -1.88, 'V': 1.0, 'Y': 44.94}, # 'K': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, # 'G': -7.49, 'F': 1.0, 'I': -7.49, 'H': 1.0, # 'K': 1.0, 'M': 33.60, 'L': -7.49, 'N': 1.0, # 
'Q': 24.64, 'P': -6.54, 'S': 1.0, 'R': 33.60, # 'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0}, # 'M': {'A': 13.34, 'C': 1.0, 'E': 1.0, 'D': 1.0, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 58.28, # 'K': 1.0, 'M': -1.88, 'L': 1.0, 'N': 1.0, # 'Q': -6.54, 'P': 44.94, 'S': 44.94, 'R': -6.54, # 'T': -1.88, 'W': 1.0, 'V': 1.0, 'Y': 24.68}, # 'L': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0, # 'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 33.60, 'P': 20.26, 'S': 1.0, 'R': 20.26, # 'T': 1.0, 'W': 24.68, 'V': 1.0, 'Y': 1.0}, # 'N': {'A': 1.0, 'C': -1.88, 'E': 1.0, 'D': 1.0, # 'G': -14.03, 'F': -14.03, 'I': 44.94, 'H': 1.0, # 'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': -6.54, 'P': -1.88, 'S': 1.0, 'R': 1.0, # 'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 1.0}, # 'Q': {'A': 1.0, 'C': -6.54, 'E': 20.26, 'D': 20.26, # 'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 1.0, # 'T': 1.0, 'W': 1.0, 'V': -6.54, 'Y': -6.54}, # 'P': {'A': 20.26, 'C': -6.54, 'E': 18.38, 'D': -6.54, # 'G': 1.0, 'F': 20.26, 'I': 1.0, 'H': 1.0, # 'K': 1.0, 'M': -6.54, 'L': 1.0, 'N': 1.0, # 'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': -6.54, # 'T': 1.0, 'W': -1.88, 'V': 20.26, 'Y': 1.0}, # 'S': {'A': 1.0, 'C': 33.60, 'E': 20.26, 'D': 1.0, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 20.26, 'P': 44.94, 'S': 20.26, 'R': 20.26, # 'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0}, # 'R': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 20.26, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 13.34, 'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 58.28, # 'T': 1.0, 'W': 58.28, 'V': 1.0, 'Y': -6.54}, # 'T': {'A': 1.0, 'C': 1.0, 'E': 20.26, 'D': 1.0, 'G': -7.49, 'F': 13.34, 'I': 1.0, 'H': 1.0, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': -14.03, 'Q': -6.54, 'P': 1.0, 'S': 1.0, 'R': 1.0, # 'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0}, # 'W': {'A': -14.03, 'C': 1.0, 'E': 1.0, 'D': 1.0, 
'G': -9.37, 'F': 1.0, 'I': 1.0, 'H': 24.68, # 'K': 1.0, 'M': 24.68, 'L': 13.34, 'N': 13.34, 'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0, # 'T': -14.03, 'W': 1.0, 'V': -7.49, 'Y': 1.0}, # 'V': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': -14.03, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 1.0, # 'K': -1.88, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0, # 'T': -7.49, 'W': 1.0, 'V': 1.0, 'Y': -6.54}, # 'Y': {'A': 24.68, 'C': 1.0, 'E': -6.54, 'D': 24.68, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 13.34, # 'K': 1.0, 'M': 44.94, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 13.34, 'S': 1.0, 'R': -15.91, # 'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 13.34}, # } # # score = 0.0 # for i in range(len(sequence) - 1): # if DIWV.has_key(sequence[i]): # if DIWV[sequence[i]].has_key(sequence[i+1]): # score += DIWV[sequence[i]][sequence[i+1]] # return (10.0 / len(sequence)) * score