def main():
    """Prompt for a protein FASTA file and report basic sequence parameters.

    Reads a file name from standard input, loads the sequence through
    ``read_fasta``, prints the molecular weight, the amino-acid counts and the
    isoelectric point, and writes the same three values (one per line) to
    ``Valgu_parameetrid.txt`` in the current working directory.
    """
    fasta = input()
    sequence = read_fasta(fasta)
    print(sequence)
    analysed_seq = ProteinAnalysis(str(sequence))

    # Compute every parameter once and reuse it for console and file output.
    mol_weight = analysed_seq.molecular_weight()
    aa_counts = analysed_seq.count_amino_acids()
    iso_point = analysed_seq.isoelectric_point()

    print("\n", "Molekulaarmass:", mol_weight)
    print("\n", "Aminohapete arv:", aa_counts)
    print("\n", "Isoelektriline punkt:", iso_point)

    # The context manager closes the file even if a write fails.
    with open("Valgu_parameetrid.txt", "w") as text_file:
        text_file.write(
            "\n".join([str(mol_weight), str(aa_counts), str(iso_point)]))
def protParam(seq):
    """Print a ProtParam-style report for the amino-acid sequence *seq*.

    The report contains the amino-acid counts and percentages, molecular
    weight, GRAVY, isoelectric point, aromaticity and an extinction
    coefficient computed from the W, Y and C counts. Nothing is returned.
    """
    params = ProteinAnalysis(seq)
    mw = params.molecular_weight()
    c_aa = params.count_amino_acids()
    p_aa = params.get_amino_acids_percent()
    gravy = params.gravy()
    aromaticity = params.aromaticity()
    isoelectric_point = params.isoelectric_point()
    # Per-residue contributions of Trp, Tyr and Cys as used by the original.
    ext_coeff = sum([c_aa["W"] * 5690, c_aa["Y"] * 1280, c_aa["C"] * 120])

    print("Amino acid count")
    pprint.pprint(c_aa)
    print("Amino acid percent")
    pprint.pprint(p_aa)
    print("Molecular weight")
    print("%f Da" % mw)
    print("Gravy")
    print(gravy)
    print("Isoelectric point")
    print(isoelectric_point)
    print("Aromaticity")
    print(aromaticity)
    print("Extinction coefficient: %d M-1cm-1 (Assuming reduced)" % ext_coeff)
    print("")
def get_protein_analysis(aa):
    """Return a flat feature list for the amino-acid sequence *aa*.

    The list holds, in order: molecular weight, aromaticity, instability
    index, isoelectric point, GRAVY, followed by the values returned by
    ``secondary_structure_fraction()``.
    """
    pa = ProteinAnalysis(aa)
    features = [
        pa.molecular_weight(),
        pa.aromaticity(),
        pa.instability_index(),
        pa.isoelectric_point(),
        pa.gravy(),
    ]
    features.extend(pa.secondary_structure_fraction())
    return features
def prot_feats_seq(seq):
    """Build a feature vector for one protein sequence.

    The vector is the concatenation of three L2-normalised parts: the
    percentages of the 20 standard amino acids, the output of
    ``twomerFromSeq`` and the output of ``threemerFromSeq``.

    Raises whatever ``ProteinAnalysis.molecular_weight`` raises for
    sequences it cannot handle (e.g. containing 'X'); callers skip those.
    """
    residues = 'ACDEFGHIKLMNPQRSTVWY'
    seq_str = str(seq)

    def _unit_row(values):
        # L2-normalise a 1-D vector and hand back the single resulting row.
        matrix = np.atleast_2d(np.array(values))
        return normalize(matrix, norm='l2', copy=True, axis=1,
                         return_norm=False)[0]

    analysis = ProteinAnalysis(seq_str)
    # Called only for its side effect: it throws if 'X' is in the sequence.
    analysis.molecular_weight()
    percent = analysis.get_amino_acids_percent()

    features = []
    features.extend(_unit_row([percent[res] for res in residues]))
    features.extend(_unit_row(twomerFromSeq(seq_str)))
    features.extend(_unit_row(threemerFromSeq(seq_str)))
    return np.array(features)
def properties(toxin_faa, antitoxin_faa, out):
    """Write pI, weight and instability index for every two-gene locus.

    Args:
        toxin_faa: path to the toxin protein FASTA file.
        antitoxin_faa: path to the antitoxin protein FASTA file.
        out: output prefix; results go to ``<out>.properties.txt``.

    Returns:
        The path of the tab-separated file that was written.

    Sequences with an internal stop ('*') or an unknown residue ('X') get 0
    for all three properties as a missing-data flag. Loci that do not end up
    with exactly two genes are left out of the output.
    """
    from collections import defaultdict
    from Bio import SeqIO

    # Build a dictionary of {locus: [{properties: values}, {...}]}
    loci = defaultdict(list)
    for f in [toxin_faa, antitoxin_faa]:
        # 'rU' was removed in Python 3.11; plain text mode already applies
        # universal newlines on Python 3.
        with open(f, 'r') as handle:
            for record in SeqIO.parse(handle, 'fasta'):
                locus, start = getNameAndPosition(record)
                if not start:
                    continue
                aaseq = str(record.seq).strip("*")
                # Omit sequences with missing positions or premature stops;
                # give them 0 as a flag for missing data instead.
                if "*" not in aaseq and "X" not in aaseq:
                    data = ProteinAnalysis(aaseq)
                    loci[locus].append({
                        'start': start,
                        'pI': data.isoelectric_point(),
                        'weight': data.molecular_weight(),
                        'instability': data.instability_index()
                    })
                else:
                    loci[locus].append({
                        'start': start,
                        'pI': 0,
                        'weight': 0,
                        'instability': 0
                    })

    # Order genes in a locus positionally
    loci = orderPairs(loci)

    # Write to output file
    outfile = ".".join([out, "properties", "txt"])
    with open(outfile, 'w') as o:
        header = "\t".join([
            "locus", "gene1_pI", "gene2_pI", "gene1_weight", "gene2_weight",
            "gene1_instability", "gene2_instability"
        ])
        o.write("#" + header.upper() + "\n")
        # items() works on both Python 2 and 3, unlike iteritems().
        for locus, gene in loci.items():
            if len(gene) != 2:
                continue
            line = map(str, [
                locus, gene[0]['pI'], gene[1]['pI'], gene[0]['weight'],
                gene[1]['weight'], gene[0]['instability'],
                gene[1]['instability']
            ])
            o.write("\t".join(line) + "\n")
    return outfile
def molecular_weight_printer(filename):
    """Print ``filename[25:29]`` and the summed weight of its polypeptides.

    The structure is obtained via ``structure(filename)``; every polypeptide
    found by ``PPBuilder`` contributes its molecular weight to the total.
    """
    builder = PPBuilder()
    total_weight = 0
    print(filename[25:29])
    for peptide in builder.build_peptides(structure(filename)):
        sequence_text = str(peptide.get_sequence())
        total_weight += ProteinAnalysis(sequence_text).molecular_weight()
    print(total_weight)
def checksize(peptide):
    """Tell whether *peptide* passes the size filter.

    Only peptides with a minimal length of 6 amino acids and a molecular
    weight between 400 and 6000 (inclusive) are accepted. The weight is
    computed on the output of ``seq_weight_corrections(peptide)``.
    """
    if len(peptide) < 6:
        return False
    corrected = seq_weight_corrections(peptide)
    mol_weight = ProteinAnalysis(corrected).molecular_weight()
    if 400 <= mol_weight <= 6000:
        return True
    return False
def get_params(fasta_file, out_file):
    """Write a CSV of id, molecular weight and pI for each FASTA record.

    'X' residues are removed from a sequence before it is analysed. The
    output starts with the header line ``UniprotID,MW,pI``.
    """
    with open(out_file, "w") as out:
        out.write("UniprotID,MW,pI\n")
        with open(fasta_file, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                cleaned = str(record.seq).replace("X", "")
                analysed_seq = ProteinAnalysis(cleaned)
                fields = [
                    record.id,
                    str(analysed_seq.molecular_weight()),
                    str(analysed_seq.isoelectric_point()),
                ]
                out.write(",".join(fields) + "\n")
def molecular_weight(filename):
    """Return the summed molecular weight of all polypeptides in a structure.

    The structure is obtained via ``structure(filename)`` and split into
    polypeptides with ``PPBuilder``. ``filename[25:29]`` can be returned
    alongside the weight if the PDB identifier is wanted as well.
    """
    builder = PPBuilder()
    peptides = builder.build_peptides(structure(filename))
    return sum(
        ProteinAnalysis(str(peptide.get_sequence())).molecular_weight()
        for peptide in peptides
    )
def prot_feats(filename):
    """Compute feature vectors for every usable record of a FASTA file.

    Args:
        filename: path (or handle) accepted by ``SeqIO.parse``.

    Returns:
        A tuple ``(XX, ids)``: ``XX`` is an array with one row per kept
        sequence (as produced by ``prot_feats_seq``) and ``ids`` the matching
        record identifiers, in file order.

    Records for which the molecular weight or the features cannot be
    computed (e.g. 'X' in the sequence) are skipped.
    """
    XX = []
    ids = []
    for rec in SeqIO.parse(filename, "fasta"):
        seq_str = str(rec.seq)
        X = ProteinAnalysis(seq_str)
        try:
            # Throws an error if 'X' is in the sequence; such sequences are
            # skipped.
            X.molecular_weight()
            f = list(prot_feats_seq(seq_str))
        except Exception:
            # Best-effort skip, but no longer swallows KeyboardInterrupt or
            # SystemExit the way a bare ``except`` did.
            continue
        # Keep features and ids in lockstep; previously the features were
        # computed but never stored, so XX always came back empty.
        XX.append(f)
        ids.append(rec.id)
    XX = np.array(XX)
    return XX, ids
def test_molecular_weight(self):
    "Test Lantipeptide.molecular_weight"
    core_sequence = "MAGICHAT"
    lant = Lantipeptide(23, 42, 17, 'Class-I')
    lant.core = core_sequence

    # Reference weight: average-mass weight of the core, with the Thr
    # assumed to be dehydrated.
    expected = ProteinAnalysis(core_sequence,
                               monoisotopic=False).molecular_weight()
    expected -= 18.02

    self.assertAlmostEqual(expected, lant.molecular_weight)
    self.assertAlmostEqual(expected, lant._weight)

    # Once _weight is set, the property must report it unchanged.
    lant._weight = 42
    self.assertEqual(42, lant.molecular_weight)
def biopython_protein_analysis(inseq):
    """Utilize Biopython's ProteinAnalysis module to return general sequence
    properties of an amino acid string.

    For full definitions see:
    http://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam.ProteinAnalysis-class.html

    Args:
        inseq: Amino acid sequence

    Returns:
        dict: Dictionary of sequence properties. Some definitions include:
        instability_index: Any value above 40 means the protein is unstable
            (has a short half life).
        secondary_structure_fraction: Percentage of protein in helix, turn
            or sheet

    TODO: Finish definitions of dictionary
    """
    inseq = ssbio.protein.sequence.utils.cast_to_str(inseq)
    analysed_seq = ProteinAnalysis(inseq)

    info_dict = {}
    info_dict['amino_acids_content-biop'] = analysed_seq.count_amino_acids()
    info_dict['amino_acids_percent-biop'] = analysed_seq.get_amino_acids_percent()
    info_dict['length-biop'] = analysed_seq.length
    info_dict['monoisotopic-biop'] = analysed_seq.monoisotopic
    info_dict['molecular_weight-biop'] = analysed_seq.molecular_weight()
    info_dict['aromaticity-biop'] = analysed_seq.aromaticity()
    info_dict['instability_index-biop'] = analysed_seq.instability_index()
    # TODO: What is flexibility?
    info_dict['flexibility-biop'] = analysed_seq.flexibility()
    info_dict['isoelectric_point-biop'] = analysed_seq.isoelectric_point()
    # grand average of hydrophobicity
    info_dict['gravy-biop'] = analysed_seq.gravy()

    # The secondary structure fraction is computed once and split into its
    # three components instead of being recomputed for every key.
    fractions = analysed_seq.secondary_structure_fraction()
    info_dict['percent_helix_naive-biop'] = fractions[0]
    info_dict['percent_turn_naive-biop'] = fractions[1]
    info_dict['percent_strand_naive-biop'] = fractions[2]

    return info_dict
def find_composition(df_original):
    """Append per-sequence composition and physico-chemical columns.

    Args:
        df_original: DataFrame with a ``seq`` column of protein sequences.
            It is not modified.

    Returns:
        A new DataFrame: a copy of the input followed by the feature
        columns (amino-acid percentages for the whole sequence, its first
        ``first_n`` and its last ``last_n`` residues, plus length, weight,
        GRAVY, flexibility mean/std, secondary-structure fractions,
        isoelectric point and aromaticity).
    """
    df_copy = df_original.copy()

    column_names = []
    for ch in codes:
        column_names.extend(
            [ch + '_percent', ch + '_percent_first', ch + '_percent_last'])
    column_names.extend([
        'len', 'weight', 'gravy', 'flex_mean', 'flex_std', 'ss_helix',
        'ss_turn', 'ss_sheet', 'iep', 'aromaticity'
    ])

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for seq in tqdm(df_copy['seq']):
        sequence = str(seq)
        analysed = ProteinAnalysis(sequence)
        analysed_first = ProteinAnalysis(sequence[:first_n])
        analysed_last = ProteinAnalysis(sequence[-last_n:])

        row = {}
        row['len'] = analysed.length
        (row['ss_helix'], row['ss_turn'],
         row['ss_sheet']) = analysed.secondary_structure_fraction()
        row['iep'] = analysed.isoelectric_point()

        # overall
        for aa, percent in analysed.get_amino_acids_percent().items():
            row[aa + '_percent'] = percent
        # first N
        for aa, percent in analysed_first.get_amino_acids_percent().items():
            row[aa + '_percent_first'] = percent
        # last N
        for aa, percent in analysed_last.get_amino_acids_percent().items():
            row[aa + '_percent_last'] = percent

        row['weight'] = analysed.molecular_weight()
        row['gravy'] = analysed.gravy()
        row['aromaticity'] = analysed.aromaticity()
        flexibility = analysed.flexibility()  # computed once, used twice
        row['flex_mean'] = np.mean(flexibility)
        row['flex_std'] = np.std(flexibility)
        rows.append(row)

    # Reuse the input index so the column-wise concat lines rows up even
    # when df_original does not carry a default RangeIndex.
    df = pd.DataFrame(rows, index=df_copy.index)
    # Declared columns first, then any additional keys in order of
    # appearance (same layout the row-by-row append produced).
    extra_columns = [c for c in df.columns if c not in column_names]
    df = df.reindex(columns=column_names + extra_columns)

    return pd.concat([df_copy, df], axis=1)
def analysis(listofaas, outlist):
    """Append ``[weight, instability index, pI]`` per protein to *outlist*.

    Args:
        listofaas: iterable of amino-acid sequence strings.
        outlist: list that receives one three-element list per sequence
            that could be analysed; it is modified in place.

    Returns:
        The number of sequences skipped because the analysis raised
        ``ValueError`` or ``KeyError``. (The counter used to be reset on
        every iteration and was never reported.)
    """
    skipped = 0
    for prot in listofaas:
        try:
            p = ProteinAnalysis(prot)
            templist = [
                p.molecular_weight(),
                p.instability_index(),
                p.isoelectric_point(),
            ]
        except (ValueError, KeyError):
            skipped += 1
        else:
            # Only complete triples are recorded.
            outlist.append(templist)
    return skipped
def molecular_weight(self):
    """Return the molecular weight, estimating unknown residues.

    Overrides the base class. When the sequence contains residues listed
    in ``UNKNOWNS``, the weight of the known residues is computed and
    scaled up by ``len(sequence) / len(known residues)``, i.e. every
    unknown residue is given the average weight of the known ones.

    Raises:
        ZeroDivisionError: if the sequence consists only of unknown
            residues (unchanged from the previous behaviour).
    """
    if not self._contains_unknown:
        return super(Protein, self).molecular_weight()

    known = ''.join(aa for aa in self._sequence if aa not in UNKNOWNS)
    mw = ProteinAnalysis(known).molecular_weight()
    # float() keeps the ratio fractional even under Python 2, where the
    # division of two ints would truncate.
    mw *= float(len(self._sequence)) / len(known)
    return mw
def phyChemProps(seq):
    """Return ten physico-chemical descriptors of *seq* as a list.

    Order: aromaticity, the three secondary-structure fractions, GRAVY,
    instability index, isoelectric point, molecular weight and the two
    values of ``molar_extinction_coefficient()``.
    """
    analysis = ProteinAnalysis(seq)
    aromaticity = analysis.aromaticity()
    fractions = analysis.secondary_structure_fraction()
    gravy = analysis.gravy()
    instability = analysis.instability_index()
    iso_point = analysis.isoelectric_point()
    weight = analysis.molecular_weight()
    extinction = analysis.molar_extinction_coefficient()
    return [
        aromaticity,
        fractions[0],
        fractions[1],
        fractions[2],
        gravy,
        instability,
        iso_point,
        weight,
        extinction[0],
        extinction[1],
    ]
def pep_param(pep):
    """Return ``[weight, pI, *amino-acid counts]`` for the peptide *pep*.

    The isoelectric point is reported as the string ``'na'`` when *pep* is
    empty.
    """
    analysis = ProteinAnalysis(pep)
    params = [analysis.molecular_weight()]
    if len(pep) > 0:
        params.append(analysis.isoelectric_point())
    else:
        params.append('na')
    params.extend(analysis.count_amino_acids().values())
    return params
def molecular_weight(fastas):
    """Encode each sequence as its min-max scaled monoisotopic weight.

    Args:
        fastas: iterable of ``(name, sequence)`` pairs; '-' characters are
            removed from the sequence before analysis.

    Returns:
        A list whose first element is the header ``["Weight"]`` followed by
        one single-element list per input sequence.
    """
    # Fixed bounds used for the min-max scaling.
    lower_bound = 513.222346
    upper_bound = 9577.017286

    encodings = [["Weight"]]
    for entry in fastas:
        name, sequence = entry[0], re.sub('-', '', entry[1])
        analysed_seq = ProteinAnalysis(sequence)
        analysed_seq.monoisotopic = True
        weight = analysed_seq.molecular_weight()
        scaled = (weight - lower_bound) / (upper_bound - lower_bound)
        encodings.append([scaled])
    return encodings
def properties(toxin_faa, antitoxin_faa, out):
    """Write pI, weight and instability index for every two-gene locus.

    Args:
        toxin_faa: path to the toxin protein FASTA file.
        antitoxin_faa: path to the antitoxin protein FASTA file.
        out: output prefix; results go to ``<out>.properties.txt``.

    Returns:
        The path of the tab-separated file that was written.

    Sequences with an internal stop ('*') or an unknown residue ('X') get 0
    for all three properties as a missing-data flag. Loci that do not end up
    with exactly two genes are left out of the output.
    """
    from collections import defaultdict
    from Bio import SeqIO

    # Build a dictionary of {locus: [{properties: values}, {...}]}
    loci = defaultdict(list)
    for f in [toxin_faa, antitoxin_faa]:
        # 'rU' was removed in Python 3.11; plain text mode already applies
        # universal newlines on Python 3.
        with open(f, 'r') as handle:
            for record in SeqIO.parse(handle, 'fasta'):
                locus, start = getNameAndPosition(record)
                if not start:
                    continue
                aaseq = str(record.seq).strip("*")
                # Omit sequences with missing positions or premature stops;
                # give them 0 as a flag for missing data instead.
                if "*" not in aaseq and "X" not in aaseq:
                    data = ProteinAnalysis(aaseq)
                    loci[locus].append({
                        'start': start,
                        'pI': data.isoelectric_point(),
                        'weight': data.molecular_weight(),
                        'instability': data.instability_index()
                    })
                else:
                    loci[locus].append({
                        'start': start,
                        'pI': 0,
                        'weight': 0,
                        'instability': 0
                    })

    # Order genes in a locus positionally
    loci = orderPairs(loci)

    # Write to output file
    outfile = ".".join([out, "properties", "txt"])
    with open(outfile, 'w') as o:
        header = "\t".join(["locus",
                            "gene1_pI", "gene2_pI",
                            "gene1_weight", "gene2_weight",
                            "gene1_instability", "gene2_instability"])
        o.write("#" + header.upper() + "\n")
        # items() works on both Python 2 and 3, unlike iteritems().
        for locus, gene in loci.items():
            if len(gene) != 2:
                continue
            line = map(str, [
                locus, gene[0]['pI'], gene[1]['pI'],
                gene[0]['weight'], gene[1]['weight'],
                gene[0]['instability'], gene[1]['instability']])
            o.write("\t".join(line) + "\n")
    return outfile
def compute_molecular_weight(aa_seq):
    """Calculate the molecular weight of an amino acid sequence.

    Parameters:
        aa_seq : the amino acid sequence (must be a string object)

    Return:
        the molecular weight reported by ``ProteinAnalysis`` for *aa_seq*

    Example of usage:
        >> MolecularWeight = compute_molecular_weight(aa_seq)
        >> print(MolecularWeight)
    """
    return ProteinAnalysis(aa_seq).molecular_weight()
def bio_feat(record):
    """Return a numeric feature vector for one sequence record.

    The sequence is cleaned first: 'X' is dropped, 'U' becomes 'C', 'B'
    becomes 'N' and 'Z' becomes 'Q'. The resulting array contains, in
    order: cleaned length, molecular weight, isoelectric point, instability
    index, the secondary-structure fractions, the amino-acid counts and the
    amino-acid percentages.

    Args:
        record: object exposing the sequence as ``record.seq``.
    """
    # Work on a plain string; the former MutableSeq round-trips added
    # nothing and relied on the deprecated ``MutableSeq.toseq()``.
    clean_seq = str(record.seq)
    for old, new in (("X", ""), ("U", "C"), ("B", "N"), ("Z", "Q")):
        clean_seq = clean_seq.replace(old, new)

    # features
    seq_length = len(clean_seq)
    analysed_seq = ProteinAnalysis(clean_seq)
    molecular_weight = analysed_seq.molecular_weight()
    amino_percent = analysed_seq.get_amino_acids_percent().values()
    isoelectric_points = analysed_seq.isoelectric_point()
    count = analysed_seq.count_amino_acids().values()
    instability_index = analysed_seq.instability_index()
    secondary_structure_fraction = analysed_seq.secondary_structure_fraction()

    return np.array(
        [seq_length, molecular_weight, isoelectric_points, instability_index]
        + list(secondary_structure_fraction)
        + list(count)
        + list(amino_percent))
def biochemical_properties(sequence: str) -> Dict[str, Any]:
    """Collect biochemical descriptors of *sequence* in a dictionary.

    Keys: 'Isoelectric point', 'Molecular weight', 'Aromaticity',
    'Instability index', 'GRAVY', 'H-bonding percent', 'Melting temp' and
    'LCC'.
    """
    # Objects used for the calculations. The descriptor and Seq objects are
    # constructed exactly as before although nothing below reads them.
    analysis_object = ProteinAnalysis(sequence)
    descriptor_object = PyPro.GetProDes(sequence)
    sequence_object = Seq(sequence)

    # TODO(Ahmed): Verify that all these calculations are actually returning
    # reasonable values. For example, it says the percent composition of
    # every amino acid is zero when I run
    # calculate_biochem_properties.biochemical_properties('qwertyipasdfghklcvnm')
    result = {}
    result['Isoelectric point'] = analysis_object.isoelectric_point()
    # Daltons? Amu? g/mol?
    result['Molecular weight'] = analysis_object.molecular_weight()
    result['Aromaticity'] = analysis_object.aromaticity()
    result['Instability index'] = analysis_object.instability_index()
    result['GRAVY'] = analysis_object.gravy()
    result['H-bonding percent'] = h_bonding_percent(sequence)
    result['Melting temp'] = melting_temp(sequence)
    result['LCC'] = lcc.lcc_simp(sequence)
    return result
def amino_acid_analysis(self):
    """Add residue fractions and sequence properties to ``self.df``.

    For every residue in RESIDUES a ``fraction_<res>`` column is added,
    followed by a ``length`` column. Then, row by row, the isoelectric
    point (``IEP``) is stored; ``molecular_weight`` is only filled in for
    sequences without 'X' or 'B', and ``gravy`` only for sequences without
    'U', 'X' or 'B'.
    """
    for res in RESIDUES:
        self.df["fraction_" + res] = (
            self.df["sequence"].str.count(res) / self.df["sequence"].str.len()
        )
    self.df["length"] = self.df["sequence"].str.len()

    for index, row in tqdm(self.df.iterrows(), total=self.df.shape[0]):
        seq = row["sequence"]
        seqanalysis = ProteinAnalysis(seq)
        # The per-row amino-acid percentage dict used to be computed here
        # and thrown away; the fractions above already cover it.
        self.df.loc[index, "IEP"] = seqanalysis.isoelectric_point()
        if "X" not in seq and "B" not in seq:
            self.df.loc[index, "molecular_weight"] = seqanalysis.molecular_weight()
            if "U" not in seq:
                self.df.loc[index, "gravy"] = seqanalysis.gravy()
def __init__(self, sequence):
    """Compute and store the descriptor set for *sequence*.

    Part of the values come from Biopython's ``ProteinAnalysis`` and from
    module-level ``calculate_*`` helpers, the rest from the R package
    ``Peptides`` accessed through ``r``.
    """
    self.sequence = sequence
    self.sequence_length = len(sequence)

    analysis = ProteinAnalysis(sequence)
    self.amino_acid_percents = analysis.get_amino_acids_percent()
    self.amino_acids_composition = calculate_amino_acids_composition(sequence)
    self.aromaticity = analysis.aromaticity()
    self.instability = analysis.instability_index()
    self.flexibility = calculate_flexibility(sequence)

    # (display name, scale dictionary) pairs handed to the scale calculator.
    scale_specs = (
        ('Hydrophilicity', hw),
        ('Surface accessibility', em),
        ('Janin Interior to surface transfer energy scale', ja),
        ('Bulkiness', bulkiness),
        ('Polarity', polarity),
        ('Buried residues', buried_residues),
        ('Average area buried', average_area_buried),
        ('Retention time', retention_time),
    )
    scale_parameters = [{'name': label, 'dictionary': table}
                        for label, table in scale_specs]
    self.protein_scales = calculate_protein_scales(analysis, scale_parameters)

    self.isoelectric_point = analysis.isoelectric_point()
    self.secondary_structure_fraction = calculate_secondary_structure_fraction(analysis)
    self.molecular_weight = analysis.molecular_weight()
    self.kyte_plot = analysis.gravy()
    self.pefing = calculate_pefing(sequence)

    # next parameters are calculated using R.Peptides
    r('require(Peptides)')
    r('sequence = "{0}"'.format(sequence))
    self.aliphatic_index = r('aindex(sequence)')[0]
    self.boman_index = r('boman(sequence)')[0]
    self.charges = calculate_charges(sequence, 1.0, 14.0, 0.5, 'Lehninger')
    # NOTE(review): `seq(sequence)` is an unusual expression for a
    # hydrophobicity value - confirm this is the intended R call.
    self.hydrophobicity = r('seq(sequence)')[0]

    # (structure name, angle) pairs used for the hydrophobic moments and
    # the peptide type assignment.
    angle_specs = (
        ('Alpha-helix', -47),
        ('3-10-helix', -26),
        ('Pi-helix', -80),
        ('Omega', 180),
        ('Antiparallel beta-sheet', 135),
        ('Parallel beta-sheet', 113),
    )
    angles = [{'name': label, 'angle': value} for label, value in angle_specs]
    pro_gly_share = (self.amino_acid_percents['P']
                     + self.amino_acid_percents['G'])
    if pro_gly_share > 0.3:
        angles.append({'name': 'Polygly-polypro helix', 'angle': 153})

    self.hydrophobic_moments = calculate_hydrophobic_moments(sequence, angles)
    self.kidera_factors = calculate_kidera_factors(sequence)
    self.peptide_types = calculate_peptide_types(sequence, angles)
def protein_analysis():
    """Controller: collect a protein sequence and compute its parameters.

    Users without a session username are redirected to the log-in page.
    On an accepted form submission the cleaned, upper-cased sequence and
    its ProteinAnalysis results are stored in ``session`` and the request
    is redirected to ``protein_analysis_output``; otherwise the form is
    returned for rendering.
    """
    # Identity comparison is the idiomatic (and __eq__-proof) None test.
    if session.username is None:
        redirect(URL(r=request, f='../account/log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis

    form = FORM(TABLE(
        TR("Amino acid sequence: ",
           TEXTAREA(_type="text", _name="sequence",
                    requires=IS_NOT_EMPTY())),
        INPUT(_type="submit", _value="SUBMIT")))

    if form.accepts(request.vars, session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
def analyze_proteins(cdss):
    """Attach molecular weight and isoelectric point to every CDS.

    Args:
        cdss: iterable of dicts with at least the keys ``sequence``,
            ``contig``, ``start``, ``stop``, ``strand`` and ``frame``.

    Each dict gets a ``seq_stats`` OrderedDict with the keys
    ``molecular_weight`` and ``isoelectric_point``. A value that cannot be
    computed is logged as a warning and stored as NaN.
    """
    for cds in cdss:
        seq = ProteinAnalysis(cds['sequence'])
        seq_stats = OrderedDict()
        computations = (
            ('molecular_weight', seq.molecular_weight,
             'could not calc molecular weight! contig=%s, start=%i, stop=%i, strand=%s, frame=%s'),
            ('isoelectric_point', seq.isoelectric_point,
             'could not calc isoelectric point! contig=%s, start=%i, stop=%i, strand=%s, frame=%s'),
        )
        for key, compute, message in computations:
            try:
                seq_stats[key] = compute()
            except Exception:
                # Still best-effort, but KeyboardInterrupt/SystemExit are no
                # longer swallowed as they were by the bare ``except``.
                log.warning(message, cds['contig'], cds['start'],
                            cds['stop'], cds['strand'], cds['frame'])
                seq_stats[key] = float('nan')
        cds['seq_stats'] = seq_stats
def get_features(seq):
    """get global features from a protein sequence

    Parameters
    ----------
    seq : str
        protein sequence

    Return
    ----------
    dictionary:
        global features of the protein sequence
    """
    # Symbols counted as undefined and stripped before the Biopython
    # analysis.
    undefined = ['X', 'B', 'Z', "'", 'O', 'U']

    features = {}
    features['undefined_count'] = sum(1 for x in seq if x in undefined)
    features['length'] = len(seq)
    features['perc_undefined_count'] = (
        features['undefined_count'] / features['length'])
    features['entropy'] = entropy(seq)
    features['ideal_entropy'] = entropy_ideal(len(seq))
    features['perc_entropy'] = features['entropy'] / features['ideal_entropy']
    features['hydr_count'] = len([x for x in seq if x in hydrophobic_proteins])
    features['polar_count'] = len([x for x in seq if x in polar_proteins])
    features['buried'] = sum(
        buried[x] for x in seq if x in hydrophobic_proteins)

    defined_only = ''.join(x for x in seq if x not in undefined)
    protein = ProteinAnalysis(defined_only)
    features['gravy'] = protein.gravy()
    features['molecular_weight'] = protein.molecular_weight()
    features['aromaticity'] = protein.aromaticity()
    features['instability_index'] = protein.instability_index()
    features['isoelectric_point'] = protein.isoelectric_point()
    helix, turn, sheet = protein.secondary_structure_fraction()
    features['helix'] = helix
    features['turn'] = turn
    features['sheet'] = sheet
    features.update(protein.count_amino_acids())
    return features
def GetFeatures(My_seq):
    """Return a dictionary of ProteinAnalysis features for *My_seq*.

    Single-valued entries: ``Molecular_weight``, ``Aromaticity``,
    ``Instability_index`` and ``Isoelectric_point``. Multi-valued entries:
    ``Flexibility`` (list), ``Second_structure_fraction`` (3-tuple),
    ``Count_amino_acids`` and ``Amino_acids_percent`` (dicts).
    """
    Features = {}
    # A single analysis object is enough; a second one used to be built
    # and immediately discarded.
    analysed_seq = ProteinAnalysis(My_seq)

    # Single-valued features
    Features["Molecular_weight"] = analysed_seq.molecular_weight()
    Features["Aromaticity"] = analysed_seq.aromaticity()
    Features["Instability_index"] = analysed_seq.instability_index()
    Features["Isoelectric_point"] = analysed_seq.isoelectric_point()

    # Multi-valued features
    Features["Flexibility"] = analysed_seq.flexibility()  # list
    Features["Second_structure_fraction"] = analysed_seq.secondary_structure_fraction()  # 3-tuple
    Features["Count_amino_acids"] = analysed_seq.count_amino_acids()  # dict
    Features["Amino_acids_percent"] = analysed_seq.get_amino_acids_percent()  # dict
    return Features
def seqs_to_features(self, seqs, no_seqs):
    """ Extract the features from the sequences.

    *seqs* is an iterable of sequence collections that are chained
    together; *no_seqs* is the total number of sequences and sets the
    number of rows of the returned ``(no_seqs, 32)`` array. Columns 0-22
    hold letter frequencies, the last nine columns hold the global
    properties.
    """
    alphabet = 'ABCDEFGHIKLMNPQRSTUVWXY'  # no JOZ
    features = np.zeros((no_seqs, 32))

    for row, s in enumerate(chain(*seqs)):  # iterate over all sequences
        # amino acid frequencies
        for col, letter in enumerate(alphabet):
            features[row, col] = s.count(letter) / len(s)

        # other analysis; X, B and U are replaced by A beforehand
        cleaned = s.replace('X', 'A').replace('B', 'A').replace('U', 'A')
        analysis = ProteinAnalysis(cleaned)
        weight = analysis.molecular_weight()
        aromaticity = analysis.aromaticity()
        instability = analysis.instability_index()
        iso_point = analysis.isoelectric_point()
        fractions = analysis.secondary_structure_fraction()
        gravy = analysis.gravy()  # mean hydrophobicity

        # Fill the nine trailing columns (-9 .. -1) in one assignment.
        features[row, -9:] = [
            gravy,
            len(s),
            fractions[2],
            fractions[1],
            fractions[0],
            iso_point,
            instability,
            aromaticity,
            weight,
        ]
    return features
def __init__(self, sequence):
    """Compute per-residue mean properties of *sequence*.

    ``self.extended`` is set to True when the sequence contains any key of
    ``ProtSeqProp.trans_dict`` (ambiguous residues); in that case no
    property is computed and ``self.prop`` stays empty.
    """
    self.seq = sequence
    self.prop = dict()
    # True if the sequence contains ambiguous residues.
    self.extended = any(res in sequence for res in ProtSeqProp.trans_dict)

    if not self.extended:
        PA_analysis = ProteinAnalysis(sequence)
        residues = list(sequence)

        # Mean residue weight, shifted and scaled by fixed constants.
        self.prop['mean_molecular_weight'] = (
            (PA_analysis.molecular_weight() / len(self.seq))
            - 136.90020499999997) / 30.081960849078222
        # (This entry used to be computed twice in a row.)
        self.prop['mean_surface_accessibility'] = np.mean(
            [ProtSeqProp.em[res] for res in residues])
        self.prop['mean_kd_hydrophobicity'] = np.mean(
            [ProtSeqProp.kd[res] for res in residues])
        self.prop['mean_flexibility'] = np.mean(
            [ProtSeqProp.Flex[res] for res in residues])
        self.prop['mean_hydrophilicity'] = np.mean(
            [ProtSeqProp.hw[res] for res in residues])
        self.prop['mean_ja'] = np.mean(
            [ProtSeqProp.ja[res] for res in residues])
        # Mean DIWV value over all adjacent residue pairs.
        self.prop['instability'] = np.mean([
            ProtSeqProp.DIWV[x][y]
            for x, y in zip(residues[:-1], residues[1:])
        ])
def get_coord_array(path, file_name):
    '''
    Function: get coord array of all atoms in a pdb file
    :param path: the path of pdb file of all proteins
    :param file_name: the file name of ****.pdb
    :return: atom coord array [[x0,y0,z0], [x1,y1,z1], ... , [xn,yn,zn]]
           : summed charge (at pH 7.4) of the polypeptides
           : summed mass of the polypeptides
    '''
    structure_id = file_name.split('.')[0]
    parser = PDBParser(PERMISSIVE=1)
    structure = parser.get_structure(structure_id, path + file_name)

    # Accumulate mass and charge over the polypeptides that PPBuilder
    # extracts from the structure.
    mass = 0.0
    charge = 0.0
    for polypep in PPBuilder().build_peptides(structure):
        analyzer = ProteinAnalysis(polypep.get_sequence())
        mass += analyzer.molecular_weight()
        charge += analyzer.charge_at_pH(7.4)

    # Coordinates of every atom of every model, in traversal order.
    atom_coord_array = np.array([
        atom.get_coord()
        for model in structure
        for chain in model
        for residue in chain
        for atom in residue
    ])
    return atom_coord_array, charge, mass
def physchem_props(ara_d):
    """Calculate the physicochemical properties per protein in ara_d.

    Args:
        ara_d: dict mapping a protein name to a dict with at least the key
            ``sequence`` (and ``pos`` for sequences containing '*').

    Returns:
        The same dict; every analysed protein gains a ``Properties`` dict
        with the keys ``mol_weight``, ``gravy``, ``aromaticity``,
        ``instab_index``, ``flexi``, ``iso_point`` and ``seq_struct``.

    Sequences containing 'X' or '*' are skipped; a '*' sequence whose
    ``pos`` list is not empty has its name printed.
    """
    keys = [
        "mol_weight",
        "gravy",
        "aromaticity",
        "instab_index",
        "flexi",
        "iso_point",
        "seq_struct",
    ]
    for protein in ara_d:
        seq = ara_d[protein]["sequence"]
        if "X" in seq:
            continue
        # Skip non-usable sequences, only negs
        if '*' in seq:
            if ara_d[protein]["pos"] != []:
                print(protein)
            continue

        a_seq = ProteinAnalysis(seq)
        results = [
            a_seq.molecular_weight(),
            a_seq.gravy(),
            a_seq.aromaticity(),
            a_seq.instability_index(),
            a_seq.flexibility(),
            a_seq.isoelectric_point(),
            a_seq.secondary_structure_fraction(),
        ]
        # Update ara_d with the new physchem properties
        ara_d[protein]["Properties"] = dict(zip(keys, results))
    return ara_d
def on_enter(self, *args):
    """Update the weight and protein name widgets when entering the screen.

    Reads the header-less sequence from ``no_header_sequence.txt`` to show
    the molecular weight in kDa, then reads ``my_blast.xml`` and shows the
    definition line of the first BLAST hit. If the XML file is empty the
    failure message is shown instead.
    """
    # Read the sequence; the context manager closes the handle.
    with open("no_header_sequence.txt") as sequence_file:
        noHeader = sequence_file.read()
    print("noHeader: ", noHeader)

    analysed_seq = ProteinAnalysis(noHeader)
    Mw = analysed_seq.molecular_weight()  # Mw g/mol
    Mw_kDa = round(Mw / 1000, 3)  # Mw kDa
    print(analysed_seq.count_amino_acids())  # count for each amino acid
    # updates protein weight in kDa on the screen
    self.weight.text = str(Mw_kDa) + " kDa"

    if os.stat('my_blast.xml').st_size == 0:
        # No BLAST output was produced: show the message instead of failing
        # later on an unset title.
        self.protname.text = "BLAST search failed.\nCheck your FASTA file and try again."
        return

    with open("my_blast.xml") as result_handle:
        blast_record = NCBIXML.read(result_handle)

    # Only the first alignment that has at least one HSP is used.
    for alignment in blast_record.alignments:
        if not alignment.hsps:
            continue
        sequence_identity = alignment.hit_def
        print("hit_def:", alignment.hit_def)
        reduced_title = sequence_identity.split('>')[0]
        print(reduced_title)
        # updates sequence identity on the app screen
        self.protname.text = reduced_title
        break
def parse_nuc_sequence(self, n_seq, id=None, desc=None):
    """
    Parses a nucleotide sequence, translates it, calculates GC content and
    several values available from ProteinAnalysis() in the BioPython module,
    and appends the results to the per-sequence lists of this object.

    Keyword arguments:
    n_seq -- valid nucleotide string sequence
    id -- id obtained from FASTA file record (default None)
    desc -- description obtained from FASTA file record (default None)

    Any exception is reported on stdout and swallowed. All values are
    computed before anything is stored, so a failure in the translation or
    analysis no longer leaves the parallel lists with different lengths.
    """
    try:
        # translate nucleotide string sequence
        p_seq = self.translate_nucleotides(n_seq)
        # GC content
        gc_content = self.calculate_gc_content(n_seq)
        # protein analysis methods
        analysis = ProteinAnalysis(p_seq)
        amino_acid_percent = analysis.get_amino_acids_percent()
        molecular_weight = analysis.molecular_weight()
        instability_index = analysis.instability_index()
        aromaticity = analysis.aromaticity()

        # store metadata and results together
        self.id.append(id)
        self.description.append(desc)
        self.nucleotide_sequence.append(n_seq)
        self.protein_sequence.append(p_seq)
        self.gc_content.append(gc_content)
        self.amino_acid_dict.append(amino_acid_percent)
        self.molecular_weight.append(molecular_weight)
        self.instability_index.append(instability_index)
        self.aromaticity.append(aromaticity)
    except Exception as e:
        print('-'*80)
        print(f"Exception in parsing uploaded virus sequence: {e}")
        traceback.print_exc(file=sys.stdout)
        print('-'*80)
def biopython_proteinanalysis_seq(seq, scaling=False):
    """Return a flat dict of ProteinAnalysis descriptors for *seq*.

    Contains flexibility statistics, GRAVY, instability index, isoelectric
    point, both molar extinction coefficients, molecular weight, the naive
    secondary-structure fractions, one ``percent:<aa>`` entry per amino
    acid and one ``prop_res_<key>`` entry per group in
    ``property_residues``. *scaling* is accepted but not used.
    """
    analysis = ProteinAnalysis(seq)
    out = {}

    flexibility = np.array(analysis.flexibility())
    out['flex:min'] = flexibility.min()
    out['flex:max'] = flexibility.max()
    out['flex:std'] = flexibility.std()

    out['gravy'] = analysis.gravy()
    out['instability_index'] = analysis.instability_index()
    out['isoelectric_point'] = analysis.isoelectric_point()

    reduced, cysteines = analysis.molar_extinction_coefficient()
    out['molar_extinction_coefficient_reduced'] = reduced
    out['molar_extinction_coefficient_cysteines'] = cysteines

    out['molecular_weight'] = analysis.molecular_weight()

    helix, turn, strand = analysis.secondary_structure_fraction()
    out['percent_helix_naive'] = helix
    out['percent_turn_naive'] = turn
    out['percent_strand_naive'] = strand

    percents = analysis.get_amino_acids_percent()
    for aa in sorted(percents.keys()):
        out['percent:%s' % aa] = percents[aa]

    # Summed percentage of the residues belonging to each property group.
    for key, members in property_residues.items():
        out['prop_res_%s' % key] = sum([percents.get(x, 0) for x in members])
    return out
def _numbered_sequence_div(sequence):
    """Render *sequence* as a monospace DIV with position numbers."""
    seq_div = DIV(_style='font-family:monospace', _class='raw-sequence')
    spacer = len(str(len(sequence))) + 1
    for i, pos in enumerate(sequence):
        if i == 0:
            # Padding must be non-breaking, otherwise HTML collapses it.
            seq_div.append(XML((str(i+1)+' ').rjust(spacer).replace(' ', '&nbsp;')))
        if i % 10 == 0 and i != 0:
            seq_div.append(' ')
        if i % 60 == 0 and i != 0:
            seq_div.append(XML((str(i)).ljust(spacer).replace(' ', '&nbsp;')))
            seq_div.append(BR())
            seq_div.append(XML((str(i+1)+' ').rjust(spacer).replace(' ', '&nbsp;')))
        seq_div.append(SPAN(pos, _class='seq-position', _title=i+1))
    return seq_div


def draw_sequence(sequence, mode = 'simple', alphabet = None):
    """Render a sequence as HTML helper objects.

    Args:
        sequence: the sequence to draw.
        mode: ``'simple'`` returns only the numbered sequence DIV;
            ``'protparams'`` returns a DIV holding the numbered sequence
            and a table with length, MW and pI. Any other value returns
            None.
        alphabet: accepted for interface compatibility; not used.

    MW and pI rows are silently left out when ProteinAnalysis raises
    ``KeyError`` for the sequence.
    """
    if mode == 'protparams':
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        returndiv = DIV()
        returndiv.append(_numbered_sequence_div(sequence))
        returndiv.append(H3('Protein Parameters'))
        params_table = TABLE(_style= "width:200px;")
        protpar = ProteinAnalysis(sequence)
        params_table.append(TR(SPAN('Length:', _class='line-header'),
                               '%i aa' % len(sequence)))
        try:
            params_table.append(TR(
                SPAN('MW:', _class='line-header'),
                '%i KDa' % round(protpar.molecular_weight()/1000, 0)))
        except KeyError:
            pass
        try:
            params_table.append(TR(
                SPAN('pI:', _class='line-header'),
                '%1.2f' % protpar.isoelectric_point()))
        except KeyError:
            pass
        returndiv.append(params_table)
        return returndiv
    if mode == 'simple':
        return _numbered_sequence_div(sequence)
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
from Bio import SeqIO

# Print the ProteinAnalysis report of every record in the sample FASTA file.
with open('../../samples/pdbaa') as fh:
    for rec in SeqIO.parse(fh, 'fasta'):
        myprot = ProteinAnalysis(str(rec.seq))
        # Evaluated lazily and in this order, one printed line per value.
        reports = (
            myprot.count_amino_acids,
            myprot.get_amino_acids_percent,
            myprot.molecular_weight,
            myprot.aromaticity,
            myprot.instability_index,
            myprot.flexibility,
            myprot.isoelectric_point,
            myprot.secondary_structure_fraction,
            lambda: myprot.protein_scale(ProtParamData.kd, 9, .4),
        )
        for report in reports:
            print(report())
#!/usr/bin/env python
# Reads FASTA records from stdin and writes one tab-separated line of
# protein properties per record to stdout.
import sys

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# The seventh column holds the aromaticity; it used to be mislabelled
# "monoisotpoic" in the header.
sys.stdout.write("ID\tMW\tIP\tgravy\tlength\tinstability\taromaticity\tSequence\n")
for record in SeqIO.parse(sys.stdin, "fasta"):
    a = ProteinAnalysis(str(record.seq))
    properties = [
        record.id,
        a.molecular_weight(),
        a.isoelectric_point(),
        a.gravy(),
        a.length,
        a.instability_index(),
        a.aromaticity(),
        # always last column to make the output more readable
        a.sequence,
    ]
    sys.stdout.write('\t'.join(map(str, properties)) + "\n")
def main(argv):
    """Annotate FASTA entries with a sampled 'intensity' and write them out.

    For every entry yielded by ``nextEntry(infile)`` the ``[# ... #]`` meta
    block of the header is rebuilt: any existing element containing
    "intensity" is dropped and a fresh ``intensity=<value>`` drawn via
    ``sampleAbundance(mu, sigma)`` is appended.  When Biopython is
    importable, entries can additionally be filtered by molecular weight.
    Optionally only a subset of the entries (first N, or N random ones) is
    written.

    Parameters:
        argv: kept for interface compatibility.  It is not consulted; the
            command line is read by ``parser.parse_args()`` from ``sys.argv``.

    Raises:
        RuntimeError: when run on a Python older than 2.7.
    """
    ## we use ArgumentParser, which requires 2.7
    if sys.version_info < (2, 7):
        # Raising a plain string is itself a TypeError; use a real exception.
        raise RuntimeError("This script requires python 2.7 or greater")

    ## add weight filtering functionality if BioPython is available
    try:
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        has_biopython = True
    except ImportError:
        has_biopython = False

    parser = argparse.ArgumentParser(description='Add abundance to FASTA files.')
    parser.add_argument('infile', type=argparse.FileType('r'), help='Input FASTA file')
    parser.add_argument('outfile', type=argparse.FileType('w'), help='Output FASTA file')
    parser.add_argument('--mu', dest='mu', action='store', default=3,
                        help='mean of gaussian in log space')
    parser.add_argument('--sigma', dest='sigma', action='store', default=1,
                        help='sd of gaussian in log space')
    parser.add_argument('--sample', dest='sample', action='store', default=0,
                        help='Number of entries to keep (for sampling a bigger FASTA file)')
    parser.add_argument('--random', dest='random', action='store_true',
                        help='Randomly shuffle entries before sampling (only if --sample is given). If not given, the first \'X\' samples are used.')
    if has_biopython:
        parser.add_argument('--weight_low', dest='weight_low', action='store', default=0,
                            help='minimum molecular weight of protein')
        parser.add_argument('--weight_up', dest='weight_up', action='store', default=0,
                            help='Maximum molecular weight of protein (use 0 for unlimited)')
    else:
        # Single-argument call form prints identically on Python 2 and 3.
        print("Warning: protein weight filtering not supported, as BioPython module is not installed.")

    ## argument parsing
    args = parser.parse_args()
    fileobj = args.infile
    fileoutobj = args.outfile
    sample_size = int(args.sample)
    sample_random = bool(args.random)
    mu = float(args.mu)
    sigma = float(args.sigma)
    if has_biopython:
        weight_low = float(args.weight_low)
        weight_up = float(args.weight_up)
        if weight_up <= 0:
            weight_up = sys.float_info.max

    # Matches the "[# ... #]" meta block of a header; compiled once.
    meta_pattern = re.compile(r"\[# *(.*) *#\]")

    ## BioPython does not like some AA letters - they need replacement:
    ##   "U" (Selenocystein)                -> "C" (Cystein)
    ##   "X" (unknown)                      -> "P" (Proline) [arbitrary choice -
    ##        but weight of 115 is very close to averagine]
    ##   "B" (Asparagine or aspartic acid)  -> "N" (Asparagine)
    ##   "Z" (Glutamine or glutamic acid)   -> "Q" (Glutamine)
    ##   "J" (Leucine or Isoleucine)        -> "L" (Leucine)
    replacements = (("U", "C"), ("X", "P"), ("B", "N"), ("Z", "Q"), ("J", "L"))

    ## list of final entries
    fasta_entries = []

    for entry in nextEntry(fileobj):
        header = entry.header
        ## check if it contains 'intensity'?
        m = meta_pattern.search(header)
        if m:
            header_new = header.replace(m.group(0), "")  ## delete meta
            # keep every meta element except a previous intensity
            other = [element for element in m.group(1).split(',')
                     if "intensity" not in element]
        else:
            header_new = header  ## nothing to replace
            other = []

        ## create new metainfo array
        other.append("intensity=" + str(sampleAbundance(mu, sigma)))
        entry.header = header_new.rstrip() + "[# " + ", ".join(other) + " #]"

        if has_biopython:
            sequence = "".join(entry.sequence.split("\n"))
            for ambiguous, standard in replacements:
                sequence = sequence.replace(ambiguous, standard)
            weight = ProteinAnalysis(sequence).molecular_weight()
            if not (weight_low <= weight <= weight_up):
                continue

        fasta_entries.append(entry.header + "\n" + entry.sequence)

        ## only read to sample size (the rest is thrown away anyways)
        if sample_size > 0 and not sample_random:
            if len(fasta_entries) >= sample_size:
                break

    ## select subset (if required)
    if sample_size > 0:
        # A real list is required: random.shuffle cannot shuffle a range
        # object on Python 3.
        indices = list(range(len(fasta_entries)))
        ## random sampling only makes sense if we take a subset
        if sample_random and sample_size < len(fasta_entries):
            random.shuffle(indices)
            indices = indices[:sample_size]
        fasta_entries = [fasta_entries[i] for i in indices]

    ## write to file
    for entry in fasta_entries:
        fileoutobj.write(entry)
# Collect the molecular weights of small ribosomal proteins from a FASTA file
# and list them in `text_out`.
# NOTE(review): this snippet was recovered from a single collapsed line and is
# cut off at the end; the nesting below is a best-effort reconstruction and
# should be confirmed against the original file.
data_mwt = []  # molecular weights of the accepted records, as '%.2f' strings
y_axis = []  # one constant 1 per accepted record
x_axis = data_mwt  # same list object as data_mwt (an alias, not a copy)
for record in SeqIO.parse(seq_file, "fasta"):
    temp_seq=str(record.seq)
    analysis_seq=ProteinAnalysis(temp_seq)
    # Keep only records whose description mentions a ribosomal protein or
    # subunit (case-sensitive substring match).
    if ("ribosomal protein" in record.description or "ribosomal subunit" in record.description):
    #if ("ribosomal protein" in record.description or "ribosomal subunit" in record.description or "Ribosomal" in record.description):
        # ... and whose molecular weight is below 20000.
        if (analysis_seq.molecular_weight() < 20000):
            data_mwt.append('%.2f'%(analysis_seq.molecular_weight()))
            y_axis.append(1)
            # presumably text_out is a Qt text widget - TODO confirm
            text_out.setTextColor(QColor('blue'))
            # Appended line: running count, description, weight, pI (comma-separated).
            text_out.append(str(len(data_mwt)) + "," + record.description + "," + '%.2f'%(analysis_seq.molecular_weight()) + "," + '%.2f'%(analysis_seq.isoelectric_point()))
            #new=sorted(data_mwt)
            #data_mwt.append(list(zip(['%.2f'%(analysis_seq.molecular_weight())])))
            #print(record.description + " = " + '%.2f'%(analysis_seq.molecular_weight()))
            # NOTE(review): `output` is defined outside this snippet; the code
            # that uses csv_write is not visible here.
            csv_write = csv.writer(output)
            #row_wise = zip([record.description],['%.2f'%(analysis_seq.molecular_weight())],['%.2f'%(analysis_seq.isoelectric_point())])
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
import sys
import json

# Compute selected protein parameters for one sequence and print them as JSON.
#
# Input: the first command-line argument is a JSON object with
#   "Sequence": the protein sequence, and
#   "Options":  a collection of the parameter names to compute
#               ("MW", "EC280", "hasDisulfide", "PI", "AACont").
# Output: one JSON object on stdout holding the requested values.
inp = json.loads(sys.argv[1])
seq = inp["Sequence"]
X = ProteinAnalysis(seq)
data = dict()
options = inp["Options"]

if "MW" in options:
    data["MW"] = X.molecular_weight()

if "EC280" in options:
    aa_count = X.count_amino_acids()
    # Extinction coefficient at 280 nm from the Tyr and Trp counts ...
    ec280 = 1490 * aa_count["Y"] + 5500 * aa_count["W"]
    if "hasDisulfide" in options:
        # ... plus 62.5 per cysteine when disulfide bonds are requested.
        ec280 = ec280 + 62.5 * aa_count["C"]
    data["EC280"] = ec280

if "PI" in options:
    data["PI"] = X.isoelectric_point()

if "AACont" in options:
    ratios = X.get_amino_acids_percent()
    # Scale every value returned by get_amino_acids_percent() by 100.
    data["AACont"] = {aa: ratios[aa] * 100. for aa in ratios}

# Call form instead of the Python 2 print statement: works on Python 2 and 3.
print(json.dumps(data))