def get_aa_count_dict(seq):
    """Return the count of each of the 20 standard amino acids in *seq*.

    Parameters
    ----------
    seq : str
        Protein sequence in one-letter code.

    Returns
    -------
    dict
        Maps each one-letter code (alphabetical order, 'A' .. 'Y') to the
        number of occurrences reported by ``ProteinAnalysis``.
    """
    # Count once; the original recounted the whole sequence for every key.
    counts = ProteinAnalysis(seq).count_amino_acids()
    return {aa: counts[aa] for aa in 'ACDEFGHIKLMNPQRSTVWY'}
def main():
    """Ask for a protein FASTA file and report parameters of its sequence.

    Reads a file name from standard input, loads the sequence with
    ``read_fasta``, prints the molecular weight, amino acid counts and
    isoelectric point, and writes the same three values (one per line) to
    ``Valgu_parameetrid.txt``.
    """
    # Program that asks for a protein FASTA file and gives its parameters.
    fasta = input()
    sequence = read_fasta(fasta)
    print(sequence)
    analysed_seq = ProteinAnalysis(str(sequence))

    molecular_weight = analysed_seq.molecular_weight()
    print("\n", "Molekulaarmass:", molecular_weight)
    aa_counts = analysed_seq.count_amino_acids()
    print("\n", "Aminohapete arv:", aa_counts)
    isoelectric_point = analysed_seq.isoelectric_point()
    print("\n", "Isoelektriline punkt:", isoelectric_point)

    # The context manager closes the file even if a write fails.
    with open("Valgu_parameetrid.txt", "w") as text_file:
        text_file.write(str(molecular_weight))
        text_file.write("\n")
        text_file.write(str(aa_counts))
        text_file.write("\n")
        text_file.write(str(isoelectric_point))
def protparams(aa_seq, vstarts, vstops):
    """Compute parameters for sub-peptides of *aa_seq*.

    The parameters help assess the potential of each peptide as a
    crystallization candidate.  One entry is produced for every
    (start, stop) combination with ``start < stop``, iterating starts in the
    outer loop and stops in the inner loop.

    Parameters
    ----------
    aa_seq : str or Seq
        Full amino acid sequence.
    vstarts, vstops : iterable
        Candidate slice start / stop positions (anything ``int()`` accepts).

    Returns
    -------
    tuple of three lists
        ``(MWs, pIs, epsilons)``: molecular weights in kDa (1 decimal),
        isoelectric points (1 decimal) and extinction coefficient divided by
        the molecular weight in Dalton (2 decimals).
    """
    MWs = []
    pIs = []
    epsilons = []
    # Convert the boundaries once instead of for every pair.
    starts = [int(start) for start in vstarts]
    stops = [int(stop) for stop in vstops]
    for start in starts:
        for stop in stops:
            if start >= stop:
                continue
            params = PA(aa_seq[start:stop])  # works with string or Seq objects
            mw_dalton = params.molecular_weight()
            MW = round(mw_dalton / 1000, 1)  # in kiloDalton, 1 decimal
            pI = round(params.isoelectric_point(), 1)
            # Formula from protparam (web.expasy.org/protparam):
            # Epsilon (Prot) = N(Tyr)*Ext(Tyr) + N(Trp)*Ext(Trp)
            #                  + N(Cystine)*Ext(Cystine) / MW in Dalton
            # NOTE(review): the cysteine count is used where the formula
            # names cystines (disulfide pairs) -- confirm this is intended.
            aa_dict = params.count_amino_acids()  # {'aa': count}
            extinction = (aa_dict['Y'] * 1490 + aa_dict['W'] * 5500 +
                          aa_dict['C'] * 125)
            # Divide by the exact mass; the original divided by the value
            # already rounded to 0.1 kDa, which skewed the result.
            epsilon = round(extinction / mw_dalton, 2)
            MWs.append(MW)
            pIs.append(pI)
            epsilons.append(epsilon)
    return MWs, pIs, epsilons
def aa_composition(seq):
    """Return the amino acid counts of *seq* as a list of text fields.

    Each field has the form ``'<letter>:\\t<count>'``.  The list ends with one
    empty string, matching the historical output of splitting a
    comma-terminated line.
    """
    counts = ProteinAnalysis(seq).count_amino_acids()
    # Field order is part of the output format; keep it exactly.
    order = 'ACEDGFIHKMLNQPSRTWVY'
    fields = ['%s:\t%i' % (letter, counts[letter]) for letter in order]
    fields.append('')
    return fields
def aa_comp_calc():
    """Write per-record amino acid frequencies to ``<output>/aa_comp.tsv``.

    Reads the sequences in ``args.input`` (format ``args.in_format``) and
    writes one tab-separated row per record: the record id followed by the
    fraction of each amino acid.  The denominator is the sequence length
    after removing gap ('-'), unknown ('X') and stop ('*') characters.
    Records with no countable residues get a row of zeros.
    """
    peptides = [
        'A', 'G', 'P', 'S', 'T', 'C', 'F', 'W', 'Y', 'H', 'R', 'K', 'M', 'I',
        'L', 'V', 'N', 'D', 'E', 'Q'
    ]

    os.makedirs(args.output, exist_ok=True)

    with open(args.input, 'r') as infile, \
            open(f'{args.output}/aa_comp.tsv', 'w') as outfile:
        outfile.write('Taxon\t' + '\t'.join(peptides) + '\n')
        for record in SeqIO.parse(infile, format=args.in_format):
            outfile.write(f'{record.id}\t')
            sequence = str(record.seq)
            count_dict = ProteinAnalysis(sequence).count_amino_acids()
            length = len(
                sequence.replace("-", "").replace("X", "").replace("*", ""))

            fields = []
            for pep in peptides:
                # length == 0 (gap-only record) used to raise
                # ZeroDivisionError; report zero frequency instead.
                if length and pep in count_dict:
                    fields.append(f'{float(count_dict[pep]) / length}')
                else:
                    fields.append('0')
            outfile.write('\t'.join(fields) + '\n')
def protein_properties(seq):
    """Return a ``ProtProp`` with some protein biochemical properties.

    *seq* is a Bio.Seq.Seq or str representing a protein sequence.  The
    instability index and GRAVY are set to ``None`` when their computation
    raises ``KeyError``.
    """
    analysis = ProteinAnalysis(seq)

    def _or_none(compute):
        # Run compute(); map a KeyError to None.
        try:
            return compute()
        except KeyError:
            return None

    counts = analysis.count_amino_acids()
    aromaticity = analysis.aromaticity()
    isoelectric_point = analysis.isoelectric_point()
    instability = _or_none(analysis.instability_index)
    gravy = _or_none(analysis.gravy)

    return ProtProp(aa=str(seq),
                    gravy=gravy,
                    aromaticity=aromaticity,
                    isoelectric_point=isoelectric_point,
                    instability=instability,
                    aa_counts=counts)
def protParam(seq):
    """Print a summary of physico-chemical parameters for protein *seq*.

    Prints amino acid counts and percentages, molecular weight, GRAVY,
    isoelectric point, aromaticity and an extinction coefficient computed
    from the W, Y and C counts.  All values are computed before anything is
    printed.  Returns ``None``.
    """
    params = ProteinAnalysis(seq)
    mw = params.molecular_weight()
    c_aa = params.count_amino_acids()
    p_aa = params.get_amino_acids_percent()
    gravy = params.gravy()
    aromaticity = params.aromaticity()
    isoelectric_point = params.isoelectric_point()
    # Per-residue molar extinction contributions for Trp, Tyr and Cys.
    ext_coeff = c_aa["W"] * 5690 + c_aa["Y"] * 1280 + c_aa["C"] * 120
    # The unused ``mgml`` value of the original was dropped.

    print("Amino acid count")
    pprint.pprint(c_aa)
    print("Amino acid percent")
    pprint.pprint(p_aa)
    print("Molecular weight")
    print("%f Da" % mw)
    print("Gravy")
    print(gravy)
    print("Isoelectric point")
    print(isoelectric_point)
    print("Aromaticity")
    print(aromaticity)
    print("Extinction coefficient: %d M-1cm-1 (Assuming reduced)" % ext_coeff)
    print("")
def protParam(seq):
    """Print a summary of physico-chemical parameters for protein *seq*.

    Output, in order: amino acid counts, amino acid percentages, molecular
    weight in Da, GRAVY, isoelectric point, aromaticity and the extinction
    coefficient derived from the W, Y and C counts.  Returns ``None``.
    """
    params = ProteinAnalysis(seq)
    mw = params.molecular_weight()
    c_aa = params.count_amino_acids()
    p_aa = params.get_amino_acids_percent()
    gravy = params.gravy()
    aromaticity = params.aromaticity()
    isoelectric_point = params.isoelectric_point()
    # Per-residue molar extinction contributions for Trp, Tyr and Cys.
    # (The original also computed an ``mgml`` value that was never used.)
    ext_coeff = c_aa["W"] * 5690 + c_aa["Y"] * 1280 + c_aa["C"] * 120

    print("Amino acid count")
    pprint.pprint(c_aa)
    print("Amino acid percent")
    pprint.pprint(p_aa)
    print("Molecular weight")
    print("%f Da" % mw)
    print("Gravy")
    print(gravy)
    print("Isoelectric point")
    print(isoelectric_point)
    print("Aromaticity")
    print(aromaticity)
    print("Extinction coefficient: %d M-1cm-1 (Assuming reduced)" % ext_coeff)
    print("")
def protein_analysis():
    """Show the sequence form and store the analysis results in the session.

    Redirects to the account log-in action when ``session.username`` is not
    set.  When the form is accepted, the cleaned upper-case sequence is
    analysed with Biopython's ``ProteinAnalysis``, each result is stored
    under its own session key, and the request is redirected to
    ``protein_analysis_output``.  Otherwise returns ``dict(form=form)``.
    """
    if session.username is None:
        redirect(URL(r=request, c='account', f='log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    form = FORM(
        TABLE(
            TR(
                "Amino acid sequence:  ",
                TEXTAREA(_type="text",
                         _name="sequence",
                         requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))
    if form.accepts(request.vars, session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
def get_protein_features(seq):
    """Return ``(names, values)`` describing global features of *seq*.

    The sequence is first passed through ``correct``.  ``values`` holds, in
    the order given by ``names``: length, molecular weight, isoelectric
    point, D+E count, K+R count, two extinction coefficients (without and
    with the cysteine term), instability index, hydrophobicity and the
    entries of the secondary structure fraction.
    """
    seq = correct(seq)
    analysis = ProteinAnalysis(seq)
    weight = molecular_weight(seq)
    isoelectric = analysis.isoelectric_point()
    counts = analysis.count_amino_acids()

    negative = counts['D'] + counts['E']
    positive = counts['K'] + counts['R']
    # Aromatic contribution only, then the same plus the cysteine term.
    ext_aromatic = counts['Y'] * 1490 + counts['W'] * 5500
    ext_with_cys = counts['Y'] * 1490 + counts['W'] * 5500 + counts['C'] * 125

    instability = instability_index(seq)
    gravy = hydrophobicity(seq)
    structure_fractions = list(analysis.secondary_structure_fraction())

    names = [
        'length', 'weight', 'pI', 'neg_charged_residues',
        'pos_charged_residues', 'extinction_coeff1', 'extinction_coeff2',
        'instability_index', 'gravy', 'helix', 'turn', 'sheet'
    ]
    values = [
        len(seq), weight, isoelectric, negative, positive, ext_aromatic,
        ext_with_cys, instability, gravy
    ]
    values.extend(structure_fractions)
    return names, values
def aminoacid(self):
    """Translate this object's feature and count its amino acids.

    Returns a tuple ``(counts, values)``: the count dictionary from
    ``ProteinAnalysis`` and a list of its values in dictionary order.
    """
    protein_seq = self.feature().translate()
    counts = ProteinAnalysis(str(protein_seq)).count_amino_acids()
    return counts, list(counts.values())
def processFile(iterator, output):
    """Write the amino acid counts of every record in *iterator* to *output*.

    For each record the nucleotide sequence is padded to a multiple of three,
    translated, and its amino acid counts (passed through
    ``formatAminoAcids``) are written one per line.  The same counts are then
    derived a second time via the transcribed RNA sequence.

    Parameters
    ----------
    iterator : iterable of SeqRecord-like objects
        Each record must expose ``seq``, ``name``, ``description``,
        ``annotations`` and ``id``.
    output : writable text file object
    """
    for record in iterator:
        # Get the sequence from the record.
        thisSeq = record.seq

        # Avoid the Biopython warning about incomplete codons: append N
        # (wildcard) until the length is divisible by 3, as recommended in
        # the warning message.  Seq + str keeps working across Biopython
        # versions, unlike rebuilding with the removed ``Seq.alphabet``.
        leftoverNucCount = len(thisSeq) % 3
        if leftoverNucCount != 0:
            thisSeq = thisSeq + "N" * (3 - leftoverNucCount)

        # Translate the record as a new amino acid / peptide sequence.
        translated_sequence = thisSeq.translate()

        # Wrap it in a ProteinAnalysis object to get the counting methods.
        analyzed_sequence = ProteinAnalysis(str(translated_sequence))

        output.write("Name: {0}\nDescription: {1}\nAnnotations: {2}".format(
            record.name, record.description, record.annotations))

        aminoAcidCountDictionary = formatAminoAcids(
            analyzed_sequence.count_amino_acids())

        output.write("\n\nThis is the amino acid count of record {0}:".format(
            record.id) + "\n\n")

        # One amino acid per line.
        for aminoAcid, count in aminoAcidCountDictionary.items():
            output.write(aminoAcid + ": " + str(count) + "\n")

        # Turn the sequence into RNA.  RNA differs from DNA in only one
        # nucleotide, so the amino acid count derived from it should equal
        # the one derived from the DNA.
        thisSeqRNA = thisSeq.transcribe()
        analyzed_RNAsequence = ProteinAnalysis(str(thisSeqRNA.translate()))

        output.write(
            "\n\nThis is the amino acid count of the protein sequence derived from the RNA resulting from "
            "the DNA sequence in the file. \n"
            "It should be the same as the previous amino acid count: \n\n"
        )

        rnaAcidCountDictionary = formatAminoAcids(
            analyzed_RNAsequence.count_amino_acids())

        for aminoAcid, count in rnaAcidCountDictionary.items():
            output.write(aminoAcid + ": " + str(count) + "\n")

        output.write('\n**************************************\n\n')
def getMF(subSeq):
    """Return the molecular formula of peptide *subSeq* as a string.

    The atoms of the free amino acids are summed, then one water (2 H, 1 O)
    is removed per peptide bond.  The result has the form
    ``"C<n>H<n>N<n>O<n>S<n>"``.  Only the 20 standard residues contribute.
    An empty sequence yields ``"C0H0N0O0S0"``.
    """
    # Atom composition of each free amino acid.
    dictOfAmino = {
        'A': {'C': 3, 'H': 7, 'N': 1, 'O': 2, 'S': 0},
        'R': {'C': 6, 'H': 14, 'N': 4, 'O': 2, 'S': 0},
        'N': {'C': 4, 'H': 8, 'N': 2, 'O': 3, 'S': 0},
        'D': {'C': 4, 'H': 7, 'N': 1, 'O': 4, 'S': 0},
        'C': {'C': 3, 'H': 7, 'N': 1, 'O': 2, 'S': 1},
        'Q': {'C': 5, 'H': 10, 'N': 2, 'O': 3, 'S': 0},
        'E': {'C': 5, 'H': 9, 'N': 1, 'O': 4, 'S': 0},
        'G': {'C': 2, 'H': 5, 'N': 1, 'O': 2, 'S': 0},
        'H': {'C': 6, 'H': 9, 'N': 3, 'O': 2, 'S': 0},
        'I': {'C': 6, 'H': 13, 'N': 1, 'O': 2, 'S': 0},
        'L': {'C': 6, 'H': 13, 'N': 1, 'O': 2, 'S': 0},
        'K': {'C': 6, 'H': 14, 'N': 2, 'O': 2, 'S': 0},
        'M': {'C': 5, 'H': 11, 'N': 1, 'O': 2, 'S': 1},
        'F': {'C': 9, 'H': 11, 'N': 1, 'O': 2, 'S': 0},
        'P': {'C': 5, 'H': 9, 'N': 1, 'O': 2, 'S': 0},
        'S': {'C': 3, 'H': 7, 'N': 1, 'O': 3, 'S': 0},
        'T': {'C': 4, 'H': 9, 'N': 1, 'O': 3, 'S': 0},
        'W': {'C': 11, 'H': 12, 'N': 2, 'O': 2, 'S': 0},
        'Y': {'C': 9, 'H': 11, 'N': 1, 'O': 3, 'S': 0},
        'V': {'C': 5, 'H': 11, 'N': 1, 'O': 2, 'S': 0},
    }

    counts = ProteinAnalysis(subSeq).count_amino_acids()

    totals = {'C': 0, 'H': 0, 'N': 0, 'O': 0, 'S': 0}
    residues = 0
    for amino, n in counts.items():
        atoms = dictOfAmino.get(amino)
        if atoms is None:
            continue
        residues += n
        for element in totals:
            totals[element] += n * atoms[element]

    # Correct totals for the water lost per peptide bond.  Clamp at zero so
    # an empty sequence does not gain a water molecule.
    peptideBonds = max(residues - 1, 0)
    totals['H'] -= peptideBonds * 2
    totals['O'] -= peptideBonds

    return ("C" + str(totals['C']) + "H" + str(totals['H']) +
            "N" + str(totals['N']) + "O" + str(totals['O']) +
            "S" + str(totals['S']))
def net_charge(seq):
    """Get net charge of a peptide sequence.

    Counts each K and R as +1 and each D and E as -1 and returns the sum.
    """
    # Index the counts directly: dict.iteritems() does not exist on Python 3.
    counts = ProteinAnalysis(seq).count_amino_acids()
    acidic = sum(counts.get(aa, 0) for aa in ('D', 'E'))
    basic = sum(counts.get(aa, 0) for aa in ('K', 'R'))
    return basic - acidic
def convert_to_aac(dataset):
    """Build an amino acid composition matrix for *dataset*.

    Returns an array of shape ``(len(dataset), 20)``.  Row *r* holds the
    counts of record *r* (in the order ``count_amino_acids()`` yields them),
    each divided by ``len(record)``.
    """
    X_aac = np.zeros((len(dataset), 20))
    for row, record in enumerate(dataset):
        counts = ProteinAnalysis(str(record.seq)).count_amino_acids()
        for col, amount in enumerate(counts.values()):
            X_aac[row][col] = amount / len(record)
    return X_aac
def aa_frequency(outfile):
    """Print pooled amino acid counts and percentages for a FASTA file.

    All sequences in *outfile* are concatenated and analysed as one
    sequence.  Returns ``None``.
    """
    # The context manager closes the handle, and join avoids rebuilding the
    # growing string for every record.
    with open(outfile) as handle:
        all_seq = "".join(
            str(record.seq) for record in SeqIO.parse(handle, 'fasta'))
    y = ProteinAnalysis(all_seq)
    print("all_seq_n", y.count_amino_acids())
    print("all_seq_%", y.get_amino_acids_percent())
def biopython_protein_analysis(inseq):
    """Utilize Biopython's ProteinAnalysis module to return general sequence
    properties of an amino acid string.

    For full definitions see:
    http://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam.ProteinAnalysis-class.html

    Args:
        inseq: Amino acid sequence

    Returns:
        dict: Dictionary of sequence properties. Some definitions include:
        instability_index: Any value above 40 means the protein is unstable
            (has a short half life).
        secondary_structure_fraction: Percentage of protein in helix, turn
            or sheet

    TODO:
        Finish definitions of dictionary

    """
    inseq = ssbio.protein.sequence.utils.cast_to_str(inseq)

    analysed_seq = ProteinAnalysis(inseq)

    info_dict = {}
    info_dict['amino_acids_content-biop'] = analysed_seq.count_amino_acids()
    info_dict['amino_acids_percent-biop'] = \
        analysed_seq.get_amino_acids_percent()
    info_dict['length-biop'] = analysed_seq.length
    info_dict['monoisotopic-biop'] = analysed_seq.monoisotopic
    info_dict['molecular_weight-biop'] = analysed_seq.molecular_weight()
    info_dict['aromaticity-biop'] = analysed_seq.aromaticity()
    info_dict['instability_index-biop'] = analysed_seq.instability_index()
    # TODO: What is flexibility?
    info_dict['flexibility-biop'] = analysed_seq.flexibility()
    info_dict['isoelectric_point-biop'] = analysed_seq.isoelectric_point()

    # grand average of hydrophobicity
    info_dict['gravy-biop'] = analysed_seq.gravy()

    # Separate secondary_structure_fraction into its three components.
    # Compute it once instead of once per component.
    fractions = analysed_seq.secondary_structure_fraction()
    info_dict['percent_helix_naive-biop'] = fractions[0]
    info_dict['percent_turn_naive-biop'] = fractions[1]
    info_dict['percent_strand_naive-biop'] = fractions[2]

    return info_dict
def pep_param(pep):
    """Return ``[molecular weight, pI, *amino acid counts]`` for *pep*.

    The isoelectric point is replaced by the string ``'na'`` when *pep* is
    empty.
    """
    analysis = ProteinAnalysis(pep)
    params = [analysis.molecular_weight()]
    params.append(analysis.isoelectric_point() if len(pep) > 0 else 'na')
    params.extend(analysis.count_amino_acids().values())
    return params
def get_AAfraction(seq, amino_acids=None):
    """Get fraction of given amino acids in a sequence.

    Parameters
    ----------
    seq : str
        Protein sequence.
    amino_acids : collection of str, optional
        One-letter codes to count.  Defaults to the nonpolar set
        A, V, L, F, I, W, P.

    Returns
    -------
    int or float
        ``0`` when none of the residues occur, otherwise the fraction
        rounded to two decimals.
    """
    X = ProteinAnalysis(seq)
    nonpolar = ['A', 'V', 'L', 'F', 'I', 'W', 'P']
    if amino_acids is None:
        amino_acids = nonpolar
    # .items() works on Python 2 and 3; .iteritems() is Python 2 only.
    count = sum(n for aa, n in X.count_amino_acids().items()
                if aa in amino_acids)
    if count == 0:
        return 0
    return round(float(count) / len(seq), 2)
def getMW_mono(subSeq):
    """Return the monoisotopic mass of peptide *subSeq*.

    Sums the monoisotopic residue masses of the 20 standard amino acids
    found in the sequence and adds one water for the free termini.
    """
    # Monoisotopic mass of H2O.  The original added the average mass
    # (18.015), which mixes mass scales with the residue table below.
    waterLoss = 18.01056
    # Monoisotopic residue mass for each amino acid.
    dictOfAmino = {'A': 71.03711, 'R': 156.10111, 'N': 114.04293,
                   'D': 115.02694, 'C': 103.00919, 'Q': 128.05858,
                   'E': 129.04259, 'G': 57.02146, 'H': 137.05891,
                   'I': 113.08406, 'L': 113.08406, 'K': 128.09496,
                   'M': 131.04049, 'F': 147.06841, 'P': 97.05276,
                   'S': 87.03203, 'T': 101.04768, 'W': 186.07931,
                   'Y': 163.06333, 'V': 99.06841}
    counts = ProteinAnalysis(subSeq).count_amino_acids()
    molecularWeight = 0.0
    for amino, n in counts.items():
        if amino in dictOfAmino:
            molecularWeight = molecularWeight + (n * dictOfAmino[amino])
    return molecularWeight + waterLoss
def getMW_average(subSeq):
    """Return the average molecular weight of peptide *subSeq*.

    Sums the average residue masses of the 20 standard amino acids found in
    the sequence and adds 18.015 for one water.
    """
    water = 18.015
    # Average residue mass for each amino acid.
    residue_mass = {'A': 71.0788, 'R': 156.1875, 'N': 114.1038,
                    'D': 115.0886, 'C': 103.1388, 'Q': 128.1307,
                    'E': 129.1155, 'G': 57.0519, 'H': 137.1411,
                    'I': 113.1594, 'L': 113.1594, 'K': 128.1741,
                    'M': 131.1926, 'F': 147.1766, 'P': 97.1167,
                    'S': 87.0782, 'T': 101.1051, 'W': 186.2132,
                    'Y': 163.1760, 'V': 99.1326}
    counts = ProteinAnalysis(subSeq).count_amino_acids()
    total = 0.0
    for letter, amount in counts.items():
        if letter in residue_mass:
            total = total + (amount * residue_mass[letter])
    total = total + water
    return total
def aa_composition(self, seqs):
    """Return the pooled amino acid composition of *seqs*.

    Every record is passed through ``self.f`` and converted to ``str``
    before counting.  The result maps each amino acid to its share of all
    characters seen, rounded to three decimals.  When no characters were
    seen at all, a uniform dictionary with value 1 for each of the 20
    standard amino acids is returned.
    """
    totals = collections.defaultdict(int)
    seen = 0
    for rec in seqs:
        text = str(self.f(rec))
        seen += len(text)
        for letter, amount in ProteinAnalysis(text).count_amino_acids().items():
            totals[letter] += amount

    if seen < 1:
        return dict.fromkeys('ARNDCQEGHILKMFPSTWYV', 1)

    return {letter: round(amount / seen, 3)
            for letter, amount in totals.items()}
def bio_feat(record):
    """Return a feature vector for the protein sequence in *record*.

    Ambiguous residues are normalised first: 'X' is removed, 'U' becomes
    'C', 'B' becomes 'N' and 'Z' becomes 'Q'.

    Returns
    -------
    numpy.ndarray
        ``[length, molecular weight, isoelectric point, instability index]``
        followed by the secondary structure fractions, the amino acid counts
        and the amino acid percentages.
    """
    # Plain string replacement is enough.  The original round-tripped
    # through MutableSeq(...).toseq(), which is not available in every
    # Biopython release and never changed the resulting text.
    clean_seq = (str(record.seq)
                 .replace("X", "")
                 .replace("U", "C")
                 .replace("B", "N")
                 .replace('Z', 'Q'))

    # features
    seq_length = len(clean_seq)
    analysed_seq = ProteinAnalysis(clean_seq)
    molecular_weight = analysed_seq.molecular_weight()
    amino_percent = analysed_seq.get_amino_acids_percent().values()
    isoelectric_points = analysed_seq.isoelectric_point()
    count = analysed_seq.count_amino_acids().values()
    instability_index = analysed_seq.instability_index()
    secondary_structure_fraction = analysed_seq.secondary_structure_fraction()

    return np.array([seq_length, molecular_weight, isoelectric_points,
                     instability_index] +
                    list(secondary_structure_fraction) + list(count) +
                    list(amino_percent))
def protein_analysis():
    """Show the sequence form and store the analysis results in the session.

    Redirects to ``../account/log_in`` when ``session.username`` is not set.
    When the form is accepted, the cleaned upper-case sequence is analysed
    with Biopython's ``ProteinAnalysis``, each result is stored under its own
    session key, and the request is redirected to
    ``protein_analysis_output``.  Otherwise returns ``dict(form=form)``.
    """
    if session.username is None:
        redirect(URL(r=request, f='../account/log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    form = FORM(TABLE(
        TR("Amino acid sequence:  ",
           TEXTAREA(_type="text", _name="sequence",
                    requires=IS_NOT_EMPTY())),
        INPUT(_type="submit", _value="SUBMIT")))
    if form.accepts(request.vars, session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
def get_features(seq):
    """Get global features from a protein sequence.

    Parameters
    ----------
    seq : str
        protein sequence

    Return
    ----------
    dictionary:
        global features of the protein sequence.  Count, entropy and
        hydrophobicity features use the full sequence; the ProteinAnalysis
        features use the sequence with undefined symbols removed.
    """
    undefined = ('X', 'B', 'Z', "'", 'O', 'U')

    features = {}
    features['undefined_count'] = len([x for x in seq if x in undefined])
    features['length'] = len(seq)
    features['perc_undefined_count'] = (features['undefined_count'] /
                                        features['length'])
    features['entropy'] = entropy(seq)
    features['ideal_entropy'] = entropy_ideal(len(seq))
    features['perc_entropy'] = features['entropy'] / features['ideal_entropy']
    features['hydr_count'] = sum(1 for x in seq if x in hydrophobic_proteins)
    features['polar_count'] = sum(1 for x in seq if x in polar_proteins)
    features['buried'] = sum(buried[x] for x in seq
                             if x in hydrophobic_proteins)

    # Drop the undefined symbols before handing over to ProteinAnalysis.
    cleaned = ''.join(x for x in seq if x not in undefined)
    protein = ProteinAnalysis(cleaned)
    features['gravy'] = protein.gravy()
    features['molecular_weight'] = protein.molecular_weight()
    features['aromaticity'] = protein.aromaticity()
    features['instability_index'] = protein.instability_index()
    features['isoelectric_point'] = protein.isoelectric_point()
    helix, turn, sheet = protein.secondary_structure_fraction()
    features['helix'] = helix
    features['turn'] = turn
    features['sheet'] = sheet
    features.update(protein.count_amino_acids())
    return features
def GetFeatures(My_seq):
    """Return a dictionary of ProteinAnalysis features for *My_seq*.

    Keys: ``Molecular_weight``, ``Aromaticity``, ``Instability_index``,
    ``Isoelectric_point`` (single values), ``Flexibility`` (list),
    ``Second_structure_fraction`` (tuple of 3), ``Count_amino_acids`` and
    ``Amino_acids_percent`` (dicts with 20 entries).
    """
    Features = {}
    # The original built a second ProteinAnalysis object and discarded it.
    analysed_seq = ProteinAnalysis(My_seq)
    # Single-valued features
    Features["Molecular_weight"] = analysed_seq.molecular_weight()
    Features["Aromaticity"] = analysed_seq.aromaticity()
    Features["Instability_index"] = analysed_seq.instability_index()
    Features["Isoelectric_point"] = analysed_seq.isoelectric_point()
    # Multi-valued features
    Features["Flexibility"] = analysed_seq.flexibility()  # list
    Features["Second_structure_fraction"] = \
        analysed_seq.secondary_structure_fraction()  # 3-tuple
    Features["Count_amino_acids"] = analysed_seq.count_amino_acids()  # dict
    Features["Amino_acids_percent"] = \
        analysed_seq.get_amino_acids_percent()  # dict
    return Features
def on_enter(self, *args):
    """Update the screen with protein weight and BLAST identity on entry.

    Reads ``no_header_sequence.txt`` to compute the molecular weight shown
    in ``self.weight``.  Then reads ``my_blast.xml``: when it is empty, a
    failure message is shown in ``self.protname``; otherwise the part of the
    first hit's definition before the first ``'>'`` is shown.  When the
    BLAST record has no hits, ``self.protname`` is left unchanged.
    """
    # Read the header-less sequence to calculate Mw in kDa.
    with open("no_header_sequence.txt") as sequence_file:
        noHeader = sequence_file.read()
    print("noHeader: ", noHeader)
    analysed_seq = ProteinAnalysis(noHeader)
    Mw = analysed_seq.molecular_weight()  # Mw g/mol
    Mw_kDa = round(Mw / 1000, 3)  # Mw kDa
    print(analysed_seq.count_amino_acids())  # count for each amino acid
    self.weight.text = str(Mw_kDa) + " kDa"  # protein weight on the screen

    if os.stat('my_blast.xml').st_size == 0:
        # No XML content was produced.  Show the message; the original
        # built it but never displayed it.
        self.protname.text = ("BLAST search failed.\n"
                              "Check your FASTA file and try again.")
        return

    with open("my_blast.xml") as result_handle:
        blast_record = NCBIXML.read(result_handle)

    # Take only the first result: the first alignment that has an HSP.
    for alignment in blast_record.alignments:
        if not alignment.hsps:
            continue
        sequence_identity = alignment.hit_def
        print("hit_def:", alignment.hit_def)
        reduced_title = sequence_identity.split('>')[0]
        print(reduced_title)
        self.protname.text = reduced_title  # sequence identity on the screen
        break
global_counts_feats = pd.concat([d[str(i)] for i in range(len(d))],axis=1) global_counts_feats = global_counts_feats.fillna(0) global_counts_feats = global_counts_feats.T # In[525]: # Find the best split for local amino count aminoFirstCount=[] aminoLastCount=[] cnt = np.arange(10,60,10) for j in cnt: for i in range(len(sequences)): X=ProteinAnalysis(str(sequences[i][j:])) aminoFirstCount.append(X.count_amino_acids()) for j in cnt: for i in range(len(sequences)): X=ProteinAnalysis(str(sequences[i][:j])) aminoLastCount.append(X.count_amino_acids()) aminofirstchunk = [aminoFirstCount[i:i+len(sequences)] for i in range(0, len(aminoFirstCount), len(sequences))] aminofirst10 = pd.DataFrame(aminofirstchunk[0]) aminofirst10.columns = [str(cols)+'_first' for cols in aminofirst10.columns] aminofirst20 = pd.DataFrame(aminofirstchunk[1]) aminofirst20.columns = [str(cols)+'_first' for cols in aminofirst20.columns] aminofirst30 = pd.DataFrame(aminofirstchunk[2]) aminofirst30.columns = [str(cols)+'_first' for cols in aminofirst30.columns] aminofirst40 = pd.DataFrame(aminofirstchunk[3]) aminofirst40.columns = [str(cols)+'_first' for cols in aminofirst40.columns] aminofirst50 = pd.DataFrame(aminofirstchunk[4])
sequence = str(record.seq).replace('X', 'G') protein = ProteinAnalysis(str(sequence)) p_len.append(len(sequence)) mol_w.append(protein.molecular_weight()) iso_p.append(protein.isoelectric_point()) smell.append(protein.aromaticity()) taste_factor.append(protein.gravy()) insta_ind.append(protein.instability_index()) char_at_acid.append(protein.charge_at_pH(1)) char_at_neutral.append(protein.charge_at_pH(7)) char_at_base.append(protein.charge_at_pH(14)) helter_skeler.append(protein.secondary_structure_fraction()[0]) turnip.append(protein.secondary_structure_fraction()[1]) garfield.append(protein.secondary_structure_fraction()[2]) for x in amino_acids: n = protein.count_amino_acids()[x] for y in d_count.keys(): if y[-1] == x: d_count[y].append(n) for a in amino_acids: m = protein.get_amino_acids_percent()[a] for b in d_perc.keys(): if b[-1] == a: d_perc[b].append(m) #areas = get_area_classes(test_pdb) #polar_area.append(areas[0]) #apolar_area.append(areas[1]) #total_area.append(areas[2]) print('done') for values_count in d_count.values():
# Count the cysteine residues of every sequence in a FASTA file containing
# numerous sequences.
# To keep the program for general use, argparse and sys allow the input
# file to be user defined through the command line (default: stdin).
import argparse
import sys

parser = argparse.ArgumentParser(
    description='Count cysteine residues in protein sequences.')
parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
                    default=sys.stdin)
args = parser.parse_args()

# Read the FASTA file with a loop over entries using SeqIO, analyse each
# sequence with ProteinAnalysis and display the sequence name and its
# cysteine count.
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio import SeqIO

for record in SeqIO.parse(args.infile, "fasta"):
    seq = str(record.seq)
    my_prot = ProteinAnalysis(seq)
    aa_counts = my_prot.count_amino_acids()
    c_counts = aa_counts['C']
    # Call form of print works on both Python 2 and Python 3.
    print('{}\t {}'.format(record.id, c_counts))
# Interactive helper: analyse a manually entered protein sequence and print
# the properties the user asks for.
from Bio.SeqUtils.ProtParam import ProteinAnalysis

my_seq = str(input("manual sequence from translate.py :"))
analysed_seq = ProteinAnalysis(my_seq)

# Each optional property is printed only when the answer is exactly "y".
answer1 = str(input("detect molecular weight y/n? :"))
if answer1 == "y":
    print(analysed_seq.molecular_weight())

answer2 = str(input("detect gravy y/n? :"))
if answer2 == "y":
    print(analysed_seq.gravy())

# Amino acid counts are always shown.
print(analysed_seq.count_amino_acids())

# Wait for a final key press before exiting.
input("enter")
def get_sequence_count_aminoacids(self):
    """Return the amino acid count dictionary of ``self.sequence``."""
    # how to draw histogram
    return ProteinAnalysis(self.sequence).count_amino_acids()
class Peptide(PolyIon):
    """Peptide represents single protein chains in solution.

    Peptides properties are based entirely on analysis of the sequence of the
    peptide.
    """

    # Descriptions of the attributes that make up the object's state.
    _state = {'name': 'Name of the peptide.',
              'sequence': 'Amino acid sequence of the peptide.'
              }

    _sequence = None
    _analysis = None  # ProteinAnalysis of the sequence, set in __init__

    # TODO: move h to function or constants. Unify with pitts?
    # _h is the factor applied to the mobility in mobility(); _h_max and
    # _h_min are not used inside this class.
    _h_max = 1
    _h_min = 2./3.
    _h = 5./6.

    def __init__(self, name=None, sequence=None):
        """Store name and sequence and analyse the sequence.

        NOTE(review): reads ``self.sequence`` (no underscore), which is
        presumably a property provided by the base class -- confirm.
        """
        self._name = name
        self._sequence = sequence
        self._analysis = ProteinAnalysis(str(self.sequence))

    @property
    def molecular_weight(self):
        """Molecular weight as computed by ``SeqUtils.molecular_weight``."""
        return SeqUtils.molecular_weight(self.sequence, 'protein')

    def charge(self, pH=None, ionic_strength=None, temperature=None, moment=1):
        """Return the time-averaged charge of the peptide.

        The charge is raised to the power ``moment`` before it is returned.
        After context resolution only ``pH`` is used in the calculation.

        :param pH
        :param ionic_strength
        :param temperature
        :param moment: exponent applied to the charge (default 1)
        """
        pH, ionic_strength, temperature = \
            self._resolve_context(pH, ionic_strength, temperature)

        amino_acid_count = self._analysis.count_amino_acids()

        # Copy the shared pK tables so the terminal entries added below do
        # not leak into them.
        pos_pKs = dict(positive_pKs)
        neg_pKs = dict(negative_pKs)

        # Indexing the first / last residue raises IndexError for an empty
        # sequence.
        nterm = self.sequence[0]
        cterm = self.sequence[-1]

        # Use residue-specific terminal pKs when the tables provide them.
        if nterm in pKnterminal:
            pos_pKs['Nterm'] = pKnterminal[nterm]
        if cterm in pKcterminal:
            neg_pKs['Cterm'] = pKcterminal[cterm]

        # NOTE(review): _chargeR is a private method of IsoelectricPoint.
        charge = IsoelectricPoint(self.sequence,
                                  amino_acid_count)._chargeR(pH,
                                                             pos_pKs,
                                                             neg_pKs)
        return charge**moment

    def isoelectric_point(self, ionic_strength=None, temperature=None):
        """Return the isoelectric point of the peptide.

        Both parameters are currently ignored; the value comes straight from
        the stored ProteinAnalysis object.
        """
        # _, ionic_strength, temperature = \
        #     self._resolve_context(None, ionic_strength, temperature)
        return self._analysis.isoelectric_point()

    def volume(self):
        """Return the approximate volume of the folded peptide in m^3."""
        # Mass per molecule divided by density, then unit conversions.
        v = self.molecular_weight / avogadro / self.density() / lpm3 / gpkg
        return v

    def radius(self):
        """Return the approximate radius of the folded peptide in m."""
        # Radius of a sphere with the volume returned by volume().
        return (self.volume() * 3. / 4. / pi) ** (1. / 3.)

    def density(self):
        """Return the approximate density of the folded peptide in kg/L."""
        # NOTE(review): molecular_weight is passed to exp() unscaled; if the
        # constant 13 is meant for a mass in kDa the exponential term is
        # effectively zero for any real peptide -- confirm the units.
        return 1.410 + 0.145 * exp(-self.molecular_weight / 13.)

    def mobility(self, pH=None, ionic_strength=None, temperature=None):
        """Return the effective mobility of the ion in m^2/V/s.

        If a context solution is available, mobility uses the full
        Onsager-Fuoss correction to mobility. Otherwise, the Robinson-Stokes
        model is used.

        NOTE(review): the body below evaluates a single expression built from
        the charge, the solvent viscosity, the radius and the solvent's
        ``debye`` value, scaled by ``_h``; no branch between two models is
        visible here.

        :param pH
        :param ionic_strength
        :param temperature
        """
        pH, ionic_strength, temperature = \
            self._resolve_context(pH, ionic_strength, temperature)

        mobility = self.charge(pH) * elementary_charge /\
            (6 * pi * self._solvent.viscosity(temperature) * self.radius() *
             (1 + self.radius() /
              self._solvent.debye(ionic_strength, temperature)
              )
             ) * self._h

        return mobility
Y = {'C':9, 'H':11,'N':1, 'O':3, 'S':0} V = {'C':5, 'H':11,'N':1, 'O':2, 'S':0} dictOfAmino = {'A':A,'R':R,'N':N,'D':D,'C':C,'Q':Q, 'E':E, 'G':G,'H':H,'I':I,'L':L,'K':K,'M':M,'F':F,'P':P,'S':S,'T':T,'W':W,'Y':Y,'V':V} print "Note output file is appended if same file is selected twice molecular formulas \n for both runs will be present in output file" fileName = raw_input("Protein FASTA file to generate molecular formulas for: ") outFileName = raw_input("Output file name (include .txt): ") fasta_file = open(fileName, "rU") for record in SeqIO.parse(fasta_file, "fasta"): myseq = str(record.seq) analysis = ProteinAnalysis(myseq) listofaminoacids.append(analysis.count_amino_acids()) for i in listofaminoacids: carbonTotal = 0 hydrogenTotal = 0 oxygenTotal = 0 nitrogenTotal = 0 sulfurTotal = 0 peptideBonds = 0 for value in i: for amino in dictOfAmino: if value == amino:
"," + str(mol_w) + "," + str(ins) + "," + str(cnt) + "\n") else: with open(path_ + "\\data\\output\\svm_out.txt", "a+") as s: s.write("-1 " + ' '.join("{}:{}".format(k, v) for k, v in a.items()) + "\n") with open(pth + "weka_output.arff", "a+") as w: w.write(' '.join("{},".format(x) for x in list(aa_count.values())) + " loc\n") with open(pth + "tain_DL.csv", "a+") as DPL: DPL.write(''.join("{},".format(x) for x in list(aa_count.values())) + str(round(aromat, 3)) + "," + str(round(fraction[0], 3)) + "," + str(round(fraction[1], 3)) + "," + str(round(fraction[2], 3)) + "," + str(round(iso, 3)) + "," + str(mol_w) + "," + str(ins) + "," + "0" + "\n") for seq, cl in zip(seq_list, cls_list): # main loop to extract the features _ = ProteinAnalysis(seq) # Biopython protein analysis package aa_count = (_.count_amino_acids()) # amino acid count aromat, fraction, iso = _.aromaticity(), _.secondary_structure_fraction( ), _.isoelectric_point() try: mol_w, ins = ("%0.2f" % _.molecular_weight()), ("%0.2f" % _.instability_index()) except Exception: mol_w, ins = mol_w, ins # aromaticity, sec_strucure_fraction, iso_electric point , molecular weight, instability index format_output(aa_count, cl)
# Print a set of ProteinAnalysis results for every record of a FASTA file.
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
from Bio import SeqIO

with open('../../samples/pdbaa') as fh:
    for rec in SeqIO.parse(fh, 'fasta'):
        myprot = ProteinAnalysis(str(rec.seq))
        # Each property is computed and printed in turn, in this order.
        reports = (
            myprot.count_amino_acids,
            myprot.get_amino_acids_percent,
            myprot.molecular_weight,
            myprot.aromaticity,
            myprot.instability_index,
            myprot.flexibility,
            myprot.isoelectric_point,
            myprot.secondary_structure_fraction,
        )
        for compute in reports:
            print(compute())
        # Kyte-Doolittle scale profile with window 9 and edge weight 0.4.
        print(myprot.protein_scale(ProtParamData.kd, 9, .4))
def openfile():
    """Let the user pick a PDB file and analyse the peptide sequence in it.

    The function opens a Tk file dialog, parses the chosen file with
    ``PDBParser``, prints sequence-level properties (molecular weight,
    amino-acid counts, secondary-structure fraction), then prints
    per-window values for consecutive 10-residue windows and finally
    collects freesasa area selections.

    Nothing is returned; results are published through the module-level
    globals declared below (``m``, ``length``, ``c``, ``b``, ``hydro``,
    ``flex``, ``hydroph``, ``flexi``, ``sec``, ``acc``, ``access``,
    ``logacc`` ...).

    NOTE(review): the source of this function was whitespace-collapsed;
    the nesting shown here (in particular which statements sit inside the
    freesasa loop) is reconstructed and should be confirmed.
    """
    global prob, probab, te
    global my_seq
    global anti
    global structure, structure_id, filename
    global antigenicity, hydro, flex, sec
    global m, a, c, b, length, j, k
    global hydroph, flexi, access
    # Reset the accumulators that this call refills (or leaves empty).
    anti = []
    sec = []
    probab = []
    from tkinter import filedialog
    root = Tk()
    # Both filetype entries are identical ("pdb files", "*.pdb").
    root.filename = filedialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdb files", "*.pdb"), ("pdb files", "*.pdb")))
    filename = root.filename
    print(filename)
    # The structure id is fixed, independent of the file that was chosen.
    structure_id = "1e6j"
    structure = PDBParser().get_structure(structure_id, root.filename)
    ppb = PPBuilder()
    # my_seq is overwritten on every pass, so only the last peptide built
    # from the structure is kept for the analysis below.
    for pp in ppb.build_peptides(structure):
        my_seq = pp.get_sequence()  # type: Seq
        print(my_seq)
    for model in structure:
        for chain in model:
            print(chain)
    # Flatten the sequence object into a plain string of one-letter codes.
    sequence = list(my_seq)
    m = ''.join(sequence)
    print(m)
    length = len(m)  # type: int
    print("Sequence consist of", length, "Amino Acids")
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    analysed_seq = ProteinAnalysis(m)
    print("Molecular weight = ", analysed_seq.molecular_weight())
    print("Amino Acid Count = ", analysed_seq.count_amino_acids())
    print("Secondary structure fraction =",
          analysed_seq.secondary_structure_fraction())
    # Per-residue scale handed to protein_scale(); presumably the
    # Kyte-Doolittle hydropathy table given the name "kd" - confirm.
    kd = {
        'A': 1.8,
        'R': -4.5,
        'N': -3.5,
        'D': -3.5,
        'C': 2.5,
        'Q': -3.5,
        'E': -3.5,
        'G': -0.4,
        'H': -3.2,
        'I': 4.5,
        'L': 3.8,
        'K': -3.9,
        'M': 1.9,
        'F': 2.8,
        'P': -1.6,
        'S': -0.8,
        'T': -0.7,
        'W': -0.9,
        'Y': -1.3,
        'V': 4.2
    }
    # The same two profiles are computed three times each and stored under
    # different global names (c/flex/flexi and b/hydro/hydroph).
    c = list(analysed_seq.flexibility())
    b = list(analysed_seq.protein_scale(kd, 10, 1.0))
    hydro = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flex = list(analysed_seq.flexibility())
    hydroph = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flexi = list(analysed_seq.flexibility())
    # i counts windows; j + 1 and k + 1 are the slice bounds of the current
    # window, so the first window is m[0:10].
    i = 1
    j = -1  # type: int
    k = 9
    while i <= (length - 10):
        print("Sequence is = ", m[j + 1:k + 1])
        print("Flexibility value = ", c[j + 1])
        print("Hydrophilicity value = ", b[j + 1])
        ana_seq = ''.join(m[j + 1:k + 1])
        analyze_seq = ProteinAnalysis(ana_seq)
        # Secondary-structure analysis of the current window only.
        print("Secondary structure fraction =",
              analyze_seq.secondary_structure_fraction())
        a = list(analyze_seq.secondary_structure_fraction())
        # Keep only the first element of the fraction tuple (presumably the
        # helix fraction - confirm against the Biopython documentation).
        a = a[0]
        sec.append(a)
        i += 1
        j += 1
        k += 1
    f = length
    # r and y are the first and last residue numbers of the selection range.
    r = 1
    y = 10
    global acc, logacc
    acc = []
    for i in range(0, f):
        # Build a freesasa selection command such as
        # "accessibility, resi 1-10" for the current residue range.
        str1 = "accessibility, resi "
        str2 = str(r) + "-" + str(y)
        saving = str1 + str2
        print(saving)
        r = r + 1
        y = y + 1
        # NOTE(review): the structure is always loaded from the hard-coded
        # file "1e6j.pdb", not from the file picked in the dialog, and it is
        # reloaded and recalculated on every iteration.
        structure = freesasa.Structure("1e6j.pdb")
        resulta = freesasa.calc(structure)
        area_classes = freesasa.classifyResults(resulta, structure)
        print("Total : %.2f A2" % resulta.totalArea())
        for key in area_classes:
            print(key, ": %.2f A2" % area_classes[key])
        # Second calculation with explicit parameters; this result replaces
        # the one above and is the one used for the selections.
        resulta = freesasa.calc(
            structure,
            freesasa.Parameters({
                'algorithm': freesasa.LeeRichards,
                'n-slices': 10
            }))
        selections = freesasa.selectArea(('alanine, resn ala', saving),
                                         structure, resulta)
        for key in selections:
            print(key, ": %.2f A2" % selections[key])
            a = selections[key]
            acc.append(a)
    # Keep every second entry of acc, starting with index 0.
    # NOTE(review): which selection those entries belong to depends on the
    # iteration order of `selections` - confirm.
    l = acc[0::2]
    access = l
    print(acc)
    print(l)
    # Base-10 logarithm of each kept value; math.log raises ValueError for
    # a value of 0 (or below).
    logacc = [math.log(y, 10) for y in l]
    print(logacc)
# Convert the RTF file named by argv[1] to plain text, then write one CSV row
# (name, MW, EC, EC/MW) for every sequence line found in that text.
#
# Input layout, as recognised by the two patterns below:
#   * a line starting with digits followed by "." carries the entry name
#     (everything after the first ".");
#   * a line starting with at least 20 capital letters is a sequence and is
#     reported under the most recently seen name.
file_name = argv[1].split(".rtf")[0]

# NOTE(review): the path is concatenated into a shell command unquoted, so a
# file name containing spaces or shell metacharacters will break (or alter)
# the command. Left as-is because the interactive zsh may be relied on.
cmd = "textutil -convert txt " + argv[1]
call(['/bin/zsh', '-i', '-c', cmd])

string = file_name + ".txt"
ofile_str = argv[1] + "_params.csv"

# Context managers guarantee both files are closed, including on error; the
# original never closed the input file.
with open(string, 'r') as infile:
    call(["rm", ofile_str])
    with open(ofile_str, 'w') as ofile:
        ofile.write("name,MW,EC,EC/MW\n")
        for line in infile:
            if re.search(r'^[0-9]+\.', line):
                name = '.'.join(line.strip().split('.')[1:])
            if re.search(r'^[A-Z]{20}', line):
                # Drop surrounding backslashes/asterisks (same character set
                # as the original '\*' literal).
                my_seq = line.strip().strip('\\*')
                analysed_seq = ProteinAnalysis(my_seq)
                MW = analysed_seq.molecular_weight()
                # Count residues once instead of once per amino acid.
                aa_counts = analysed_seq.count_amino_acids()
                W = aa_counts['W']
                Y = aa_counts['Y']
                C = aa_counts['C']
                # Extinction-coefficient estimate from Tyr, Trp and Cys counts.
                EC = Y * 1490 + W * 5500 + C * 125
                EC_MW = EC / MW
                ofile.write(name + "," + str(MW) + "," + str(EC) + "," +
                            str(EC_MW) + '\n')
                # Single parenthesised argument: prints identically under
                # Python 2 and Python 3.
                print(name + " " + str(MW) + " " + str(EC) + " " + str(EC_MW))

# Open the finished CSV with the system default application.
call(["open", ofile_str])
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
import sys
import json

# Command-line helper: argv[1] is a JSON object with the keys
#   "Sequence" - the protein sequence to analyse
#   "Options"  - a collection naming the values to compute
#                ("MW", "EC280", "hasDisulfide", "PI", "AACont")
# The requested values are printed to stdout as one JSON object.
inp = json.loads(sys.argv[1])
seq = inp["Sequence"]
options = inp["Options"]
X = ProteinAnalysis(seq)
data = dict()

if "MW" in options:
    data["MW"] = X.molecular_weight()

if "EC280" in options:
    aa_count = X.count_amino_acids()
    data["EC280"] = 1490 * aa_count["Y"] + 5500 * aa_count["W"]
    if "hasDisulfide" in options:
        # A cystine is a PAIR of cysteines (125 per cystine), so an unpaired
        # residue must not contribute; 62.5 per cysteine over-counted whenever
        # the cysteine count was odd. 125.0 keeps the float result type.
        data["EC280"] += 125.0 * (aa_count["C"] // 2)

if "PI" in options:
    data["PI"] = X.isoelectric_point()

if "AACont" in options:
    ratios = X.get_amino_acids_percent()
    # Scale every per-residue value by 100 before reporting it.
    data["AACont"] = {aa: ratios[aa] * 100. for aa in ratios}

# Single parenthesised argument: works as a Python 2 print statement and as a
# Python 3 function call.
print(json.dumps(data))
def openfile():
    """Pick a PDB file and print an antigenicity index per 10-residue window.

    A Tk file dialog selects the file, ``PDBParser`` loads it and the last
    peptide built from the structure is analysed with ``ProteinAnalysis``.
    For every window start the function prints the window, its flexibility
    and hydrophilicity values and its secondary-structure fraction, then
    converts those into discrete scores and prints the weighted sum.

    Nothing is returned. Results are left in module-level globals:
    ``m`` (sequence string), ``c`` / ``b`` (flexibility / hydrophilicity
    profiles, overwritten in place with the discrete scores for every
    processed window), ``a``, ``tupleall`` and ``antigenicity`` (values of
    the last processed window).

    Fix over the previous version: the hydrophilicity binning used ``or``
    where a range test needs ``and``, so every value not above 0.5 received
    the score 1 and the -1 / -2 / 0 branches could never run.
    """
    global my_seq
    global antigenicity
    global m, a, c, b
    global tupleall
    from tkinter import filedialog
    root = Tk()
    root.filename = filedialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdb files", "*.pdb"), ("pdb files", "*.pdb")))
    print(root.filename)
    structure_id = "1e6j"
    structure = PDBParser().get_structure(structure_id, root.filename)
    ppb = PPBuilder()
    # my_seq is overwritten on each pass: only the last peptide is analysed.
    for pp in ppb.build_peptides(structure):
        my_seq = pp.get_sequence()  # type: Seq
        print(my_seq)
    for model in structure:
        for chain in model:
            print(chain)
    sequence = list(my_seq)
    m = ''.join(sequence)  # type: str
    print(m)
    length = len(m)  # type: int
    print(length)
    print("Sequence consist of", len(m), "Amino Acids")
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    analysed_seq = ProteinAnalysis(m)
    print("Molecular weight = ", analysed_seq.molecular_weight())
    print("Amino Acid Count = ", analysed_seq.count_amino_acids())
    print("Secondary structure fraction =",
          analysed_seq.secondary_structure_fraction())
    # Per-residue scale handed to protein_scale(); presumably the
    # Kyte-Doolittle hydropathy table given the name "kd" - confirm.
    kd = {
        'A': 1.8,
        'R': -4.5,
        'N': -3.5,
        'D': -3.5,
        'C': 2.5,
        'Q': -3.5,
        'E': -3.5,
        'G': -0.4,
        'H': -3.2,
        'I': 4.5,
        'L': 3.8,
        'K': -3.9,
        'M': 1.9,
        'F': 2.8,
        'P': -1.6,
        'S': -0.8,
        'T': -0.7,
        'W': -0.9,
        'Y': -1.3,
        'V': 4.2
    }
    c = list(analysed_seq.flexibility())
    b = list(analysed_seq.protein_scale(kd, 10, 1.0))

    # One pass per window start 0 .. length - 11, i.e. the same
    # (length - 10) windows the original i/j/k counters walked through.
    for start in range(length - 10):
        window = m[start:start + 10]
        print("Sequence is = ", window)
        print("Flexibility value = ", c[start])
        print("Hydrophilicity value = ", b[start])
        analyze_seq = ProteinAnalysis(window)
        # For Secondary structure Analysis
        print("Secondary structure fraction =",
              analyze_seq.secondary_structure_fraction())
        a = list(analyze_seq.secondary_structure_fraction())
        # Snapshot of the raw values, taken before they are replaced by
        # scores below (tupleall[3] is the same list object as `a`).
        tupleall = (window, c[start], b[start], a)
        print(tupleall[0], tupleall[2], tupleall[1], tupleall[3])

        # Secondary-structure score: 1 when the first fraction is at least
        # the second one, otherwise the second fraction itself.
        # NOTE(review): kept exactly as before; confirm this is intended.
        if a[0] >= a[1]:
            a[0] = 1
        else:
            a[0] = a[1]

        # For Hydrophilicity: bin the raw value into 2 / 1 / -1 / -2.
        # Values exactly on a boundary (0.5, 0, -0.4) fall through to 0.
        hydro_value = b[start]
        if hydro_value > 0.5:
            b[start] = 2
        elif 0 < hydro_value < 0.5:
            b[start] = 1
        elif -0.4 < hydro_value < 0:
            b[start] = -1
        elif hydro_value < -0.4:
            b[start] = -2
        else:
            b[start] = 0

        # For Flexibility: 1 above the 1.0 threshold, else 0.
        if c[start] > 1.0:
            c[start] = 1
        else:
            c[start] = 0

        # For antigenicity Index: weighted sum of the three scores plus a
        # constant 0.15 * 1 term.
        antigenicity = (0.3 * b[start] + 0.15 * 1 + 0.15 * c[start] +
                        0.2 * a[0])
        print("antigenicity", antigenicity)