def protein_analysis():
    """Show the protein-analysis form and compute ProtParam statistics.

    Visitors without ``session.username`` are redirected to the log-in
    function of the ``account`` controller.  When the posted form is
    accepted, the cleaned, upper-cased sequence and every statistic derived
    from it are stored in ``session`` and the request is redirected to
    ``protein_analysis_output``.

    Returns:
        dict: ``{'form': form}`` so the view can render the form.
    """
    # Identity comparison is the idiomatic None test and cannot be fooled
    # by a custom __eq__.
    if session.username is None:
        redirect(URL(r=request, c='account', f='log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    form = FORM(
        TABLE(
            TR("Amino acid sequence: ",
               TEXTAREA(_type="text", _name="sequence",
                        requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))
    if form.accepts(request.vars, session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
def prot_feats(filename):
    """Compute the feature vector of every usable record in a FASTA file.

    Args:
        filename: path or handle accepted by ``SeqIO.parse`` (FASTA format).

    Returns:
        tuple: ``(XX, ids)`` where ``XX`` is a NumPy array holding one
        ``prot_feats_seq`` vector per kept record and ``ids`` the matching
        record ids, in file order.
    """
    XX = []
    ids = []
    for rec in SeqIO.parse(filename, "fasta"):
        seq = str(rec.seq)
        try:
            # molecular_weight() throws an error if 'X' is in the sequence;
            # such sequences are skipped.
            ProteinAnalysis(seq).molecular_weight()
            feats = list(prot_feats_seq(seq))
        except Exception:
            # Still best effort, but no longer swallows KeyboardInterrupt /
            # SystemExit the way the bare ``except:`` did.
            continue
        XX.append(feats)
        ids.append(rec.id)
    return np.array(XX), ids
def binaryFeatureTable(PosSeqFiles, NegSeqFiles):
    """Build a two-class amino-acid composition table.

    Every record read by ``readfasta`` from the positive files is labelled
    ``Class`` 1, every record from the negative files ``Class`` 0.  Each row
    is the record's amino-acid percentage dict extended with ``Class``,
    ``Length`` and ``ID``.

    Returns:
        pandas.DataFrame: one row per record, positives first.
    """
    rows = []
    # Positive group first (label 1), then the negative group (label 0).
    for label, paths in ((1, PosSeqFiles), (0, NegSeqFiles)):
        for path in paths:
            for rec in readfasta(path):
                row = ProteinAnalysis(str(rec.seq)).get_amino_acids_percent()
                row['Class'] = label
                row['Length'] = len(rec.seq)
                row['ID'] = rec.id
                rows.append(row)
    return pd.DataFrame(rows)
def find_gravy_stats(folders, outfile, condition, regex = None, frequency = False):
    """Write per-folder mean CDR3 GRAVY values and their overall mean/std.

    For each entry of ``folders`` the file ``<entry[0]>/5_AA-sequences.txt``
    is read as tab-separated text.  Rows whose ``Functionality`` is
    ``'productive'`` and whose ``CDR3-IMGT`` satisfies ``condition``
    contribute their GRAVY score, weighted by a frequency.

    Args:
        folders: iterable of indexable items; item ``[0]`` is a directory.
        outfile: output prefix; ``<outfile>_means.txt`` receives one mean
            per line and ``<outfile>.txt`` the mean and standard deviation.
        condition: callable applied to the CDR3 string, truthy to keep it.
        regex: pattern matched against ``Sequence ID``; its first group is
            read as the integer frequency.  Only used when ``frequency``.
        frequency: weight rows by the extracted frequency instead of 1.

    Raises:
        TypeError / re.error: immediately, if ``frequency`` is set and
            ``regex`` is missing or invalid (previously every row failed
            silently and no mean was produced).
    """
    # Compile once instead of once per row; only needed in frequency mode.
    pat = re.compile(regex) if frequency else None
    mean_list = []
    for folder in folders:
        with open(folder[0] + '/5_AA-sequences.txt') as f:
            gravy_all = 0
            total_seqs = 0
            for row in csv.DictReader(f, delimiter = '\t'):
                try:
                    if row['Functionality'] == 'productive' and condition(row['CDR3-IMGT']):
                        gravy = Prot(row['CDR3-IMGT']).gravy()
                        if frequency:
                            freq = int(pat.match(row['Sequence ID']).group(1))
                        else:
                            freq = 1
                        total_seqs += freq
                        gravy_all += gravy * freq
                except Exception:
                    # Best effort: rows with missing columns, unparsable ids
                    # or residues the scorer rejects are skipped.
                    pass
        # A folder without usable rows contributes no mean (this used to be
        # hidden behind a swallowed ZeroDivisionError).
        if total_seqs:
            mean_list.append(gravy_all / float(total_seqs))
            print(mean_list)
    with open(outfile + '_means.txt', 'w') as out:
        for item in mean_list:
            out.write(str(item) + '\n')
    with open(outfile + '.txt', 'w') as out:
        out.write('mean CDR3 gravy,standard deviation\n')
        out.write(str(np.mean(mean_list)) + ',' + str(np.std(mean_list)))
def sample_protein(self):
    """Sample one concrete coding sequence from ``self.tribases``.

    For each of a tribase's positions a threshold ``r`` is drawn with
    ``int(100*random())+1`` and the first entry whose cumulative sum
    reaches ``r`` is selected (one-hot in ``codon``).

    Returns:
        tuple: ``(codons, gc, w)`` -- the element-wise sum of every sampled
        ``Tribase(...).codons`` list, ``GC`` of the sampled code, and the
        molecular weight of its translation (0 if it cannot be computed).
    """
    codons = len(self.parameters.b2c.codons) * [0]
    code = ""
    for tribase in self.tribases:
        codon = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]
        for pos, base in enumerate(tribase.bases):
            r = int(100 * random()) + 1
            cumsum = 0
            for j, weight in enumerate(base):
                cumsum += weight
                if cumsum >= r:
                    codon[pos][j] = 1
                    break
        t = Tribase(codon, self.parameters.b2c)
        code += translate_triplets(codon)
        codons = [i + j for i, j in zip(codons, t.codons)]
    PA = ProteinAnalysis(translate(code))
    gc = GC(code)
    try:
        w = PA.molecular_weight()
    except Exception:
        # Weight is undefined for some translations; report 0 as before,
        # but let KeyboardInterrupt/SystemExit propagate.
        w = 0
    return codons, gc, w
def get_biopython_features(X):
    """Return an ``(X.shape[0], 6)`` array of ProtParam features.

    Columns: molecular weight, instability index, isoelectric point,
    followed by the three values of ``secondary_structure_fraction()``.
    """
    res = np.zeros((X.shape[0], 6))
    for row, seq in enumerate(X):
        pa = ProteinAnalysis(seq)
        features = [pa.molecular_weight()]
        features.append(pa.instability_index())
        features.append(pa.isoelectric_point())
        features.extend(pa.secondary_structure_fraction())
        res[row] = np.array(features)
    return res
def test(self, positive_file, negative_file, sequence_position=10):
    """Evaluate ``self.clf`` on windows taken from two sequence files.

    Lines containing ``">"`` are ignored.  A line is used when the character
    at ``sequence_position`` equals ``self.amino_acid``; lines of the
    negative file are additionally dropped when they contain ``"X"`` or
    ``"U"``.  Positive windows are labelled 1, negative ones 0; the shuffled
    set is predicted and handed to ``report``.
    """
    # for my test files sequence position = 10
    test_features = []
    test_labels = []

    def collect(path, label, reject_ambiguous):
        # Featurise every qualifying line of *path* and tag it with *label*.
        with open(path) as handle:
            for line in handle:
                if ">" in line or line[sequence_position] != self.amino_acid:
                    continue
                if reject_ambiguous and ("X" in line or "U" in line):
                    continue
                window = ProteinAnalysis(
                    windower(line, sequence_position, self.window).strip("\t"))
                test_features.append(featurify(window, (2 * self.window + 1)))
                test_labels.append(label)

    collect(positive_file, 1, False)
    collect(negative_file, 0, True)
    paired = list(zip(test_features, test_labels))
    random.shuffle(paired)
    test_features, test_labels = zip(*paired)
    test_results = self.clf.predict(test_features)
    report(results=test_results, answers=test_labels, classy=self.clf)
def get_protein_features(seq):
    """Return ``(names, values)`` describing the corrected sequence.

    ``values`` lines up with ``names``: length, weight, pI, counts of
    negatively (D+E) and positively (K+R) charged residues, two extinction
    coefficients (without / with the cysteine term), instability index,
    gravy and the three secondary-structure fractions.
    """
    seq = correct(seq)
    analysis = ProteinAnalysis(seq)
    weight = molecular_weight(seq)
    pI = analysis.isoelectric_point()
    counts = analysis.count_amino_acids()
    negative = counts['D'] + counts['E']
    positive = counts['K'] + counts['R']
    ext_without_cys = counts['Y'] * 1490 + counts['W'] * 5500
    ext_with_cys = counts['Y'] * 1490 + counts['W'] * 5500 + counts['C'] * 125
    instability = instability_index(seq)
    gravy = hydrophobicity(seq)
    fractions = list(analysis.secondary_structure_fraction())

    names = [
        'length', 'weight', 'pI', 'neg_charged_residues',
        'pos_charged_residues', 'extinction_coeff1', 'extinction_coeff2',
        'instability_index', 'gravy', 'helix', 'turn', 'sheet'
    ]
    values = [
        len(seq), weight, pI, negative, positive, ext_without_cys,
        ext_with_cys, instability, gravy
    ]
    values.extend(fractions)
    return names, values
def feat_extract(sequences):
    """Return one ``defaultdict(float)`` of features per input sequence.

    Features: length, aromaticity, isoelectric point, molecular weight
    (only when the sequence contains none of X, O, U, B), per-residue
    relative frequencies, per-property residue counts (from the module's
    ``dic_properties``) and residue frequencies over a head and a tail
    slice, each divided by 50.
    """
    all_features = []
    for sequence in sequences:
        protein = ProteinAnalysis(sequence)
        feats = defaultdict(float)
        n_residues = len(sequence)
        feats["sequence_length"] = n_residues
        feats["aromaticty"] = protein.aromaticity()
        feats["isoeletric_point"] = protein.isoelectric_point()
        if not any(code in sequence for code in ('X', 'O', 'U', 'B')):
            feats["molecular_weight"] = protein.molecular_weight()
        for letter in sequence:
            feats["relative_fre_%s" % letter] += 1 / n_residues
            for prop_name, members in dic_properties.items():
                if letter in members:
                    feats['freq_%s' % prop_name] += 1
        for letter in sequence[0:50]:
            feats["relative_fre_start%s" % letter] += 1 / 50
        # NOTE(review): this slice takes 50 residues but leaves out the very
        # last one -- confirm whether sequence[-50:] was intended.
        for letter in sequence[-51:-1]:
            feats["relative_fre_end%s" % letter] += 1 / 50
        all_features.append(feats)
    return all_features
def calc_isoelectric_point(self) -> float:
    """Return the isoelectric point of the sequence given by ``get_seq()``.

    Delegates to Biopython's ``ProteinAnalysis.isoelectric_point``
    (biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam-pysrc.html).

    :return: the sequence's isoelectric point
    """
    return ProteinAnalysis(self.get_seq()).isoelectric_point()
def get_secondary_structure(self):
    """Return the helix, turn and sheet fractions of ``self.sequence``.

    Each of the first three values of ``secondary_structure_fraction()`` is
    returned as a string formatted with two decimals.
    """
    fractions = ProteinAnalysis(self.sequence).secondary_structure_fraction()
    helix = format(fractions[0], "0.2f")
    turn = format(fractions[1], "0.2f")
    sheet = format(fractions[2], "0.2f")
    return helix, turn, sheet
def _toPeptide(sequence, molecule, genetic_code=1, to_stop=True):
    '''
    Private function - Takes a sequence (DNA/RNA/amino acid) and processes
    it according to its molecule type to return a ProteinAnalysis object.

    @param sequence String: Nucleotide (DNA/RNA) or amino acid sequence.
    @param molecule String: Defines the type of molecule (case-insensitive).
    Three options are allowed: 'peptide' for amino acid sequences, 'DNA' for
    DNA sequences (requires transcription and translation), and 'RNA' for
    RNA sequence (requires translation).
    @param genetic_code Integer: Genetic code number to be used for
    translation. Default = 1 (Standard Code). For more information, see
    <https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>
    @param to_stop Boolean: Flag to stop translation when first stop codon
    is encountered. Default = True.
    @return: Bio.SeqUtils.ProtParam.ProteinAnalysis object
    @raise ValueError: if molecule is not 'peptide', 'DNA' or 'RNA'
    (previously this fell through to an UnboundLocalError).
    '''
    kind = molecule.lower()
    if kind == 'peptide':
        return ProteinAnalysis(sequence)
    if kind == 'rna':
        rna = Seq(str(sequence), generic_rna)
    elif kind == 'dna':
        rna = Seq(str(sequence), generic_dna).transcribe()
    else:
        raise ValueError(
            "molecule must be 'peptide', 'DNA' or 'RNA', got %r" % (molecule,))
    peptide = rna.translate(genetic_code, to_stop=to_stop)
    return ProteinAnalysis(str(peptide))
def print_features(fasta_file, data_dict, annot):
    """Append per-record features from a FASTA file to ``data_dict``.

    ``annot`` is mapped to 1 for ``"coding"`` and 0 for ``"noncoding"``
    (any other value is stored unchanged).  For each record the longest ORF
    reported by ``FindCDS`` is translated, trailing/leading ``*`` stripped,
    and the lists under ``readID``, ``class``, ``len``, ``orflen``, ``pI``
    and ``GC%`` are extended.

    Returns:
        The same ``data_dict`` object, mutated in place.
    """
    label = annot
    if annot == "coding":
        label = 1
    elif annot == "noncoding":
        label = 0
    for record in SeqIO.parse(fasta_file, "fasta"):
        dna = record.seq.upper()
        cds, _ = FindCDS(dna).longest_orf()
        analysis = PA(str(cds.translate().strip("*")))
        protein = analysis.sequence
        # An empty ORF has no meaningful pI; 0.0 is stored instead.
        pI = analysis.isoelectric_point() if len(protein) > 0 else 0.0
        gc_percent = (dna.count("G") + dna.count("C")) * 100.0 / len(dna)
        data_dict["readID"].append(record.id)
        data_dict["class"].append(label)
        data_dict["len"].append(len(dna))
        data_dict["orflen"].append(len(protein))
        data_dict["pI"].append(pI)
        data_dict["GC%"].append(gc_percent)
    return data_dict
def my_own_filtering(input_file, output_file, filt_gc=45, filt_arom=0.01):
    """Filter nucleotide FASTA records by GC content and aromaticity.

    A record is kept when ``SeqUtils.GC`` of its sequence is at least
    ``filt_gc`` and the aromaticity of its translation at least
    ``filt_arom``.  The translations of the kept records are written as
    FASTA to ``output_file`` and the kept fraction is printed.

    Fixes over the previous version: ``record.se`` (AttributeError) is now
    ``record.seq``; the result goes to ``output_file`` instead of the
    hard-coded ``'my_fasta'``; the output handle is closed on errors; an
    empty input no longer divides by zero.
    """
    proteins = {}  # record id -> translated sequence (last duplicate wins)
    total = 0
    with open(input_file, "r") as content:
        for record in SeqIO.parse(content, "fasta"):
            total += 1
            # calculate GC content using Bio
            calc_gc = SeqUtils.GC(record.seq)
            # calculate aromaticity using Bio
            prot_seq = record.seq.translate()
            calc_arom = ProteinAnalysis(str(prot_seq)).aromaticity()
            if calc_gc >= filt_gc and calc_arom >= filt_arom:
                proteins[record.id] = prot_seq
    # write a new fasta file with aminoacids
    records = [SeqRecord(prot_seq, id=seq_id, description="")
               for seq_id, prot_seq in proteins.items()]
    with open(output_file, 'w') as write_file:
        SeqIO.write(records, write_file, 'fasta')
    # print the fraction of records that passed the filter
    print(len(records) / total if total else 0.0)
def protparams(aa_seq, vstarts, vstops):
    """Compute parameters for every ``aa_seq[start:stop]`` construct.

    These help assess the potential of a polypeptide as a crystallization
    candidate.  Every pair with ``int(start) < int(stop)`` is evaluated, in
    ``vstarts``-major order.

    Returns:
        tuple: three parallel lists ``(MWs, pIs, epsilons)`` -- molecular
        weight in kDa rounded to 1 decimal, isoelectric point rounded to 1
        decimal, and the extinction coefficient divided by the molecular
        weight in Da, rounded to 2 decimals.
    """
    MWs = []
    pIs = []
    epsilons = []
    for start in vstarts:
        start = int(start)
        for stop in vstops:
            stop = int(stop)
            if start >= stop:
                continue
            params = PA(aa_seq[start:stop])  # works with string or Seq objects
            mw_da = params.molecular_weight()
            # From protparam (web.expasy.org/protparam):
            # Epsilon = N(Tyr)*Ext(Tyr) + N(Trp)*Ext(Trp) + N(Cystine)*Ext(Cystine)
            # NOTE(review): the cysteine count is used where the formula
            # names cystines -- confirm this is intended.
            aa_dict = params.count_amino_acids()  # {'aa': count}, one-letter codes
            extinction = (aa_dict['Y'] * 1490 + aa_dict['W'] * 5500 +
                          aa_dict['C'] * 125)
            # Divide by the exact mass in Da.  The previous code divided by
            # the weight already rounded to 0.1 kDa, which lost precision
            # and raised ZeroDivisionError when it rounded to 0.0.
            epsilon = round(extinction / mw_da, 2)
            MWs.append(round(mw_da / 1000, 1))
            pIs.append(round(params.isoelectric_point(), 1))
            epsilons.append(epsilon)
    return MWs, pIs, epsilons
def sequence_vector(temp_window: str, window: int = 6, chemical=1):
    """Encode a sequence window as a vector of integers.

    The sequence is passed through ``clean`` and ``windower`` (centred on
    its midpoint, ``window`` residues per wing).  Each amino acid is
    represented by an int; 0 represents nonstandard amino acids and is also
    used as padding so the residue part always has ``2*window + 1`` entries.

    Args:
        temp_window: amino-acid sequence.
        window: wing size on each side of the centre.
        chemical: when 1, gravy, instability index and aromaticity of the
            window are appended after the residue codes.

    Returns:
        list: residue codes followed by the optional chemical features.
    """
    temp_window = clean(temp_window)
    temp_window = windower(sequence=temp_window,
                           position=int(len(temp_window) * .5),
                           wing_size=window)
    aa = {"G": 1, "A": 2, "L": 3, "M": 4, "F": 5, "W": 6, "K": 7, "Q": 8,
          "E": 9, "S": 10, "P": 11, "V": 12, "I": 13, "C": 14, "Y": 15,
          "H": 16, "R": 17, "N": 18, "D": 19, "T": 20, "X": 0}
    # Residues outside the table map to 0, as documented, instead of
    # raising KeyError.
    vec = [aa.get(residue, 0) for residue in temp_window]
    # Pad short windows (sequence heads/tails) up to the fixed width.
    vec.extend([0] * ((window * 2) + 1 - len(vec)))
    # Hydrophobicity is optional
    if chemical == 1:
        s = ProteinAnalysis(temp_window)
        vec.append(s.gravy())
        vec.append(s.instability_index())
        vec.append(s.aromaticity())
    return vec
def calculate_residue_features(temp_dict, sequence):
    """Add composition-weighted scale averages of ``sequence`` to ``temp_dict``.

    Each amino-acid fraction is multiplied by its value in the module-level
    scales ``kd``, ``hw``, ``em`` and ``ja`` and summed.  The four sums and
    the fractions themselves are merged into ``temp_dict`` in place; nothing
    is returned.
    """
    aa_percent = ProteinAnalysis(sequence).get_amino_acids_percent()
    hydrophobicity = 0
    hydrophilicity = 0
    surface_probability = 0
    transfer_energy = 0
    for residue, fraction in aa_percent.items():
        hydrophobicity += fraction * kd[residue]
        hydrophilicity += fraction * hw[residue]
        surface_probability += fraction * em[residue]
        transfer_energy += fraction * ja[residue]
    scale_features = {
        "Hydrophobicity": hydrophobicity,
        "Hydrophilicity": hydrophilicity,
        "Surface Fractional Probability": surface_probability,
        "I2S Transfer Energy Scale": transfer_energy
    }
    temp_dict.update(scale_features)
    temp_dict.update(aa_percent)
def analyzeCleaves(self):
    """Enumerate digestion peptides of ``self.peptide`` into ``self.dpeps``.

    For every cleave site ``i`` and every number of missed cleavages ``j``
    (0 .. ``self.misses``) the peptide between site ``i`` and site
    ``i+j+1`` is cut out.  Each peptide accepted by ``checkLenWeight`` is
    appended to ``self.dpeps`` as
    ``[peptide, length, molecular weight, j, left flank, right flank,
    "start-end"]``.
    """
    #i used to iterate through cleave sites
    #j used to iterate for miss cleaves. Skips j cleave site(s) when calculating the peptide from cleave sites
    for i in range(len(self.sites)):
        end = False
        for j in range(self.misses+1):
            # everything up to and including cleave site i
            l = self.peptide[:self.sites[i]+1]
            try:
                # r: remainder after site i+j+1; dp: peptide between the two sites
                r = self.peptide[self.sites[i+j+1]+1:]
                dp = self.peptide[self.sites[i]+1:self.sites[i+j+1]+1]
            except IndexError:
                #When code reaches this block, it means the end of the input string has been found
                #Set end to true to stop going through missed cleaves, no more exist
                r = ''
                dp = self.peptide[self.sites[i]+1:]
                end = True
            if i == 0:
                # the leading peptide (start of the input up to site i+j) is
                # only considered while processing the first cleave site
                l = self.peptide[:self.sites[i+j]+1]
                if self.checkLenWeight(l):
                    self.dpeps.append([l,len(l),ProteinAnalysis(str(l)).molecular_weight(),j,'',dp+r,str(1)+'-'+str(len(l))])
            if self.checkLenWeight(dp):
                self.dpeps.append([dp,len(dp),ProteinAnalysis(str(dp)).molecular_weight(),j,l,r,str(self.sites[i]+2)+'-'+str(self.sites[i]+len(dp)+1)])
            if end:
                break
def normal_charge_properties(self):
    """Print the amino-acid composition of selected in-motif sequence.

    Reads the tab-separated table at ``self.train_fpi``, keeps rows with
    ``y == 0`` and, for each ``Sequence`` passing every filter below,
    concatenates its in-motif part.  The percentage composition of the
    concatenation is printed.

    Filters, in order: more than 20 in-motif k-mers; NCPR strictly between
    -0.1 and 0.1; delta below 0.1; normalised score above 20; FCR below 0.2.
    """
    table = pd.read_csv(self.train_fpi, sep='\t', index_col=0)
    negatives = table[table['y'] == 0]
    pooled_in_motif = ''
    for seq in list(negatives['Sequence']):
        ms = motif_seq.LcSeq(seq, self.k, self.lca, 'lca')
        in_seq, out_seq = ms.seq_in_motif()
        in_kmer, out_kmer = ms.overlapping_kmer_in_motif()
        if not (len(in_kmer) > 20):
            continue
        ka = kappa.KappaKmers(out_kmer, out_seq)
        delta = ka.deltaForm()
        if not (ka.NCPR() > -0.1 and ka.NCPR() < 0.1):
            continue
        if not (delta < 0.1):
            continue
        scorer = norm_score.NormScore()
        score = scorer.lc_norm_score([seq])[0]
        if not (score > 20):
            continue
        if ka.FCR() < 0.2:
            pooled_in_motif += in_seq
    composition = ProteinAnalysis(pooled_in_motif).get_amino_acids_percent()
    print(composition)
def aa_comp_calc():
    """Write the amino-acid composition of every input record as TSV.

    Uses the module-level ``args``: records are parsed from ``args.input``
    in format ``args.in_format`` and ``<args.output>/aa_comp.tsv`` gets a
    header plus one row per record (record id followed by one fraction per
    residue).  Fractions are counts divided by the sequence length
    excluding ``-``, ``X`` and ``*``.  A record with no countable residue
    gets ``0`` in every column instead of raising ZeroDivisionError.
    """
    peptides = [
        'A', 'G', 'P', 'S', 'T', 'C', 'F', 'W', 'Y', 'H', 'R', 'K', 'M', 'I',
        'L', 'V', 'N', 'D', 'E', 'Q'
    ]
    os.makedirs(args.output, exist_ok=True)
    with open(args.input, 'r') as infile, open(f'{args.output}/aa_comp.tsv', 'w') as outfile:
        outfile.write('Taxon\t' + '\t'.join(peptides) + '\n')
        # Reads in input file
        for record in SeqIO.parse(infile, format=args.in_format):
            seq = str(record.seq)
            count_dict = ProteinAnalysis(seq).count_amino_acids()
            length = len(seq.replace("-", "").replace("X", "").replace("*", ""))
            fractions = []
            for pep in peptides:
                if length and pep in count_dict:
                    fractions.append(str(float(count_dict[pep]) / length))
                else:
                    fractions.append('0')
            # The row is written in one go so a failing record cannot leave
            # a dangling id in the output.
            outfile.write(f'{record.id}\t' + '\t'.join(fractions) + '\n')
def prot_feats_seq(seq):
    """Return the concatenated, L2-normalised feature vector of ``seq``.

    Three blocks are normalised independently and concatenated: the
    percentages of the 20 standard amino acids, the ``twomerFromSeq``
    vector and the ``threemerFromSeq`` vector.
    """
    aa = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
          'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

    def unit_row(values):
        # L2-normalise a 1-D vector via a single-row 2-D view.
        return normalize(np.atleast_2d(values), norm='l2', copy=True,
                         axis=1, return_norm=False)[0]

    text = str(seq)
    X = ProteinAnalysis(text)
    X.molecular_weight()  # throws an error if 'X' in sequence; callers skip such sequences
    percents = X.get_amino_acids_percent()
    features = []
    features.extend(unit_row(np.array([percents[a] for a in aa])))
    features.extend(unit_row(np.array(twomerFromSeq(text))))
    features.extend(unit_row(np.array(threemerFromSeq(text))))
    return np.array(features)
def percentages_from_proteins(path):
    """Collect amino-acid percentages and annotations from a GenBank file.

    One entry is produced per feature of every record.

    Returns:
        tuple: ``(percents, names_list, sources_list, desc_list, taxo_list,
        keyw_list, taxid_list, sequence_list)`` where ``percents`` is an
        ``(n, 20)`` array in the column order A, C, D, E, F, G, H, I, K, L,
        M, N, P, Q, R, S, T, V, W, Y and ``sequence_list`` holds the
        percentage dicts it was built from.
    """
    names_list = []
    sequence_list = []
    sources_list = []
    desc_list = []
    taxo_list = []
    keyw_list = []
    taxid_list = []
    # The context manager closes the file (it was previously never closed).
    with open(path) as handle:
        for record in parse(handle, "genbank"):
            cdsnum = 0
            for feat in record.features:
                # NOTE(review): the whole record sequence is analysed for
                # each feature, not the feature's own span -- confirm.
                analysed_seq = ProteinAnalysis(str(record.seq))
                sequence_list.append(analysed_seq.get_amino_acids_percent())
                names_list.append(str(record.name) + "_CDS#" + str(cdsnum))
                sources_list.append(record.annotations['source'])
                keyw_list.append(record.annotations['keywords'])
                taxo_list.append(record.annotations['taxonomy'])
                desc_list.append(record.description)
                taxid_list.append(record.annotations["organism"])
                cdsnum += 1
    # List of dictionaries to the numpy array
    aas = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
           'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    percents = np.zeros((len(sequence_list), 20))
    for i, percdict in enumerate(sequence_list):
        percents[i] = [percdict[a] for a in aas]
    return percents, names_list, sources_list, desc_list, taxo_list, keyw_list, taxid_list, sequence_list
def seq_properties(file_path):
    """Summarise the amino-acid classes of the single record in a FASTA file.

    Args:
        file_path [str]: Path of the FASTA file (exactly one record).

    Returns:
        total_percent_dict [dict]: class name mapped to the value that
        ``count_prop`` derives from the amino-acid percentages for that
        class's residue set.
    """
    record = SeqIO.read(file_path, 'fasta')
    percents = ProteinAnalysis(str(record.seq)).get_amino_acids_percent()
    return {
        "Acidic": count_prop(percents, acidic_aa),
        "Basic": count_prop(percents, basic_aa),
        "Hydroxilic": count_prop(percents, hydroxylic_aa),
        "Amidic": count_prop(percents, amidic_aa),
        "Aliphatic": count_prop(percents, aliphatic_aa),
        "Aromatic": count_prop(percents, aromatic_aa)
    }
def transform(self, X):
    """Return the amino-acid percentage matrix of the sequences in ``X``.

    Row ``i`` holds, for every entry of ``VALID_AMINO_ACIDS`` in order, the
    fraction of that residue in ``str(X[i])`` (0.0 when not reported).
    """
    vec = np.zeros((len(X), len(VALID_AMINO_ACIDS)))
    for i in range(len(X)):
        # Fetch the percentage dict once per sequence rather than once per
        # amino acid.
        percents = ProteinAnalysis(str(X[i])).get_amino_acids_percent()
        for j, a in enumerate(VALID_AMINO_ACIDS):
            vec[i, j] = percents.get(a, 0.0)
    return vec
def aa_composition(seq):
    """Return ``"<residue>:\\t<count>"`` strings for the 20 amino acids.

    The entries follow the fixed order A, C, E, D, G, F, I, H, K, M, L, N,
    Q, P, S, R, T, W, V, Y.  Because the joined text ends with a comma
    before it is split, the returned list ends with an empty string.
    """
    counts = ProteinAnalysis(seq).count_amino_acids()
    order = "ACEDGFIHKMLNQPSRTWVY"
    joined = "".join('%s:\t%i,' % (residue, counts[residue])
                     for residue in order)
    return joined.split(",")
def getProps(f):
    """Return properties of the structure loaded from ``f`` with ``myPDB.loader``.

    Returns:
        tuple: molecular weight of the loaded sequence, the maximum of its
        Biopython flexibility profile, and the sum of the loader's ``ASA``.
    """
    loaded = myPDB.loader(f)
    analysis = ProteinAnalysis(loaded.seq)
    weight = analysis.molecular_weight()
    peak_flexibility = np.max(analysis.flexibility())
    total_asa = np.sum(loaded.ASA)
    return weight, peak_flexibility, total_asa
def analyze(seq, name):
    """Print ``name``, then the isoelectric point and amino-acid percentages of ``seq``."""
    protein = ProteinAnalysis(seq)
    print(name)
    for label, compute in (("pI: ", protein.isoelectric_point),
                           ("AA percent: ", protein.get_amino_acids_percent)):
        print(label)
        print(compute())
def download(filelist: list, q: Queue, lock: Lock, cursor: sqlite3.Cursor, conn: sqlite3.Connection, dir_name: str):
    """Download PDB entries and record a summary row for each in ``Structures``.

    For every id the PDB file is retrieved into ``<dir_name>/<id>/``, its
    header is read, residues and integer molecular masses of the built
    peptides are summed, and one row is inserted while ``lock`` is held.
    Each stored id is put on ``q``; ``None`` is put on ``q`` at the end.
    ``status_tmp.txt`` lists the ids handled so far and is removed when
    the loop finishes.

    :param filelist: PDB ids to process
    :param q: queue receiving each stored id and a final ``None``
    :param lock: lock guarding the database cursor/connection
    :param cursor: cursor used for the INSERT
    :param conn: connection committed after each successful INSERT
    :param dir_name: base directory for the downloaded files
    """
    status_path = 'status_tmp.txt'
    # Values are bound as parameters: titles containing quotes no longer
    # break (or inject into) the statement.
    insert_sql = ("INSERT INTO Structures (IDPDB, NAME, HEAD, METHOD, "
                  "RESOLUTION, NCOMP, NCHAIN, NRES, MMASS, EC) "
                  "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
    with open(status_path, 'w') as f:
        f.write('')
    for file in filelist:
        # Strip the newlines before comparing; the raw readlines() entries
        # never matched an id, so the skip never triggered.
        with open(status_path) as f:
            done = {line.rstrip('\n') for line in f}
        if file in done:
            continue
        pdb_dir = os.path.join(dir_name, file)
        pdb_path = os.path.join(pdb_dir, 'pdb{:s}.ent'.format(file))
        PDBList().retrieve_pdb_file(file, pdir=pdb_dir, file_format='pdb')
        if not os.path.exists(pdb_path):
            print("File with ID PDB: {:s} not found!".format(file))
            continue
        parser = PDBParser()
        # Use the real id (the literal '{:s}' was passed before).
        structure = parser.get_structure(file, pdb_path)
        header = parser.header
        name = header.get('name', '')
        head = header.get('head', '')
        method = header.get('structure_method', '')
        try:
            res = round(float(header.get('resolution', '')), 2)
        except (TypeError, ValueError):
            # No usable resolution in the header: store NULL instead of
            # crashing while formatting the statement.
            res = None
        ncomp = 0
        nchain = 0
        eclist = []
        for values in header['compound'].values():
            ncomp += 1
            nchain += len(values['chain'].split(','))
            eclist.append(values.get('ec', '') or values.get('ec_number', ''))
        ec = ", ".join(eclist)
        nres = 0
        mmass = 0
        for pp in PPBuilder().build_peptides(structure):
            seq = pp.get_sequence()
            nres += len(seq)
            mmass += int(ProteinAnalysis(str(seq)).molecular_weight())
        with lock:
            try:
                cursor.execute(insert_sql, (file, name, head, method, res,
                                            ncomp, nchain, nres, mmass, ec))
            except sqlite3.DatabaseError as err:
                print("Error: ", err)
                # As before, a failed insert is not recorded in the status file.
                continue
            print("Download Done for ID PDB: {:s}".format(file))
            conn.commit()
            q.put(file)
        with open(status_path, 'at') as f:
            f.write((file + '\n'))
    os.remove(status_path)
    q.put(None)
def get_gravy_list(self):
    """Return the normalised GRAVY values of the sequences in ``self.df.index``.

    Each score is formatted with six decimals (so the array handed to
    ``self.normalize`` is built from strings, not floats).
    """
    formatted = ["{:.6f}".format(ProteinAnalysis(seq).gravy())
                 for seq in self.df.index]
    return self.normalize(np.array(formatted))
def get_aa_percentage_vectors(X):
    """Return an ``(X.shape[0], 20)`` array of amino-acid percentages.

    Columns follow the module-level ``aas`` ordering so every row uses the
    same residue order.
    """
    res = np.zeros((X.shape[0], 20))
    for row, seq in enumerate(X):
        percents = ProteinAnalysis(seq).get_amino_acids_percent()
        # Index by ``aas`` to ensure the same order every time.
        res[row] = pd.Series(percents)[aas].values
    return res
def test_alternative_weights(self):
    "Test Lanthipeptide.alt_weights"
    self.lant.core = "MAGICHATS"
    average_weight = ProteinAnalysis("MAGICHATS",
                                     monoisotopic=False).molecular_weight()
    # One Ser/Thr is assumed to be dehydrated, but not the other
    expected = average_weight - 18.02
    self.assertEqual([expected], self.lant.alternative_weights)
def getMF(subSeq):
    """Return the molecular formula of peptide ``subSeq`` as ``C..H..N..O..S..``.

    Atom counts of the free amino acids are summed per residue and then
    corrected for the water lost in each peptide bond (residues - 1 bonds:
    two hydrogens and one oxygen per bond).
    """
    # (C, H, N, O, S) atoms of each free amino acid
    atoms = {
        'A': (3, 7, 1, 2, 0), 'R': (6, 14, 4, 2, 0), 'N': (4, 8, 2, 3, 0),
        'D': (4, 7, 1, 4, 0), 'C': (3, 7, 1, 2, 1), 'Q': (5, 10, 2, 3, 0),
        'E': (5, 9, 1, 4, 0), 'G': (2, 5, 1, 2, 0), 'H': (6, 9, 3, 2, 0),
        'I': (6, 13, 1, 2, 0), 'L': (6, 13, 1, 2, 0), 'K': (6, 14, 2, 2, 0),
        'M': (5, 11, 1, 2, 1), 'F': (9, 11, 1, 2, 0), 'P': (5, 9, 1, 2, 0),
        'S': (3, 7, 1, 3, 0), 'T': (4, 9, 1, 3, 0), 'W': (11, 12, 2, 2, 0),
        'Y': (9, 11, 1, 3, 0), 'V': (5, 11, 1, 2, 0),
    }
    counts = ProteinAnalysis(subSeq).count_amino_acids()
    carbon = hydrogen = nitrogen = oxygen = sulfur = 0
    residues = 0
    for residue, n in counts.items():
        if residue not in atoms:
            continue
        c, h, n_atoms, o, s = atoms[residue]
        residues += n
        carbon += n * c
        hydrogen += n * h
        nitrogen += n * n_atoms
        oxygen += n * o
        sulfur += n * s
    # Correcting totals for peptide bond loss of water
    bonds = residues - 1
    hydrogen -= bonds * 2
    oxygen -= bonds
    return "C{}H{}N{}O{}S{}".format(carbon, hydrogen, nitrogen, oxygen, sulfur)
def generate_plot(key, my_seq):
    """Plot the windowed ``amino_acids`` scale of ``my_seq`` with its peaks.

    The profile is computed with a 21-residue window, extrema are located
    with ``peakdetect`` and the figure is saved to ``figs/<key>.png``.
    Diagnostic values are printed along the way.

    All half-window offsets use floor division: with ``/`` they become
    floats on Python 3, which breaks ``range()`` and list indexing.
    """
    analysed_seq = ProteinAnalysis(my_seq)
    window_size = 21
    half = (window_size + 1) // 2
    scale = analysed_seq.protein_scale(param_dict=amino_acids,
                                       window=window_size, edge=0.75)
    x = list(range(half, len(scale) + half))
    lookahead = 7
    minp, maxp = peakdetect(scale, lookahead=(lookahead + 1) // 2)
    start = min(x) - 1
    # First list returned by peakdetect (plotted red, printed as "maxs")
    xpeaks = [xp[0] + half for xp in minp]
    ypeaks = [scale[xpi - half] for xpi in xpeaks]
    t_x = np.array(scale)
    added_min = np.where(t_x < 0.9)[0]
    print(added_min)
    # Second list returned by peakdetect (plotted green, printed as "mins")
    xdpeaks = [xdp[0] + half for xdp in maxp]
    ydpeaks = [scale[xdpi - half] for xdpi in xdpeaks]
    num_pos = np.where(np.array(ydpeaks) < 0.9)[0].size
    print(num_pos)
    if num_pos == 0 and len(added_min) != 0:
        # No detected peak lies below 0.9: add one point taken from the
        # positions whose score is below 0.9.
        # NOTE(review): the index arithmetic below is kept as-is; confirm
        # the "-start+2" offset is intended.
        added_val = [scale[i] for i in list(added_min)]
        minimum = added_val.index(min(added_val)) - start + 2
        print(added_min[minimum])
        print(added_val[minimum])
        xdpeaks.append(added_min[minimum])
        ydpeaks.append(added_val[minimum])
    print("maxs:", np.array(xpeaks) + start)
    print("mins:", np.array(xdpeaks) + start)
    plt.clf()
    plt.plot(x, scale, 'b', xpeaks, ypeaks, 'ro', xdpeaks, ydpeaks, 'go')
    plt.grid(True)
    plt.legend(['Scores for ' + key])
    plt.xlabel('Position')
    plt.ylabel('Score')
    plt.savefig('figs/' + key + '.png')
def properties(toxin_faa,antitoxin_faa,out):
    """Write pI, weight and instability of paired genes per locus.

    Both FASTA files are parsed; records whose position (from
    ``getNameAndPosition``) is falsy are skipped.  Sequences that still
    contain ``*`` or ``X`` after stripping terminal ``*`` get 0 for every
    property as a missing-data flag.  After ``orderPairs`` every locus with
    exactly two genes becomes one tab-separated line of
    ``<out>.properties.txt``.

    Uses ``dict.items()`` and the default read mode so the function also
    runs on Python 3 (``iteritems`` and ``'rU'`` are Python-2-only).

    Returns:
        str: path of the file written.
    """
    # Build a dictionary of {locus:[{properties:values},{properties:values}]}
    from collections import defaultdict
    from Bio import SeqIO
    loci = defaultdict(list)
    for f in [toxin_faa,antitoxin_faa]:
        # Parse FASTA files
        with open(f) as handle:
            for record in SeqIO.parse(handle,'fasta'):
                locus,start = getNameAndPosition(record)
                if not start:
                    continue
                aaseq = str(record.seq).strip("*")
                # Omit sequences with missing positions or premature stops
                # give them 0 as flag for missing data instead
                if "*" not in aaseq and "X" not in aaseq:
                    data = ProteinAnalysis(aaseq)
                    entry = {
                        'start': start,
                        'pI': data.isoelectric_point(),
                        'weight': data.molecular_weight(),
                        'instability': data.instability_index()
                    }
                else:
                    entry = {'start': start, 'pI': 0, 'weight': 0,
                             'instability': 0}
                loci[locus].append(entry)
    # Order genes in a locus positionally
    loci = orderPairs(loci)
    # Write to output file
    outfile = ".".join([out,"properties","txt"])
    with open(outfile,'w') as o:
        header = "\t".join(["locus",
                            "gene1_pI","gene2_pI",
                            "gene1_weight","gene2_weight",
                            "gene1_instability","gene2_instability"])
        o.write("#"+ header.upper() + "\n")
        for locus, gene in loci.items():
            if len(gene) != 2:
                continue
            fields = [locus,gene[0]['pI'],gene[1]['pI'],
                      gene[0]['weight'],gene[1]['weight'],
                      gene[0]['instability'],gene[1]['instability']]
            o.write("\t".join(map(str, fields))+"\n")
    return outfile
def _numbered_sequence_div(sequence):
    """Build the monospace DIV showing ``sequence`` in numbered rows."""
    seq_div = DIV(_style='font-family:monospace', _class='raw-sequence')
    spacer = len(str(len(sequence))) + 1

    def row_start(number):
        # XML() emits raw markup, where runs of ordinary spaces collapse,
        # so the alignment padding has to be non-breaking spaces.
        return XML((str(number) + ' ').rjust(spacer).replace(' ', '&nbsp;'))

    for i, pos in enumerate(sequence):
        if i == 0:
            seq_div.append(row_start(i + 1))
        if i % 10 == 0 and i != 0:
            # gap between blocks of ten residues
            seq_div.append(' ')
        if i % 60 == 0 and i != 0:
            # close the row with its last position, then start a new row
            seq_div.append(XML(str(i).ljust(spacer).replace(' ', '&nbsp;')))
            seq_div.append(BR())
            seq_div.append(row_start(i + 1))
        seq_div.append(SPAN(pos, _class='seq-position', _title=i + 1))
    return seq_div


def draw_sequence(sequence, mode = 'simple', alphabet = None):
    """Render ``sequence`` as HTML helpers.

    Args:
        sequence: the sequence to display, one SPAN per position.
        mode: ``'simple'`` returns only the numbered sequence DIV;
            ``'protparams'`` returns a DIV holding that sequence DIV plus a
            "Protein Parameters" table (length, MW, pI).  Any other value
            returns ``None``.
        alphabet: accepted for compatibility; not used.
    """
    if mode == 'protparams':
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        returndiv = DIV()
        returndiv.append(_numbered_sequence_div(sequence))
        returndiv.append(H3('Protein Parameters'))
        params_table = TABLE(_style= "width:200px;")
        protpar = ProteinAnalysis(sequence)
        params_table.append(TR(SPAN('Length:',_class = 'line-header'), '%i aa'%len(sequence)))
        # Rows whose computation raises KeyError are simply left out.
        try:
            params_table.append(TR(SPAN('MW:',_class = 'line-header'), '%i KDa'%round(protpar.molecular_weight()/1000,0)))
        except KeyError:
            pass
        try:
            params_table.append(TR(SPAN('pI:',_class = 'line-header'), '%1.2f'%protpar.isoelectric_point()))
        except KeyError:
            pass
        returndiv.append(params_table)
        return returndiv
    if mode == 'simple':
        return _numbered_sequence_div(sequence)
def main():
    """Write the isoelectric point of every read in the input FASTA file.

    The output is a tab-separated file without header, one row per record
    (record id, isoelectric point).  Input and output paths are hard-coded.
    """
    inputfile = "/isi/olga/xin/Halophile_project/output/20160421/SS37_aa.faa"
    outputfile = "/isi/olga/xin/Halophile_project/output/20160421/SS37_reads_isp.txt"
    seqid = []
    ieps = []
    # Context manager closes the input (it was never closed); plain read
    # mode replaces 'rU', which newer Python 3 releases reject.
    with open(inputfile) as f:
        for record in SeqIO.parse(f, "fasta"):
            seqid.append(record.id)
            ieps.append(ProteinAnalysis(str(record.seq)).isoelectric_point())
    read_ieps = np.column_stack((seqid, ieps))
    df = pd.DataFrame(read_ieps)
    df.to_csv(outputfile, sep = '\t', header = False)
def getMW_average(subSeq):
    """Return the average molecular weight of peptide ``subSeq``.

    Sums the average residue masses of the 20 standard amino acids weighted
    by their counts and adds one water (18.015).  Residues outside the
    table are ignored.
    """
    water = 18.015
    # AVERAGE MW FOR EACH AMINO ACID (residue masses)
    residue_mass = {
        'A': 71.0788, 'R': 156.1875, 'N': 114.1038, 'D': 115.0886,
        'C': 103.1388, 'Q': 128.1307, 'E': 129.1155, 'G': 57.0519,
        'H': 137.1411, 'I': 113.1594, 'L': 113.1594, 'K': 128.1741,
        'M': 131.1926, 'F': 147.1766, 'P': 97.1167, 'S': 87.0782,
        'T': 101.1051, 'W': 186.2132, 'Y': 163.1760, 'V': 99.1326,
    }
    counts = ProteinAnalysis(subSeq).count_amino_acids()
    total = 0.0
    for residue, n in counts.items():
        if residue in residue_mass:
            total = total + (n * residue_mass[residue])
    return total + water
def getMW_mono(subSeq):
    """Return the monoisotopic molecular weight of peptide ``subSeq``.

    Sums the monoisotopic residue masses of the 20 standard amino acids
    weighted by their counts and adds one water.  Residues outside the
    table are ignored.

    The water term is the monoisotopic mass of H2O (18.01056); the previous
    value 18.015 is the average mass and was inconsistent with the
    monoisotopic residue table.
    """
    water_mono = 18.01056
    # MONOISOTOPIC MW FOR EACH AMINO ACID (residue masses)
    residue_mass = {
        'A': 71.03711, 'R': 156.10111, 'N': 114.04293, 'D': 115.02694,
        'C': 103.00919, 'Q': 128.05858, 'E': 129.04259, 'G': 57.02146,
        'H': 137.05891, 'I': 113.08406, 'L': 113.08406, 'K': 128.09496,
        'M': 131.04049, 'F': 147.06841, 'P': 97.05276, 'S': 87.03203,
        'T': 101.04768, 'W': 186.07931, 'Y': 163.06333, 'V': 99.06841,
    }
    counts = ProteinAnalysis(subSeq).count_amino_acids()
    molecularWeight = 0.0
    for residue, n in counts.items():
        if residue in residue_mass:
            molecularWeight += n * residue_mass[residue]
    return molecularWeight + water_mono
def protParam(seq):
    """Print a ProtParam-style report for ``seq``.

    Prints amino-acid counts and percentages, molecular weight, gravy,
    isoelectric point, aromaticity and the extinction coefficient computed
    as W*5690 + Y*1280 + C*120.  Returns nothing.
    """
    analysis = ProteinAnalysis(seq)
    mw = analysis.molecular_weight()
    counts = analysis.count_amino_acids()
    percents = analysis.get_amino_acids_percent()
    gravy = analysis.gravy()
    aromaticity = analysis.aromaticity()
    isoelectric_point = analysis.isoelectric_point()
    ext_coeff = counts["W"]*5690 + counts["Y"]*1280 + counts["C"]*120
    # Extinction coefficient per unit mass; computed but not reported.
    mgml = ext_coeff * (1./mw)

    print("Amino acid count")
    pprint.pprint(counts)
    print("Amino acid percent")
    pprint.pprint(percents)
    for title, value in (("Molecular weight", "%f Da"%mw),
                         ("Gravy", gravy),
                         ("Isoelectric point", isoelectric_point),
                         ("Aromaticity", aromaticity)):
        print(title)
        print(value)
    print("Extinction coefficient: %d M-1cm-1 (Assuming reduced)"%ext_coeff)
    print("")
def get_protein_analysis(aa):
    """Return a flat feature list describing the protein sequence ``aa``.

    The list holds, in order: molecular weight, aromaticity, instability
    index, isoelectric point, GRAVY, followed by every value returned by
    ``secondary_structure_fraction()``.
    """
    analysis = ProteinAnalysis(aa)
    features = [
        analysis.molecular_weight(),
        analysis.aromaticity(),
        analysis.instability_index(),
        analysis.isoelectric_point(),
        analysis.gravy(),
    ]
    features.extend(analysis.secondary_structure_fraction())
    return features
def analyzeAMP(self):
    """Compute and return the summary dictionary for this peptide.

    Runs ``netcharge()`` and ``hphobFract()`` on the instance, analyses
    ``self.seq`` with ``ProteinAnalysis`` (kept in ``self.pepParam``) and
    stores the result in ``self.data`` with the keys ``charge``, ``length``,
    ``hydrophobic`` and ``aminoacids``.
    """
    from Bio.SeqUtils.ProtParam import ProteinAnalysis

    # Both calls populate the attributes (self.net, self.hpf) read below.
    self.netcharge()
    self.hphobFract()

    self.pepParam = ProteinAnalysis(self.seq)
    summary = {
        'charge': self.net,
        'length': self.length,
        'hydrophobic': self.hpf,
        'aminoacids': self.pepParam.get_amino_acids_percent(),
    }
    self.data = summary
    return summary
def protein_analysis():
    """Controller action: show the sequence form and analyse a submission.

    Visitors without ``session.username`` are redirected to the log-in
    action of the ``account`` controller.  When the form is accepted, the
    cleaned, upper-cased sequence and the ProteinAnalysis results are stored
    in the session and the request is redirected to
    ``protein_analysis_output``.

    :return: ``dict(form=form)`` for rendering when nothing was submitted.
    """
    if session.username is None:
        # Address the other controller explicitly instead of smuggling a
        # relative path through the function name.
        redirect(URL(r=request, c='account', f='log_in'))

    from Bio.SeqUtils.ProtParam import ProteinAnalysis

    form = FORM(
        TABLE(
            TR("Amino acid sequence:  ",
               TEXTAREA(_type="text", _name="sequence",
                        requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))

    if form.accepts(request.vars, session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
def main():
    """Ask for a protein FASTA file and report its parameters.

    Reads a file name from stdin, loads the sequence with ``read_fasta``,
    prints the sequence, its molecular weight, amino acid counts and
    isoelectric point, and writes the same three values (one per line) to
    ``Valgu_parameetrid.txt``.
    """
    # program that asks for a protein FASTA file and reports its parameters
    fasta = input()
    sequence = read_fasta(fasta)
    print(sequence)
    analysed_seq = ProteinAnalysis(str(sequence))

    # Compute each value once and reuse it for both the console and the file.
    molecular_weight = analysed_seq.molecular_weight()
    print("\n", "Molekulaarmass:", molecular_weight)
    amino_acid_counts = analysed_seq.count_amino_acids()
    print("\n", "Aminohapete arv:", amino_acid_counts)
    isoelectric_point = analysed_seq.isoelectric_point()
    print("\n", "Isoelektriline punkt:", isoelectric_point)

    # The context manager closes the file even if a write fails.
    with open("Valgu_parameetrid.txt", "w") as text_file:
        text_file.write(str(molecular_weight))
        text_file.write("\n")
        text_file.write(str(amino_acid_counts))
        text_file.write("\n")
        text_file.write(str(isoelectric_point))
def __init__(self, sequence):
    """Compute and store a set of descriptors for ``sequence``.

    Values come from three places visible here: a ``ProteinAnalysis`` of the
    sequence, ``calculate_*`` helpers defined elsewhere, and expressions
    evaluated through ``r(...)``.

    :param sequence: amino-acid sequence; it is passed to ``len()``,
        ``ProteinAnalysis`` and interpolated into an R string literal.
    """
    self.sequence = sequence
    self.sequence_length = len(sequence)
    analysis = ProteinAnalysis(sequence)
    self.amino_acid_percents = analysis.get_amino_acids_percent()
    self.amino_acids_composition = calculate_amino_acids_composition(sequence)
    self.aromaticity = analysis.aromaticity()
    self.instability = analysis.instability_index()
    self.flexibility = calculate_flexibility(sequence)
    # Each entry pairs a display name with a per-residue scale dictionary
    # (hw, em, ja, ... are defined outside this block).
    protein_scale_parameters = [{'name': 'Hydrophilicity', 'dictionary': hw},
                                {'name': 'Surface accessibility', 'dictionary': em},
                                {'name': 'Janin Interior to surface transfer energy scale', 'dictionary': ja},
                                {'name': 'Bulkiness', 'dictionary': bulkiness},
                                {'name': 'Polarity', 'dictionary': polarity},
                                {'name': 'Buried residues', 'dictionary': buried_residues},
                                {'name': 'Average area buried', 'dictionary': average_area_buried},
                                {'name': 'Retention time', 'dictionary': retention_time}]
    self.protein_scales = calculate_protein_scales(analysis, protein_scale_parameters)
    self.isoelectric_point = analysis.isoelectric_point()
    self.secondary_structure_fraction = calculate_secondary_structure_fraction(analysis)
    self.molecular_weight = analysis.molecular_weight()
    self.kyte_plot = analysis.gravy()
    self.pefing = calculate_pefing(sequence)
    # next parameters are calculated using R.Peptides
    # (presumably `r` is an rpy2-style evaluator - TODO confirm)
    r('require(Peptides)')
    # Defines the R variable `sequence` that the following r(...) calls read;
    # the statement order therefore matters.  The value is interpolated
    # unescaped, so a double quote in `sequence` would break the R expression.
    r('sequence = "{0}"'.format(sequence))
    self.aliphatic_index = r('aindex(sequence)')[0]
    self.boman_index = r('boman(sequence)')[0]
    self.charges = calculate_charges(sequence, 1.0, 14.0, 0.5, 'Lehninger')
    # NOTE(review): this evaluates R's `seq(...)`, not a function named after
    # hydrophobicity - confirm this is the intended call.
    self.hydrophobicity = r('seq(sequence)')[0]
    # Named angles handed to the moment and peptide-type helpers below.
    angles = [{'name': 'Alpha-helix', 'angle': -47},
              {'name': '3-10-helix', 'angle': -26},
              {'name': 'Pi-helix', 'angle': -80},
              {'name': 'Omega', 'angle': 180},
              {'name': 'Antiparallel beta-sheet', 'angle': 135},
              {'name': 'Parallel beta-sheet', 'angle': 113}]
    # The extra angle is only considered when the P and G values sum above 0.3.
    if self.amino_acid_percents['P'] + self.amino_acid_percents['G'] > 0.3:
        angles.append({'name': 'Polygly-polypro helix', 'angle': 153})
    self.hydrophobic_moments = calculate_hydrophobic_moments(sequence, angles)
    self.kidera_factors = calculate_kidera_factors(sequence)
    self.peptide_types = calculate_peptide_types(sequence, angles)
W = {'C':11,'H':12,'N':2, 'O':2, 'S':0} Y = {'C':9, 'H':11,'N':1, 'O':3, 'S':0} V = {'C':5, 'H':11,'N':1, 'O':2, 'S':0} dictOfAmino = {'A':A,'R':R,'N':N,'D':D,'C':C,'Q':Q, 'E':E, 'G':G,'H':H,'I':I,'L':L,'K':K,'M':M,'F':F,'P':P,'S':S,'T':T,'W':W,'Y':Y,'V':V} print "Note output file is appended if same file is selected twice molecular formulas \n for both runs will be present in output file" fileName = raw_input("Protein FASTA file to generate molecular formulas for: ") outFileName = raw_input("Output file name (include .txt): ") fasta_file = open(fileName, "rU") for record in SeqIO.parse(fasta_file, "fasta"): myseq = str(record.seq) analysis = ProteinAnalysis(myseq) listofaminoacids.append(analysis.count_amino_acids()) for i in listofaminoacids: carbonTotal = 0 hydrogenTotal = 0 oxygenTotal = 0 nitrogenTotal = 0 sulfurTotal = 0 peptideBonds = 0 for value in i: for amino in dictOfAmino:
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
from Bio import SeqIO

# Print every ProteinAnalysis descriptor for each record of the sample file.
with open('../../samples/pdbaa') as fh:
    for rec in SeqIO.parse(fh, 'fasta'):
        myprot = ProteinAnalysis(str(rec.seq))
        # Zero-argument reports, evaluated and printed one after another in
        # this order.
        reports = (
            myprot.count_amino_acids,
            myprot.get_amino_acids_percent,
            myprot.molecular_weight,
            myprot.aromaticity,
            myprot.instability_index,
            myprot.flexibility,
            myprot.isoelectric_point,
            myprot.secondary_structure_fraction,
        )
        for report in reports:
            print(report())
        # Scale profile over ProtParamData.kd with the arguments 9 and .4.
        print(myprot.protein_scale(ProtParamData.kd, 9, .4))
def main(argv):
    """Add an ``intensity`` annotation to every entry of a FASTA file.

    Entries are read with ``nextEntry``; an existing ``[# ... #]`` block in
    the header is rewritten so that it contains exactly one ``intensity=``
    element drawn from ``sampleAbundance(mu, sigma)``.  When BioPython can be
    imported, entries can additionally be filtered by molecular weight.
    Optionally only ``--sample`` entries are kept (the first ones, or a
    random subset with ``--random``).

    :param argv: accepted for the caller's convenience; the command line is
        parsed by ``argparse`` from ``sys.argv``.
    """
    ## we use ArgumentParser, which requires 2.7
    if sys.version_info < (2, 7):
        ## a plain string cannot be raised - use a real exception
        raise RuntimeError("This script requires python 2.7 or greater")

    ## add weight filtering functionality if BioPython is available
    try:
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        has_biopython = True
    except ImportError:
        has_biopython = False

    parser = argparse.ArgumentParser(description='Add abundance to FASTA files.')
    parser.add_argument('infile', type=argparse.FileType('r'),
                        help='Input FASTA file')
    parser.add_argument('outfile', type=argparse.FileType('w'),
                        help='Output FASTA file')
    parser.add_argument('--mu', dest='mu', action='store', default=3,
                        help='mean of gaussian in log space')
    parser.add_argument('--sigma', dest='sigma', action='store', default=1,
                        help='sd of gaussian in log space')
    parser.add_argument('--sample', dest='sample', action='store', default=0,
                        help='Number of entries to keep (for sampling a bigger FASTA file)')
    parser.add_argument('--random', dest='random', action='store_true',
                        help='Randomly shuffle entries before sampling (only if --sample is given). If not given, the first \'X\' samples are used.')
    if has_biopython:
        parser.add_argument('--weight_low', dest='weight_low', action='store', default=0,
                            help='minimum molecular weight of protein')
        parser.add_argument('--weight_up', dest='weight_up', action='store', default=0,
                            help='Maximum molecular weight of protein (use 0 for unlimited)')
    else:
        print("Warning: protein weight filtering not supported, as BioPython module is not installed.")

    ## argument parsing
    args = parser.parse_args()
    fileobj = args.infile
    fileoutobj = args.outfile
    sample_size = int(args.sample)
    sample_random = bool(args.random)
    mu = float(args.mu)
    sigma = float(args.sigma)
    if has_biopython:
        weight_low = float(args.weight_low)
        weight_up = float(args.weight_up)
        if weight_up <= 0:
            weight_up = sys.float_info.max

    ## meta information block "[# ... #]" inside a header
    rep = re.compile(r"\[# *(.*) *#\]")
    ## BioPython does not like some AA letters - they need replacement:
    ##   "U" (Selenocystein) -> "C" (Cystein)
    ##   "X" (unknown) -> "P" (Proline) [arbitrary choice - but weight of 115 is very close to averagine]
    ##   "B" (Asparagine or aspartic acid) -> "N" (Asparagine)
    ##   "Z" (Glutamine or glutamic acid) -> "Q" (Glutamine)
    ##   "J" (Leucine or Isoleucine) -> "L" (Leucine)
    replacements = (("U", "C"), ("X", "P"), ("B", "N"), ("Z", "Q"), ("J", "L"))

    ## list of final entries
    fasta_entries = []
    for entry in nextEntry(fileobj):
        header = entry.header
        ## check if it contains 'intensity'?
        m = rep.search(header)
        other = []
        if m:
            header_new = header.replace(m.group(0), "")  ## delete meta
            for element in m.group(1).split(','):
                if element.find("intensity") == -1:
                    other.append(element)
        else:
            header_new = header  ## nothing to replace

        ## create new metainfo array
        other.append("intensity=" + str(sampleAbundance(mu, sigma)))
        entry.header = header_new.rstrip() + "[# " + (", ").join(other) + " #]"

        if has_biopython:
            sequence = "".join(entry.sequence.split("\n"))
            for ambiguous, standard in replacements:
                sequence = sequence.replace(ambiguous, standard)
            weight = ProteinAnalysis(sequence).molecular_weight()
            if not (weight_low <= weight <= weight_up):
                continue

        fasta_entries.append(entry.header + "\n" + entry.sequence)
        ## only read to sample size (the rest is thrown away anyways)
        if sample_size > 0 and not sample_random:
            if len(fasta_entries) >= sample_size:
                break

    ## select subset (if required)
    if sample_size > 0:
        ## a real list, so that it can be shuffled in place on Python 3 too
        indices = list(range(0, len(fasta_entries)))
        ## random sampling only makes sense if we take a subset
        if sample_random and sample_size < len(fasta_entries):
            random.shuffle(indices)
            indices = indices[:sample_size]
        fasta_entries = [fasta_entries[i] for i in indices]

    ## write to file
    for entry in fasta_entries:
        fileoutobj.write(entry)
protein_name = get_protein_name(line) protein_names_and_segments[protein_name] = get_segments(line) protein_names_and_sequences[protein_name] = '' else: sequence = protein_names_and_sequences.get(protein_name) sequence += line.strip('\n' and '\r' and '\r\n') protein_names_and_sequences[protein_name] = sequence for key in protein_names_and_segments.keys(): for segment in protein_names_and_segments.get(key): segment_sequence = protein_names_and_sequences.get(key)[segment[0] - 1:segment[1]] x += segment_sequence y = ProteinAnalysis(str(x)) z = y.get_amino_acids_percent() # visual for command line print 'parsing ' + FILE_INPUT + '\n' # build the output file as CSV with open('percent_AA_per_seg_OUTPUT.csv', 'wb') as f: w = csv.writer(f) w.writerows(z.items()) # opens the ouput file file = '/Users/simonkeng/senior-research-project/percent_AA_per_seg_OUTPUT.csv' open_file(file)
class amp:
    "stores all data of peptide"

    def __init__(self, readed):
        # readed is indexed as [name, sequence]
        self.seq = readed[1]
        self.length = len(readed[1])
        self.name = readed[0]

    def netcharge(self):
        """Store the net charge (K/R/H count as +1, D/E as -1) in self.net.

        self.posRe receives the number of positive residues.
        """
        # i don't think biopython calculates net charge
        self.pos = 'KRH'
        self.neg = 'DE'
        self.net = 0
        self.posRe = 0
        for residue in self.seq:
            if residue in self.pos:
                self.net += 1
                self.posRe += 1  # need it for searching
            if residue in self.neg:
                self.net -= 1

    def hphobFract(self):
        """Store the hydrophobic residue count (self.hpn) and fraction (self.hpf)."""
        # i don't know if biopython calculates just fraction of hphobs
        hph = 'ACFGILMPV'
        self.hpn = float(sum(1 for residue in self.seq if residue in hph))
        self.hpf = self.hpn / self.length

    def analyzeAMP(self):
        """Return (and keep in self.data) charge, length, hydrophobic fraction
        and amino acid composition of the peptide."""
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        self.netcharge()
        self.hphobFract()
        self.pepParam = ProteinAnalysis(self.seq)
        self.data = {'charge': self.net,
                     'length': self.length,
                     'hydrophobic': self.hpf,
                     'aminoacids': self.pepParam.get_amino_acids_percent()}
        return self.data

    def detectAMP(self):
        """Scan the sequence for stretches that look like an AMP.

        Thresholds are read from the ``[Parameters]`` section of
        ``config.ini``.  Sequences longer than ``2 * baseWind`` are scanned
        with a sliding window of ``baseWind`` residues; every residue of a
        window that passes the charge / hydrophobicity / composition test
        gets one vote in ``self.result``.  Shorter sequences are tested as a
        whole.  Votes above ``thresh`` form the 0/1 mask ``self.thrRes``;
        runs of 1 longer than ``minLen`` are kept in ``self.matches``.

        :return: ``'found peptide of length N'`` for the first kept run,
            otherwise ``'nothing found'``.
        """
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        import re
        import ConfigParser
        import numpy as np

        parser = ConfigParser.SafeConfigParser()
        parser.read('config.ini')
        # floating window and search for values
        lowNet = parser.getfloat('Parameters', 'lowNet')  # 0
        midNet = parser.getfloat('Parameters', 'midNet')  # 2
        highNet = parser.getfloat('Parameters', 'highNet')  # 6
        lowHpf = parser.getfloat('Parameters', 'lowHpf')  # 0.5
        highHpf = parser.getfloat('Parameters', 'highHpf')  # 0.9
        lowCompCoeff = parser.getfloat('Parameters', 'lowCompCoeff')  # 0.85
        highCompCoeff = parser.getfloat('Parameters', 'highCompCoeff')  # 1.5
        baseWind = parser.getint('Parameters', 'baseWind')  # 15
        # maxWind = parser.getfloat('Parameters','maxWind')#100
        thresh = parser.getint('Parameters', 'thresh')  # 6
        minLen = parser.getint('Parameters', 'minLen')  # 10

        tracked = ['C', 'R', 'W', 'H', 'K', 'D', 'E']
        #              C     R     W      H     K     D     E
        baseCompose = [0.01, 0.06, 0.005, 0.02, 0.06, 0.05, 0.07]
        ampCompose = [0.06, 0.09, 0.01, 0.02, 0.1, 0.02, 0.03]
        changes = [target / base for base, target in zip(baseCompose, ampCompose)]
        # Mean enrichment of the first five tracked residues (C, R, W, H, K).
        upAvg = np.average(changes[:-2])

        def profile(pep):
            # Store the composition statistics of `pep` on self, exactly as
            # both branches below expect them.
            self.pepParam = ProteinAnalysis(pep.seq)
            self.aaPerc = self.pepParam.get_amino_acids_percent()
            self.subPepComp = [self.aaPerc[aminame] for aminame in tracked]
            self.subPepChanges = [observed / base for base, observed
                                  in zip(baseCompose, self.subPepComp)]
            self.upSubAvg = np.average(self.subPepChanges[:-2])
            self.downSubAvg = np.average(self.subPepChanges[-2:])

        self.result = [0 for _ in self.seq]
        if self.length > baseWind * 2:
            # NOTE(review): range() stops one short of the last full window
            # (start == length - baseWind) - confirm whether that is intended.
            for start in range(self.length - baseWind):
                self.subPep = amp(['subPep', self.seq[start:start + baseWind]])
                self.subPep.netcharge()
                self.subPep.hphobFract()
                profile(self.subPep)
                charged_or_hydrophobic = (
                    (lowNet < self.subPep.net < highNet and self.subPep.hpf > lowHpf)
                    or midNet < self.subPep.net
                    or self.subPep.hpf > highHpf)
                if (charged_or_hydrophobic and self.upSubAvg > lowCompCoeff * upAvg) \
                        or self.upSubAvg > highCompCoeff * upAvg:
                    for position in range(start, start + baseWind):
                        self.result[position] += 1
        else:
            self.subPep = self
            self.subPep.netcharge()
            self.subPep.hphobFract()
            profile(self.subPep)
            # NOTE(review): this condition compares hpf with lowCompCoeff*upAvg
            # and is grouped differently from the windowed branch - kept as is.
            if ((lowNet < self.subPep.net < highNet and self.subPep.hpf > lowHpf)
                    or midNet < self.subPep.net
                    or self.subPep.hpf > lowCompCoeff * upAvg) \
                    and self.upSubAvg > highCompCoeff * upAvg:
                self.result = [votes + 1 for votes in self.result]

        self.thrRes = [1 if votes > thresh else 0 for votes in self.result]
        self.strRes = ''.join([str(flag) for flag in self.thrRes])
        # '0+' instead of '0*': a pattern that can match the empty string
        # splits between every character on Python 3.7+, which would leave
        # only single-character "runs".
        self.matches = re.split('0+', self.strRes)
        self.matches = [match for match in self.matches if len(match) > minLen]
        if len(self.matches) > 0:
            return 'found peptide of length ' + str(len(self.matches[0]))
        else:
            return 'nothing found'

    def plotPred(self):
        """Plot the 0/1 prediction mask to ``testy.pdf``, running detectAMP()
        first when no result is available yet."""
        import matplotlib.pylab as pl
        try:
            self.result[0]
        except (AttributeError, IndexError):
            # no prediction yet (or an empty one): compute it now
            self.detectAMP()
        pl.plot(self.thrRes, '.-')
        pl.savefig('testy.pdf')
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
import sys
import json

# Command line helper: the first argument is a JSON object with the keys
# "Sequence" and "Options"; the requested values are printed as one JSON
# object on stdout.
inp = json.loads(sys.argv[1])
seq = inp["Sequence"]
options = inp["Options"]
X = ProteinAnalysis(seq)
data = dict()
if "MW" in options:
    data["MW"] = X.molecular_weight()
if "EC280" in options:
    aa_count = X.count_amino_acids()
    data["EC280"] = 1490 * aa_count["Y"] + 5500 * aa_count["W"]
    if "hasDisulfide" in options:
        # 62.5 per cysteine is added only when disulfides were requested.
        data["EC280"] = 1490 * aa_count["Y"] + 5500 * aa_count["W"] + 62.5 * aa_count["C"]
if "PI" in options:
    data["PI"] = X.isoelectric_point()
if "AACont" in options:
    ratios = X.get_amino_acids_percent()
    data["AACont"] = {aa: ratios[aa] * 100. for aa in ratios}
# Call form works on Python 2 and 3 and prints the same text.
print(json.dumps(data))
text_out = QtGui.QTextEdit('Ribosomal Protein CSV : format (Protein Description,Mwt,pI)') # text out widget data_mwt = [] y_axis = [] x_axis = data_mwt for record in SeqIO.parse(seq_file, "fasta"): #for record in SeqIO.parse(seq_file, "fasta"): temp_seq=str(record.seq) analysis_seq=ProteinAnalysis(temp_seq) if ("ribosomal protein" in record.description or "ribosomal subunit" in record.description): #if ("ribosomal protein" in record.description or "ribosomal subunit" in record.description or "Ribosomal" in record.description): if (analysis_seq.molecular_weight() < 20000): data_mwt.append('%.2f'%(analysis_seq.molecular_weight())) y_axis.append(1) text_out.setTextColor(QColor('blue')) text_out.append(str(len(data_mwt)) + "," + record.description + "," + '%.2f'%(analysis_seq.molecular_weight()) + "," + '%.2f'%(analysis_seq.isoelectric_point())) #new=sorted(data_mwt) #data_mwt.append(list(zip(['%.2f'%(analysis_seq.molecular_weight())])))
class Peptide(PolyIon):
    """Peptide represents single protein chains in solution.

    Peptides properties are based entirely on analysis of the sequence of the
    peptide.
    """

    _state = {'name': 'Name of the peptide.',
              'sequence': 'Amino acid sequence of the peptide.'
              }

    _sequence = None
    _analysis = None

    # TODO: move h to function or constants. Unify with pitts?
    _h_max = 1
    _h_min = 2./3.
    _h = 5./6.  # factor applied to the mobility computed in mobility()

    def __init__(self, name=None, sequence=None):
        self._name = name
        self._sequence = sequence
        # self.sequence is not defined in this class; presumably the base
        # class exposes _sequence under that name - TODO confirm.
        self._analysis = ProteinAnalysis(str(self.sequence))

    @property
    def molecular_weight(self):
        """Molecular weight as computed by ``SeqUtils.molecular_weight``."""
        return SeqUtils.molecular_weight(self.sequence, 'protein')

    def charge(self, pH=None, ionic_strength=None, temperature=None,
               moment=1):
        """Return the time-averaged charge of the peptide.

        :param pH
        :param ionic_strength
        :param temperature
        :param moment: power to which the charge is raised before returning.
        """
        pH, ionic_strength, temperature = \
            self._resolve_context(pH, ionic_strength, temperature)

        amino_acid_count = self._analysis.count_amino_acids()

        # Copy the shared pK tables before adding terminus specific entries.
        pos_pKs = dict(positive_pKs)
        neg_pKs = dict(negative_pKs)

        nterm = self.sequence[0]
        cterm = self.sequence[-1]

        if nterm in pKnterminal:
            pos_pKs['Nterm'] = pKnterminal[nterm]
        if cterm in pKcterminal:
            neg_pKs['Cterm'] = pKcterminal[cterm]

        charge = IsoelectricPoint(self.sequence,
                                  amino_acid_count)._chargeR(pH,
                                                             pos_pKs,
                                                             neg_pKs)
        return charge**moment

    def isoelectric_point(self, ionic_strength=None, temperature=None):
        """Return the isoelectric point of the peptide.

        ``ionic_strength`` and ``temperature`` are accepted but not used.
        """
        return self._analysis.isoelectric_point()

    def volume(self):
        """Return the approximate volume of the folded peptide in m^3."""
        v = self.molecular_weight / avogadro / self.density() / lpm3 / gpkg
        return v

    def radius(self):
        """Return the approximate radius of the folded peptide in m."""
        return (self.volume() * 3. / 4. / pi) ** (1. / 3.)

    def density(self):
        """Return the approximate density of the folded peptide in kg/L."""
        # The exponential decay constant (13) of this fit is expressed in
        # kDa, while molecular_weight is in Da (g/mol, as volume() relies
        # on); convert before applying it.
        molecular_weight_kda = self.molecular_weight / 1000.
        return 1.410 + 0.145 * exp(-molecular_weight_kda / 13.)

    def mobility(self, pH=None, ionic_strength=None, temperature=None):
        """Return the effective mobility of the ion in m^2/V/s.

        If a context solution is available, mobility uses the full Onsager-Fuoss
        correction to mobility. Otherwise, the Robinson-Stokes model is used.

        :param pH
        :param ionic_strength
        :param temperature
        """
        pH, ionic_strength, temperature = \
            self._resolve_context(pH, ionic_strength, temperature)

        mobility = self.charge(pH) * elementary_charge /\
            (6 * pi * self._solvent.viscosity(temperature) * self.radius() *
             (1 + self.radius() /
              self._solvent.debye(ionic_strength, temperature)
              )
             ) * self._h
        return mobility
def __init__(self, name=None, sequence=None):
    """Store name and sequence and analyse the sequence once.

    :param name: stored in ``self._name``.
    :param sequence: stored in ``self._sequence``.
    """
    self._name, self._sequence = name, sequence
    # self.sequence is resolved outside this block (presumably a view on
    # _sequence - TODO confirm); it is stringified for ProteinAnalysis.
    sequence_text = str(self.sequence)
    self._analysis = ProteinAnalysis(sequence_text)
#!/usr/bin/env python
import sys
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Reads FASTA records from stdin and writes one tab separated line of
# ProteinAnalysis properties per record to stdout.
# The seventh column holds a.aromaticity(), so it is labelled accordingly
# (it used to be announced as "monoisotpoic").
sys.stdout.write("ID\tMW\tIP\tgravy\tlength\tinstability\taromaticity\tSequence\n")
for record in SeqIO.parse(sys.stdin, "fasta"):
    a = ProteinAnalysis(str(record.seq))
    properties = [
        record.id,
        a.molecular_weight(),
        a.isoelectric_point(),
        a.gravy(),
        a.length,
        a.instability_index(),
        a.aromaticity(),
        # always last column to make the output more readable
        a.sequence,
    ]
    sys.stdout.write('\t'.join(map(str, properties)) + "\n")
def detectAMP(self):
    """Scan the sequence for stretches that look like an AMP.

    Thresholds are read from the ``[Parameters]`` section of ``config.ini``.
    Sequences longer than ``2 * baseWind`` are scanned with a sliding window
    of ``baseWind`` residues; every residue of a window that passes the
    charge / hydrophobicity / composition test gets one vote in
    ``self.result``.  Shorter sequences are tested as a whole.  Votes above
    ``thresh`` form the 0/1 mask ``self.thrRes``; runs of 1 longer than
    ``minLen`` are kept in ``self.matches``.

    :return: ``'found peptide of length N'`` for the first kept run,
        otherwise ``'nothing found'``.
    """
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    import re
    import ConfigParser
    import numpy as np

    parser = ConfigParser.SafeConfigParser()
    parser.read('config.ini')
    # floating window and search for values
    lowNet = parser.getfloat('Parameters', 'lowNet')  # 0
    midNet = parser.getfloat('Parameters', 'midNet')  # 2
    highNet = parser.getfloat('Parameters', 'highNet')  # 6
    lowHpf = parser.getfloat('Parameters', 'lowHpf')  # 0.5
    highHpf = parser.getfloat('Parameters', 'highHpf')  # 0.9
    lowCompCoeff = parser.getfloat('Parameters', 'lowCompCoeff')  # 0.85
    highCompCoeff = parser.getfloat('Parameters', 'highCompCoeff')  # 1.5
    baseWind = parser.getint('Parameters', 'baseWind')  # 15
    # maxWind = parser.getfloat('Parameters','maxWind')#100
    thresh = parser.getint('Parameters', 'thresh')  # 6
    minLen = parser.getint('Parameters', 'minLen')  # 10

    tracked = ['C', 'R', 'W', 'H', 'K', 'D', 'E']
    #              C     R     W      H     K     D     E
    baseCompose = [0.01, 0.06, 0.005, 0.02, 0.06, 0.05, 0.07]
    ampCompose = [0.06, 0.09, 0.01, 0.02, 0.1, 0.02, 0.03]
    changes = [target / base for base, target in zip(baseCompose, ampCompose)]
    # Mean enrichment of the first five tracked residues (C, R, W, H, K).
    upAvg = np.average(changes[:-2])

    def profile(pep):
        # Store the composition statistics of `pep` on self, exactly as both
        # branches below expect them.
        self.pepParam = ProteinAnalysis(pep.seq)
        self.aaPerc = self.pepParam.get_amino_acids_percent()
        self.subPepComp = [self.aaPerc[aminame] for aminame in tracked]
        self.subPepChanges = [observed / base for base, observed
                              in zip(baseCompose, self.subPepComp)]
        self.upSubAvg = np.average(self.subPepChanges[:-2])
        self.downSubAvg = np.average(self.subPepChanges[-2:])

    self.result = [0 for _ in self.seq]
    if self.length > baseWind * 2:
        # NOTE(review): range() stops one short of the last full window
        # (start == length - baseWind) - confirm whether that is intended.
        for start in range(self.length - baseWind):
            self.subPep = amp(['subPep', self.seq[start:start + baseWind]])
            self.subPep.netcharge()
            self.subPep.hphobFract()
            profile(self.subPep)
            charged_or_hydrophobic = (
                (lowNet < self.subPep.net < highNet and self.subPep.hpf > lowHpf)
                or midNet < self.subPep.net
                or self.subPep.hpf > highHpf)
            if (charged_or_hydrophobic and self.upSubAvg > lowCompCoeff * upAvg) \
                    or self.upSubAvg > highCompCoeff * upAvg:
                for position in range(start, start + baseWind):
                    self.result[position] += 1
    else:
        self.subPep = self
        self.subPep.netcharge()
        self.subPep.hphobFract()
        profile(self.subPep)
        # NOTE(review): this condition compares hpf with lowCompCoeff*upAvg
        # and is grouped differently from the windowed branch - kept as is.
        if ((lowNet < self.subPep.net < highNet and self.subPep.hpf > lowHpf)
                or midNet < self.subPep.net
                or self.subPep.hpf > lowCompCoeff * upAvg) \
                and self.upSubAvg > highCompCoeff * upAvg:
            self.result = [votes + 1 for votes in self.result]

    self.thrRes = [1 if votes > thresh else 0 for votes in self.result]
    self.strRes = ''.join([str(flag) for flag in self.thrRes])
    # '0+' instead of '0*': a pattern that can match the empty string splits
    # between every character on Python 3.7+, which would leave only
    # single-character "runs".
    self.matches = re.split('0+', self.strRes)
    self.matches = [match for match in self.matches if len(match) > minLen]
    if len(self.matches) > 0:
        return 'found peptide of length ' + str(len(self.matches[0]))
    else:
        return 'nothing found'
def iso_e(protS):
    """Return the isoelectric point of the protein sequence string protS."""
    from Bio.SeqUtils.ProtParam import ProteinAnalysis

    return ProteinAnalysis(protS).isoelectric_point()
import collections
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import sys

# Print the amino acid counts of every record in the FASTA file given as the
# first argument, least frequent residue first (ties ordered by letter).
for rec in SeqIO.parse(sys.argv[1], "fasta"):
    x = ProteinAnalysis(str(rec.seq))
    # The counts live in the dictionary returned by count_amino_acids();
    # the analysis object itself cannot be iterated.
    counts = x.count_amino_acids()
    for key, val in sorted(counts.items(), key=lambda item: (item[1], item[0])):
        # NOTE(review): the record id is assumed to be the intended first
        # field of the three-slot format - confirm.
        print("%s %s:%s" % (rec.id, key, val))
def main(databasePassword, schemaProteins, tableProteinInfo, tableStability):
    """Recompute the stability table for every protein in the database.

    For each (UPAccession, Sequence) row of ``tableProteinInfo`` the
    N-terminal half life and the instability index are determined; -1 marks
    a value that could not be determined.  ``tableStability`` is truncated
    and refilled with the (accession, half life, instability index) tuples.

    :param databasePassword: passed to ``mysql.openConnection``.
    :param schemaProteins: schema passed to ``mysql.openConnection``.
    :param tableProteinInfo: table the accessions and sequences are read from.
    :param tableStability: table that is truncated and refilled; the name is
        concatenated into the SQL statement, so it must be trusted.
    """
    # Define N-terminus half life values (explanation http://en.wikipedia.org/wiki/N-end_rule and the ProtParam tool).
    halfLife = {'A': 4.4, 'C': 1.2, 'D': 1.1, 'E': 1.0, 'F': 1.1, 'G': 30.0, 'H': 3.5, 'I': 20.0, 'K': 1.3, 'L': 5.5,
                'M': 30.0, 'N': 1.4, 'P': 20.0, 'Q': 0.8, 'R': 1.0, 'S': 1.9, 'T': 7.2, 'V': 100.0, 'W': 2.8, 'Y': 2.8}

    # Extract all the sequences stored in the database.
    conn, cursor = mysql.openConnection(databasePassword, schemaProteins)
    cursor = mysql.tableSELECT(cursor, 'UPAccession, Sequence', tableProteinInfo)
    results = cursor.fetchall()

    # Calculate the half life and instability index for each protein.
    stabilityTuples = []
    for i in results:
        accession = i[0]
        sequence = i[1]
        # -1 when the N-terminal is not an amino acid with an associated
        # half-life value (e.g. X, B, etc.) or when the sequence is empty.
        protHalfLife = halfLife.get(sequence[:1], -1)
        analysedSeq = ProteinAnalysis(sequence)
        try:
            instabilityIndex = analysedSeq.instability_index()
        except Exception:
            # Best effort: any failure of the index is recorded as -1.
            instabilityIndex = -1
            print('\tContains invalid aa code:  %s' % (accession,))
        stabilityTuples.append((accession, protHalfLife, instabilityIndex))

    cursor.execute('TRUNCATE TABLE ' + tableStability)
    if stabilityTuples:
        # One %s placeholder per tuple element.
        values = '(' + ','.join(['%s'] * len(stabilityTuples[0])) + ')'
        mysql.tableINSERT(cursor, tableStability, values, stabilityTuples)
    mysql.closeConnection(conn, cursor)

#def instability_index(prot, sequence):
#
#    # A two dimensional dictionary for calculating the instability index.
#    # Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990).
#    # It is based on dipeptide values therefore the value for the dipeptide DG is DIWV['D']['G'].
# DIWV = {'A': {'A': 1.0, 'C': 44.94, 'E': 1.0, 'D': -7.49, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': -7.49, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0, # 'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0}, # 'C': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 20.26, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 33.60, # 'K': 1.0, 'M': 33.60, 'L': 20.26, 'N': 1.0, # 'Q': -6.54, 'P': 20.26, 'S': 1.0, 'R': 1.0, # 'T': 33.60, 'W': 24.68, 'V': -6.54, 'Y': 1.0}, # 'E': {'A': 1.0, 'C': 44.94, 'E': 33.60, 'D': 20.26, # 'G': 1.0, 'F': 1.0, 'I': 20.26, 'H': -6.54, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': 1.0, # 'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0}, # 'D': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, # 'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0, # 'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 1.0, 'P': 1.0, 'S': 20.26, 'R': -6.54, # 'T': -14.03, 'W': 1.0, 'V': 1.0, 'Y': 1.0}, # 'F': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 13.34, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0, # 'K': -14.03, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0, # 'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 33.601}, # 'I': {'A': 1.0, 'C': 1.0, 'E': 44.94, 'D': 1.0, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 13.34, # 'K': -7.49, 'M': 1.0, 'L': 20.26, 'N': 1.0, # 'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0, # 'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0}, # 'G': {'A': -7.49, 'C': 1.0, 'E': -6.54, 'D': 1.0, # 'G': 13.34, 'F': 1.0, 'I': -7.49, 'H': 1.0, # 'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': -7.49, # 'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0, # 'T': -7.49, 'W': 13.34, 'V': 1.0, 'Y': -7.49}, # 'H': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, # 'G': -9.37, 'F': -9.37, 'I': 44.94, 'H': 1.0, # 'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 24.68, # 'Q': 1.0, 'P': -1.88, 'S': 1.0, 'R': 1.0, # 'T': -6.54, 'W': -1.88, 'V': 1.0, 'Y': 44.94}, # 'K': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, # 'G': -7.49, 'F': 1.0, 'I': -7.49, 'H': 1.0, # 'K': 1.0, 'M': 33.60, 'L': -7.49, 'N': 1.0, # 
'Q': 24.64, 'P': -6.54, 'S': 1.0, 'R': 33.60, # 'T': 1.0, 'W': 1.0, 'V': -7.49, 'Y': 1.0}, # 'M': {'A': 13.34, 'C': 1.0, 'E': 1.0, 'D': 1.0, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 58.28, # 'K': 1.0, 'M': -1.88, 'L': 1.0, 'N': 1.0, # 'Q': -6.54, 'P': 44.94, 'S': 44.94, 'R': -6.54, # 'T': -1.88, 'W': 1.0, 'V': 1.0, 'Y': 24.68}, # 'L': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, # 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0, # 'K': -7.49, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 33.60, 'P': 20.26, 'S': 1.0, 'R': 20.26, # 'T': 1.0, 'W': 24.68, 'V': 1.0, 'Y': 1.0}, # 'N': {'A': 1.0, 'C': -1.88, 'E': 1.0, 'D': 1.0, # 'G': -14.03, 'F': -14.03, 'I': 44.94, 'H': 1.0, # 'K': 24.68, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': -6.54, 'P': -1.88, 'S': 1.0, 'R': 1.0, # 'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 1.0}, # 'Q': {'A': 1.0, 'C': -6.54, 'E': 20.26, 'D': 20.26, # 'G': 1.0, 'F': -6.54, 'I': 1.0, 'H': 1.0, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, # 'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 1.0, # 'T': 1.0, 'W': 1.0, 'V': -6.54, 'Y': -6.54}, # 'P': {'A': 20.26, 'C': -6.54, 'E': 18.38, 'D': -6.54, # 'G': 1.0, 'F': 20.26, 'I': 1.0, 'H': 1.0, # 'K': 1.0, 'M': -6.54, 'L': 1.0, 'N': 1.0, # 'Q': 20.26, 'P': 20.26, 'S': 20.26, 'R': -6.54, # 'T': 1.0, 'W': -1.88, 'V': 20.26, 'Y': 1.0}, # 'S': {'A': 1.0, 'C': 33.60, 'E': 20.26, 'D': 1.0, 'G': 1.0, 'F': 1.0, 'I': 1.0, 'H': 1.0, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 20.26, 'P': 44.94, 'S': 20.26, 'R': 20.26, # 'T': 1.0, 'W': 1.0, 'V': 1.0, 'Y': 1.0}, # 'R': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': 1.0, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 20.26, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': 13.34, 'Q': 20.26, 'P': 20.26, 'S': 44.94, 'R': 58.28, # 'T': 1.0, 'W': 58.28, 'V': 1.0, 'Y': -6.54}, # 'T': {'A': 1.0, 'C': 1.0, 'E': 20.26, 'D': 1.0, 'G': -7.49, 'F': 13.34, 'I': 1.0, 'H': 1.0, # 'K': 1.0, 'M': 1.0, 'L': 1.0, 'N': -14.03, 'Q': -6.54, 'P': 1.0, 'S': 1.0, 'R': 1.0, # 'T': 1.0, 'W': -14.03, 'V': 1.0, 'Y': 1.0}, # 'W': {'A': -14.03, 'C': 1.0, 'E': 1.0, 'D': 1.0, 
'G': -9.37, 'F': 1.0, 'I': 1.0, 'H': 24.68, # 'K': 1.0, 'M': 24.68, 'L': 13.34, 'N': 13.34, 'Q': 1.0, 'P': 1.0, 'S': 1.0, 'R': 1.0, # 'T': -14.03, 'W': 1.0, 'V': -7.49, 'Y': 1.0}, # 'V': {'A': 1.0, 'C': 1.0, 'E': 1.0, 'D': -14.03, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 1.0, # 'K': -1.88, 'M': 1.0, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 20.26, 'S': 1.0, 'R': 1.0, # 'T': -7.49, 'W': 1.0, 'V': 1.0, 'Y': -6.54}, # 'Y': {'A': 24.68, 'C': 1.0, 'E': -6.54, 'D': 24.68, 'G': -7.49, 'F': 1.0, 'I': 1.0, 'H': 13.34, # 'K': 1.0, 'M': 44.94, 'L': 1.0, 'N': 1.0, 'Q': 1.0, 'P': 13.34, 'S': 1.0, 'R': -15.91, # 'T': -7.49, 'W': -9.37, 'V': 1.0, 'Y': 13.34}, # } # # score = 0.0 # for i in range(len(sequence) - 1): # if DIWV.has_key(sequence[i]): # if DIWV[sequence[i]].has_key(sequence[i+1]): # score += DIWV[sequence[i]][sequence[i+1]] # return (10.0 / len(sequence)) * score
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
import sys

# Print the isoelectric point of every record in the FASTA file given as the
# first argument.  The file is read completely and closed before printing.
with open(sys.argv[1], 'r') as handle:
    records = list(SeqIO.parse(handle, "fasta"))

for record in records:
    prot = ProteinAnalysis(str(record.seq))
    print(prot.isoelectric_point())
def calc_isoelectric_point(self) -> float:
    """
    Compute the isoelectric point of this object's sequence with
    ProteinAnalysis, see
    http://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam-pysrc.html
    :return: the sequence's isoelectric point
    """
    sequence = self.get_seq()
    return ProteinAnalysis(sequence).isoelectric_point()