def molecular_weight(self):
    """
    Derive the DNA weight figure from the GC-content and the length.

    Reads ``self.gc_ratio`` and ``self.len``. Single-nucleotide masses come
    from BioPython (g.mol^-1) and are divided by 1000 to get g.mmol^-1.

    NOTE(review): the returned expression evaluates as
    ``(1 / bp_mass) * self.len``, whereas the unit comment on the return line
    describes ``1 / (bp_mass * self.len)``. Behaviour is kept unchanged here;
    confirm which of the two is intended.

    :return: ``self.len`` divided by the GC-weighted mass of one base pair
    """
    gc = self.gc_ratio

    # DNA mass (BioPython has g.mol^-1, while we are in mmol):
    # g.mol^-1 -> kg.mol^-1 (SI) = g.mmol^-1
    nt_mass = {
        nt: molecular_weight(nt, seq_type='DNA') / 1000
        for nt in ('A', 'T', 'C', 'G')
    }
    at_pair = nt_mass['A'] + nt_mass['T']
    gc_pair = nt_mass['C'] + nt_mass['G']
    bp_mass = (1 - gc) * at_pair + gc * gc_pair

    # [ g.mmol(bp)^-1 * mmol(bp)/mmol(dna) ] ^ -1
    return 1 / bp_mass * self.len
def define_dna_weight_constraint(model, dna, dna_ggdw, gc_content,
                                 chromosome_len):
    """
    Register the constraint linking the DNA mass term to DNA concentration.

    The added constraint pins ``tot_dna - dna_ggdw`` to zero, where
    ``tot_dna`` is the GC-weighted base-pair mass times ``chromosome_len``
    times ``dna.concentration``.

    :param model: model that receives the constraint and serves as its hook
    :param dna: object whose ``concentration`` attribute enters the expression
    :param dna_ggdw: term subtracted from the computed DNA mass (presumably
        the DNA mass variable in g.gDW^-1 -- confirm against the caller)
    :param gc_content: GC fraction used to weight the G/C and A/T masses
    :param chromosome_len: multiplier applied to the mass of one base pair
    """
    def nt_mass(letter):
        # BioPython has g.mol^-1, while we are in mmol:
        # g.mol^-1 -> kg.mol^-1 (SI) = g.mmol^-1
        return molecular_weight(letter, seq_type='DNA') / 1000

    at_pair = nt_mass('A') + nt_mass('T')
    gc_pair = nt_mass('C') + nt_mass('G')

    # g.mmol(bp)^-1 * mmol(bp)/mmol(dna) * mmol(dna).gDW^-1
    bp_mass = (1 - gc_content) * at_pair + gc_content * gc_pair
    tot_dna = bp_mass * chromosome_len * dna.concentration

    # MW_avg * [DNA] = DNA_ggdw, i.e. the expression below must equal zero
    model.add_constraint(
        kind=InterpolationConstraint,
        hook=model,
        id_=DNA_WEIGHT_CONS_ID,
        expr=tot_dna - dna_ggdw,
        lb=0,
        ub=0,
    )
def mw_DNA(seq=None, seq_type='DNA', double_stranded=True, **kw):
    """
    Weigh a sequence, or average the weights of the four single letters.

    :param seq: sequence to weigh; when None, ``molecular_weight`` is called
        on "A", "T", "G" and "C" individually and their mean is returned
    :param seq_type: forwarded to ``molecular_weight``
    :param double_stranded: forwarded to ``molecular_weight``
    :param kw: any further keyword arguments for ``molecular_weight``
    :return: Molecular weight, in daltons
    """
    options = dict(seq_type=seq_type, double_stranded=double_stranded, **kw)

    if seq is not None:
        return molecular_weight(seq=seq, **options)

    # No sequence given: fall back to the mean over the single letters
    per_letter = [
        molecular_weight(seq=letter, **options) for letter in "ATGC"
    ]
    return np.mean(per_letter)
def singanlys(filenm, lmin, lmax, ctime, warn):
    """
    Write per-sequence statistics of a multi-FASTA file to a CSV file.

    Sequences accepted by ``lenfilter`` get their ID (commas replaced by
    spaces) and molecular weight recorded; the weight is stored as 0 when it
    cannot be calculated. One CSV row is then written per entry of the
    module-level ``contgslen`` list, combined with the module-level
    ``typesofseqs`` and ``gcs`` lists.

    :param filenm: path of the multi-FASTA input file
    :param lmin: forwarded to ``lenfilter``
    :param lmax: forwarded to ``lenfilter``
    :param ctime: text appended to the output file name
    :param warn: forwarded to ``lenfilter``
    """
    from Bio.SeqUtils import molecular_weight

    singlefilenm = "single_" + filenm.replace(".", "") + "_" + ctime + ".csv"
    idsnocom = []  # ID of each accepted sequence, without commas
    molwght = []   # molecular weight of each accepted sequence

    # Plain "r": the "U" flag is rejected by recent Python 3 releases.
    with open(filenm, "r") as multifs:
        for indsq in SeqIO.parse(multifs, "fasta"):
            evalen = lenfilter(str(indsq.seq), str(indsq.id), lmin, lmax, warn)
            if not evalen[0]:
                # Sequence did not pass the filter: ignore it.
                continue
            idsnocom.append(indsq.id.replace(",", " "))
            try:
                # evalen[1] is passed as the second positional argument of
                # molecular_weight (presumably the sequence type -- confirm).
                molwght.append(molecular_weight(str(indsq.seq), evalen[1]))
            except Exception:
                # Best effort: keep a placeholder when no weight is available.
                molwght.append(0)

    with open(singlefilenm, "w") as outpbyseqcsv:
        outpbyseqcsv.write('N,ID,Type,Length,%GC,Mol Weight\n')  # header line
        # contgslen, typesofseqs and gcs are external to this function. To
        # make it independent from the rest of the code, provide them as
        # inputs instead.
        for i, ctg_len in enumerate(contgslen):
            fields = [
                str(i + 1),
                str(idsnocom[i]),
                typesofseqs[i],
                str(ctg_len),
                "{0:.2f}".format(gcs[i]),
                "{0:.2f}".format(molwght[i]),
            ]
            outpbyseqcsv.write(','.join(fields) + '\n')

    print('A .csv file has been wrote with the single sequence stats in\nyour current working directory:')
    print('(' + singlefilenm + ')')
    print('\nThank you!')
def gfp_part_clip_mass(gfp_basicpart):
    """
    Scale the molecular weight of a part into a clip-reaction mass figure.

    The weight of ``gfp_basicpart.seq`` is computed as double-stranded,
    circular DNA and then multiplied by 2.5, divided by 1e6 and multiplied
    by the clip volume of 30 (units are not stated in the code -- confirm).

    :param gfp_basicpart: object exposing the sequence as ``.seq``
    :return: the scaled mass value
    """
    from Bio.SeqUtils import molecular_weight

    CLIP_VOLUME = 30
    part_mw = molecular_weight(
        gfp_basicpart.seq, double_stranded=True, circular=True)
    return 2.5 * part_mw / 1e6 * CLIP_VOLUME
def molecular_weight(self):
    """
    Calculate the molecular weight from sequence, material and shape.

    The sequence type is the first of 'DNA', 'RNA' or 'protein' found in
    ``self.material``; the molecule is double-stranded when the material
    contains 'ds' and circular when ``self.shape`` is 'circular'. The result
    is stored on ``self._molecular_weight``.

    :return: plain ``0`` for an empty sequence, otherwise a ``Unit`` built
        from ``'<mw> g/mol'``
    :raises ValueError: if the material names none of the known types
    """
    if len(self.sequence) == 0:
        return 0

    # get the material type (same precedence as before: DNA, RNA, protein)
    for candidate in ('DNA', 'RNA', 'protein'):
        if candidate in self.material:
            seq_type = candidate
            break
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``seq_type``.
        raise ValueError(
            "cannot determine sequence type from material {!r}".format(
                self.material))

    # find MW value
    mw = molecular_weight(self.sequence,
                          seq_type=seq_type,
                          double_stranded='ds' in self.material,
                          circular=(self.shape == 'circular'))

    # create unit object and return the value with unit
    self._molecular_weight = Unit('{} g/mol'.format(mw))
    return self._molecular_weight
def sequence_weight(sequence):
    """Return weight in Daltons.

    Every 'X' in ``sequence`` is removed before the protein weight is
    computed and is then accounted for with an estimate of 110 per 'X'.
    """
    unknown_residues = sequence.count('X')
    known_part = sequence.replace('X', '')
    # 110 per ambiguous residue is an estimate
    return (molecular_weight(known_part, seq_type='protein')
            + 110 * unknown_residues)
def DetectorRangeCoverage(arr, lower, upper):
    """
    Return the percentage of peptides whose weight lies in [lower, upper].

    :param arr: sized collection of peptide objects; each must provide
        ``toString()`` returning UTF-8 encoded bytes
    :param lower: inclusive lower weight bound
    :param upper: inclusive upper weight bound
    :return: ``100 * matches / len(arr)``, or 0 when ``arr`` is empty
        (previously an empty input raised ZeroDivisionError)
    """
    total = len(arr)
    if total == 0:
        return 0

    in_range = sum(
        1 for pep in arr
        if lower <= molecular_weight(pep.toString().decode("utf-8"),
                                     'protein') <= upper
    )
    return 100 * in_range / total
def _calc_mw(self):
    """
    Work out the molecular weight from the sequence, else from the length.

    The sequence-based weight is tried first; if that raises ``QueryError``
    the weight is estimated with ``mw_from_length``.

    :return: the molecular weight
    :raises QueryError: if neither approach succeeds
    """
    from Bio.SeqUtils import molecular_weight

    # First choice: exact weight from the sequence.
    try:
        weight = molecular_weight(
            seq=self.seq,
            seq_type=self.molecule,
            double_stranded=self.is_double_stranded,
            circular=self.is_circular,
        )

        # For some reason Biopython just assumes 5' phosphorylation, so we
        # need to correct for that here.
        if not self.is_phosphorylated_5:
            strand_count = 1
            if self.is_double_stranded:
                strand_count = 2
            # A circular molecule has no ends to correct.
            end_count = strand_count
            if self.is_circular:
                end_count = 0
            hpo3_mass = 1.008 + 30.974 + 3 * 15.999
            weight -= hpo3_mass * end_count

        return weight
    except QueryError:
        pass

    # Second choice: estimate from the length and the kind of molecule.
    try:
        self._cache_stranded_molecule()
        kind = (self._molecule, self._strandedness)
        return mw_from_length(self.length, kind)
    except QueryError:
        pass

    raise QueryError(
        "need sequence or length to calculate molecular weight")
def test_get_molecular_weight_identical(self):
    """Confirm protein molecular weight agrees with calculation from Bio.SeqUtils."""
    # Of limited value: ProteinAnalysis.molecular_weight internally calls
    # SeqUtils.molecular_weight, so both sides share one implementation.
    from_analysis = self.analysis.molecular_weight()
    from_sequtils = molecular_weight(Seq(self.seq_text), seq_type="protein")
    self.assertAlmostEqual(from_analysis, from_sequtils)
def molecular_weight(self):
    """
    Return the peptide's molecular weight.

    A truthy ``self._molecular_weight_override`` takes precedence; otherwise
    the weight is computed from ``self.peptide`` as a protein sequence.
    """
    if self._molecular_weight_override:
        return self._molecular_weight_override

    # g.mol^-1 -> kg.mol^-1 (SI) = g.mmol^-1
    return molecular_weight(self.peptide, seq_type='protein') / 1000
def test_get_monoisotopic_molecular_weight_identical(self):
    """Confirm monoisotopic protein weight agrees with calculation from Bio.SeqUtils."""
    self.analysis = ProtParam.ProteinAnalysis(self.seq_text, monoisotopic=True)
    mw_1 = self.analysis.molecular_weight()
    # State the sequence type explicitly rather than through a Bio.Alphabet
    # alphabet, so the check does not depend on the alphabet machinery.
    mw_2 = molecular_weight(Seq(self.seq_text),
                            seq_type="protein",
                            monoisotopic=True)
    self.assertAlmostEqual(mw_1, mw_2)
def molecular_weight(self):
    """
    Return the RNA's molecular weight.

    A truthy ``self._molecular_weight_override`` takes precedence; otherwise
    the weight is computed from ``self.rna`` as an RNA sequence.
    """
    if self._molecular_weight_override:
        return self._molecular_weight_override

    # g.mol^-1 -> kg.mol^-1 (SI) = g.mmol^-1
    return molecular_weight(self.rna, seq_type='RNA') / 1000
def calc_mass(protein_string):
    """Returns total weight of given protein string using monoisotopic mass table"""
    # The mass of a single water molecule is removed because the peptides
    # are considered as excised from the middle of a protein.
    water_mass = 18.01056
    full_mass = molecular_weight(protein_string, "protein", monoisotopic=True)
    return full_mass - water_mass
def calculate_mw(fname):
    """
    Sum the protein weights of the distinct sequences in a FASTA file.

    Duplicate sequences are counted once. Every 'X' is replaced by 'A'
    before weighing, and each weight is rounded to two decimals before it is
    added to the total.

    :param fname: FASTA file handed to ``SeqIO.parse``
    :return: the summed weight as a float (0.0 when there are no sequences)
    """
    unique_chains = {str(chain.seq) for chain in SeqIO.parse(fname, "fasta")}

    total = 0.0
    for chain in unique_chains:
        substituted = chain.replace("X", "A")
        total += round(molecular_weight(substituted, "protein"), 2)
    return total
def molecular_weight(self):
    """
    Derive the DNA weight figure from ``self.gc_ratio`` and ``self.len``.

    NOTE(review): the returned expression evaluates as
    ``(1 / bp_mass) * self.len``, whereas the unit comment on the return line
    describes ``1 / (bp_mass * self.len)``. Behaviour is kept unchanged here;
    confirm which of the two is intended.

    :return: ``self.len`` divided by the GC-weighted mass of one base pair
    """
    def nt_mass(letter):
        # DNA mass (BioPython has g.mol^-1, while we are in mmol):
        # g.mol^-1 -> kg.mol^-1 (SI) = g.mmol^-1
        return molecular_weight(letter, seq_type='DNA') / 1000

    gc_fraction = self.gc_ratio
    at_pair = nt_mass('A') + nt_mass('T')
    gc_pair = nt_mass('C') + nt_mass('G')
    bp_mass = (1 - gc_fraction) * at_pair + gc_fraction * gc_pair

    # [ g.mmol(bp)^-1 * mmol(bp)/mmol(dna) ] ^ -1
    return 1 / bp_mass * self.len
def add_dummy_peptide(model, aa_ratios, dummy_gene, peptide_length):
    """
    Create a dummy peptide and append it to ``model.peptides``.

    Its molecular weight is ``peptide_length`` times the ratio-weighted sum
    of the single amino-acid weights, converted from g.mol^-1 to g.mmol^-1.

    :param model: model the peptide is attached to
    :param aa_ratios: mapping of amino-acid letter to its ratio
    :param dummy_gene: gene whose ``id`` is given to the peptide
    :param peptide_length: factor applied to the weighted amino-acid weight
    :return: the created peptide
    """
    # Create a dummy peptide
    dummy_peptide = Peptide(id='dummy_peptide',
                            name='Dummy peptide',
                            gene_id=dummy_gene.id)

    weighted_aa_sum = sum([
        ratio * molecular_weight(aa, 'protein')
        for aa, ratio in aa_ratios.items()
    ])
    # g.mol^-1 -> kg.mol^-1 (SI) = g.mmol^-1
    dummy_peptide.molecular_weight = peptide_length * weighted_aa_sum / 1000

    dummy_peptide._model = model
    model.peptides += [dummy_peptide]
    return dummy_peptide
def _extract_protein_data(object): try: from Bio.SeqUtils import molecular_weight from Bio.Seq import Seq protein_data = { 'frame1': str(object.protein).upper(), 'aa_count': {}, 'molecular_weight_f1': molecular_weight(object.protein.upper().replace("*", ""), seq_type="protein"), } for aa in "FLSYCWPHQRIMTNKVADEG*": protein_data['aa_count'][f"{aa}"] = object.protein.count(aa) try: new_sequence = Seq(object.coding_dna) protein_data['frame2'] = str(new_sequence[1:].translate( object.translation_table)) protein_data['molecular_weight_f2'] = molecular_weight( protein_data["frame2"].replace("*", ""), seq_type="protein") protein_data['frame3'] = str(new_sequence[2:].translate( object.translation_table)) protein_data['molecular_weight_f3'] = molecular_weight( protein_data["frame3"].replace("*", ""), seq_type="protein") except: pass except Exception as e: print(e) protein_data = None return protein_data
def test_new_part_resuspension(gfp_orf_basicpart):
    """
    Check the concentration obtained after resuspending a new part.

    Resuspending a mass of 750 in the volume returned by
    ``bsb.new_part_resuspension`` must give a concentration that rounds to
    75 (mass in ng, volume in uL and result in nM judging by the conversion
    factors -- confirm).
    """
    from Bio.SeqUtils import molecular_weight

    seq = gfp_orf_basicpart.seq
    # Computed once and reused for the printout and the assertion.
    mw = molecular_weight(seq, double_stranded=True)

    print(f"length of basicpart: {len(seq)}")
    # len(seq) * 660 gives the same number as len(seq * 660) without
    # building a 660-fold copy of the sequence first.
    print(f"estimated MW: {len(seq) * 660}")
    print(f"biopython MW: {mw}")

    mass = 750
    vol = bsb.new_part_resuspension(part=gfp_orf_basicpart, mass=mass)
    print(f"Calculated volume of resuspension buffer: {vol}")

    concentration = mass * 1e-9 / (vol * 1e-6 * mw) * 1e9
    print(f"estimated concentration: {concentration}")
    assert 75 == round(concentration)
def add_dummy_mrna(model, dummy_gene, mrna_kdeg, mrna_length, nt_ratios):
    """
    Create a dummy mRNA together with its transcription and degradation.

    The mRNA weight is ``mrna_length`` times the ratio-weighted sum of the
    single nucleotide weights, converted from g.mol^-1 to g.mmol^-1. The
    nucleotide ratios also define the stoichiometry of both reactions.

    :param model: model that receives the mRNA, reactions and constraint
    :param dummy_gene: gene whose ``id`` is given to the mRNA and reaction
    :param mrna_kdeg: passed to the mRNA as ``kdeg``
    :param mrna_length: factor applied to weights and stoichiometries
    :param nt_ratios: mapping of nucleotide letter to its ratio
    :return: the created dummy mRNA
    """
    essentials = model.essentials
    h2o = essentials['h2o']
    h = essentials['h']
    ppi = essentials['ppi']

    # Create a dummy mRNA
    dummy_mrna = mRNA(id='dummy_gene',
                      name='dummy mRNA',
                      kdeg=mrna_kdeg,
                      gene_id=dummy_gene.id)

    weighted_nt_sum = sum([
        ratio * molecular_weight(nt, 'RNA')
        for nt, ratio in nt_ratios.items()
    ])
    # g.mol^-1 -> kg.mol^-1 (SI) = g.mmol^-1
    dummy_mrna.molecular_weight = mrna_length * weighted_nt_sum / 1000

    model.add_mrnas([dummy_mrna], add_degradation=False)

    dummy_transcription = TranscriptionReaction(
        id=model._get_transcription_name(dummy_mrna.id),
        name='Dummy Transcription',
        gene_id=dummy_gene.id,
        enzymes=model.rnap.values(),
        scaled=True)
    model.add_reactions([dummy_transcription])
    model.transcription_reactions += [dummy_transcription]

    # Use the input ratios to make the stoichiometry
    transcription_mets = {}
    for nt, ratio in nt_ratios.items():
        nucleotide = model.metabolites.get_by_id(model.rna_nucleotides[nt])
        transcription_mets[nucleotide] = -1 * ratio * mrna_length
    transcription_mets[ppi] = mrna_length
    dummy_transcription.add_metabolites(transcription_mets, rescale=True)

    # Add the degradation
    mrna_deg_stoich = {}
    for nt, ratio in nt_ratios.items():
        nucleotide_mp = model.metabolites.get_by_id(
            model.rna_nucleotides_mp[nt])
        mrna_deg_stoich[nucleotide_mp] = -1 * ratio * mrna_length
    mrna_deg_stoich[h2o] = -1 * mrna_length
    mrna_deg_stoich[h] = 1 * mrna_length

    model._make_degradation_reaction(deg_stoich=mrna_deg_stoich,
                                     macromolecule=dummy_mrna,
                                     kind=mRNADegradation,
                                     scaled=True)
    model.add_mass_balance_constraint(dummy_transcription, dummy_mrna)
    return dummy_mrna
def molecular_weight(self, record):
    '''
    Estimate the molecular weight of a protein record.

    Characters not found in ``self.amino_acids`` are dropped before the
    weight is computed; each dropped character is then charged with the
    average per-residue weight of the remaining sequence.

    Input:
        - record: a SeqRecord
    Output:
        - float: representing the molecular weight of the protein
    Raises:
        - ZeroDivisionError: when no character of the record is in
          ``self.amino_acids`` (unless the weight calculation itself
          rejects the empty sequence first)
    '''
    raw_seq = str(record.seq)
    PA = ProteinAnalysis(raw_seq)

    cleaned_seq = ''.join(c for c in raw_seq if c in self.amino_acids)
    non_prot_count = len(raw_seq) - len(cleaned_seq)

    # The sequence type is stated explicitly instead of being carried by an
    # IUPAC.protein alphabet, so the protein weight table is selected without
    # relying on the Bio.Alphabet machinery.
    mol_weight = molecular_weight(seq=cleaned_seq,
                                  seq_type='protein',
                                  monoisotopic=PA.monoisotopic)
    avg_mol_weight = mol_weight / float(len(cleaned_seq))
    return mol_weight + non_prot_count * avg_mol_weight
def test_get_molecular_weight_identical(self):
    """Confirm protein molecular weight agrees with calculation from Bio.SeqUtils."""
    mw_1 = self.analysis.molecular_weight()
    # State the sequence type explicitly rather than through a Bio.Alphabet
    # alphabet, so the check does not depend on the alphabet machinery.
    mw_2 = molecular_weight(Seq(self.seq_text), seq_type="protein")
    self.assertAlmostEqual(mw_1, mw_2)
seq = "" for exon in allExons: useq = exon.sequence(myFasta, use_strand=False) seq += useq seq = seq.upper() if t.strand == "-": seq = Seq(seq).reverse_complement() else: seq = Seq(seq) try: aaSeq = seq.translate(stop_symbol="") aaWeight = molecular_weight(aaSeq, seq_type="protein") aaWeightMono = molecular_weight(aaSeq, seq_type="protein", monoisotopic=True) # protein_id gene_symbol mol_weight_kd mol_weight print("{}\t{}\t{}\t{}\tnormal".format(geneName.upper(), geneName, aaWeight / 1000, aaWeight), file=sys.stdout) print("{}\t{}\t{}\t{}\tmonoisotopic".format( geneName.upper(), geneName, aaWeightMono / 1000, aaWeightMono), file=sys.stdout) except: print("error", file=sys.stderr) print(transcriptID, geneName, file=sys.stderr) print(seq, file=sys.stderr)
def molecular_weight(self):
    """Calculate MW from Protein sequence.

    ``self.monoisotopic`` selects monoisotopic instead of average masses.
    The sequence type is passed explicitly so that a plain string or an
    alphabet-less sequence is weighed as protein rather than with the
    default sequence type of ``molecular_weight``.
    """
    return molecular_weight(self.sequence,
                            seq_type="protein",
                            monoisotopic=self.monoisotopic)
def calc_molecular_weight(self) -> float:
    """
    Weigh the sequence returned by ``self.get_seq()`` as a protein.

    :return: protein seq molecular weight, float
    """
    protein_seq = self.get_seq()
    return molecular_weight(protein_seq, seq_type='protein')
locusTag = str(feature.qualifiers["locus_tag"]).strip("[']") # count number of CXXCH motifs found in each protein coding sequence motifCount = len(hemeBindingMotifs) # cast translated amino acid sequence as string with [' AASEQ '] characters removed AAseq = str(feature.qualifiers["translation"]).strip("[']") # determine length of amino acid sequence **[' AASEQ '] characters must be removed for an accurate count! AAlength = len(str(feature.qualifiers["translation"]).strip("[']")) # if no ambiguous AAs present, calculate each cytochrome's molecular weight AmbiguousAA = re.findall('[BXZJUO]', str(feature.qualifiers["translation"])) if not AmbiguousAA: MolecularWeight = molecular_weight(AAseq, "protein") # calculate heme density as number of hemes per kDa HemeDensity = (float(motifCount) / MolecularWeight) * 1000 if multihemeCytochromes: # add gene name to FASTA definition line if present in CDS feature qualifiers if 'gene' in feature.qualifiers: # define GeneName variable GeneName = str(feature.qualifiers["gene"]).strip("[']") # define output string in FASTA format if cytochromes were predicted OutputString = "%s\t%s\t%s\t%i\t%i\t%i\t%i\t%1.2f\t%1.3f\t%s\t%s\t%s" % (locusTag, str(feature.qualifiers["product"]).strip("[']"), GeneName, motifCount, len(CXXCHmotifs), len(CXXXCHmotifs), AAlength, float(MolecularWeight / 1000), HemeDensity, OrganismName.replace("_", " ").replace("sp ", "sp. "), RecordName, AAseq)
from Bio.Seq import Seq
from Bio.SeqUtils import molecular_weight

# Subtracted from every computed mass (the monoisotopic mass of one water
# molecule, presumably -- confirm).
WATER_MASS = 18.01056

# Print the monoisotopic protein mass, minus WATER_MASS, for every line of
# input.txt. The sequence type is passed explicitly instead of through a
# Bio.Alphabet alphabet, so the script no longer imports Bio.Alphabet.
with open('input.txt', 'r') as file:
    for line in file:
        protein_seq = line.strip('\n')
        mass = molecular_weight(Seq(protein_seq),
                                seq_type='protein',
                                monoisotopic=True)
        print('%0.3f' % (mass - WATER_MASS))
def test_get_monoisotopic_molecular_weight_identical(self):
    """Test calculating the monoisotopic protein weight agrees with Bio.SeqUtils."""
    self.analysis = ProtParam.ProteinAnalysis(self.seq_text, monoisotopic=True)
    mw_1 = self.analysis.molecular_weight()
    # State the sequence type explicitly rather than through a Bio.Alphabet
    # alphabet, so the check does not depend on the alphabet machinery.
    mw_2 = molecular_weight(Seq(self.seq_text),
                            seq_type="protein",
                            monoisotopic=True)
    self.assertAlmostEqual(mw_1, mw_2)
def test_get_molecular_weight_identical(self):
    """Test calculating the protein molecular weight agrees with calculation from Bio.SeqUtils."""
    mw_1 = self.analysis.molecular_weight()
    # State the sequence type explicitly rather than through a Bio.Alphabet
    # alphabet, so the check does not depend on the alphabet machinery.
    mw_2 = molecular_weight(Seq(self.seq_text), seq_type="protein")
    self.assertAlmostEqual(mw_1, mw_2)
from Bio.SeqUtils import molecular_weight
import pyperclip

# Take a protein sequence from the clipboard, compute its molecular weight,
# put the formatted value (three decimals) back on the clipboard and echo it.
clipboard_sequence = pyperclip.paste()
weight = molecular_weight(clipboard_sequence, "protein")
result = "%0.3f" % weight
pyperclip.copy(result)
print(result)
mono_index = i + steps[0] mono_M = float(file[mono_index].split(">")[1].split("<")[0]) mod = file[i + 1:PTM_end] sorted_seq = ''.join(sorted(seq)) t = cono(seq, mono_M, mod) #print("Seq: {0}, mono_mass: {1}, PTM: {2}".format(seq, mono_M, mod)) cyc_num = seq.count("C") calculated_mass = molecular_weight( seq, "protein", monoisotopic=True) - 2 * int(cyc_num / 2) out = "Seq: {0}, mono_mass: {1}, calculated_mass: {2}, diff: {3}, PTM: {4}".format( seq, mono_M, calculated_mass, calculated_mass - mono_M, mod) #only pull out th Carboxylic E, Hydro-proline, Bromide-W modification screen = [(("Gla" in i) or ("O" in i) or ("BTr" in i)) for i in mod] if np.all(np.array(screen) == 1) and sorted_seq not in seq_lib: print(out + "\n") output.write("####seq{0}####\n".format(s)) output.write(out + "\n") cono_server.append(t)
def molecular_weight(self):
    """
    Return the RNA's molecular weight.

    A truthy ``self._molecular_weight_override`` takes precedence; otherwise
    the weight is computed from ``self.rna``. The sequence type is passed
    explicitly so that a plain string or alphabet-less sequence is weighed
    as RNA instead of with the default sequence type of ``molecular_weight``.
    """
    if self._molecular_weight_override:
        return self._molecular_weight_override

    # g.mol^-1 -> kg.mol^-1 (SI) = g.mmol^-1
    return molecular_weight(self.rna, seq_type='RNA') / 1000
def mw(self):
    """Return the molecular weight of ``self.seq`` weighed as a protein."""
    return molecular_weight(self.seq, seq_type='protein')
from Bio.Seq import Seq
from Bio.SeqUtils import GC, molecular_weight
from Bio.Alphabet import IUPAC

# Small tour of the Seq object: printing, indexing, slicing, length and
# counting, followed by GC content and molecular weight.
# The print statements were converted to print() calls so the script also
# parses under Python 3; the printed text is unchanged.
# NOTE(review): GC and Bio.Alphabet are only present in older Biopython
# releases -- confirm the targeted version.
my_seq = Seq("AGTACACTGGT")
print(my_seq)
print(my_seq[10])
print(my_seq[1:5])
print(len(my_seq))
print(my_seq.count("A"))

print("GC: ", GC(my_seq))
print(molecular_weight(my_seq))

# Same tour with an explicit DNA alphabet: alphabet, reverse complement and
# translation.
my_dna = Seq("AGTACATGACTGGTTTAG", IUPAC.unambiguous_dna)
print(my_dna)
print()
print(my_dna.alphabet)
print(my_dna.reverse_complement())
print(my_dna.translate())
def main(*args, **kwargs):
    """
    Print the monoisotopic protein mass of the sequence stored in a file.

    The last positional argument is a file name relative to the current
    working directory; it is read with ``StrongHold.parserDNAFile`` and the
    mass is printed with three decimals.

    ``circular=True`` is passed on to ``molecular_weight`` (presumably to
    leave out the terminal water so the plain sum of residue masses is
    returned -- confirm against the Bio.SeqUtils documentation).
    """
    fpath = os.path.join(os.getcwd(), args[-1])
    s = str(StrongHold.parserDNAFile(fpath))
    mass = molecular_weight(s,
                            seq_type='protein',
                            circular=True,
                            monoisotopic=True)
    # print() call instead of the Python 2 print statement; same output.
    print('%.3f' % mass)
def molecular_weight(self):
    """Calculate MW from Protein sequence.

    ``self.monoisotopic`` selects monoisotopic instead of average masses.
    The sequence type is passed explicitly so that a plain string or an
    alphabet-less sequence is weighed as protein rather than with the
    default sequence type of ``molecular_weight``.
    """
    return molecular_weight(self.sequence,
                            seq_type="protein",
                            monoisotopic=self.monoisotopic)
import pandas as pd
import numpy as np
from Bio.SeqUtils import molecular_weight
from Bio import SeqIO
from collections import Counter

AA_LETTERS = sorted("ACEDGFIHKMLNQPSRTWVY")
out = pd.DataFrame(columns=list(AA_LETTERS))

# Load the annotated genome record
record = SeqIO.read('../supporting_data/U00096.gb', "gb")

# Attach a molecular weight to the qualifiers of every CDS that carries a
# translation, then write all collected qualifiers to a tab-separated file.
# CDS features without a translation are skipped but still advance the
# running index.
i = 0
out = {}
for feature in record.features:
    if feature.type != 'CDS':
        continue
    i += 1
    qualifiers = feature.qualifiers
    try:
        qualifiers['molecular_weight[Da]'] = molecular_weight(
            qualifiers['translation'][0], seq_type='protein')
    except KeyError:
        continue
    out[i] = qualifiers

out = pd.DataFrame.from_dict(out).T
out.to_csv('ecoli_genome_info.tsv', sep='\t')

#schmidtMW = schmidt['MW [kDa]']
#peeboMW = peebo['MW [kDa]']