def liftover_mods(ref_seq, ref_nc_seq, seq, can_monomers=can_monomers):
    """ Build a BpForms RNA form for an aligned sequence by carrying over the
    non-canonical monomers of an aligned reference.

    Args:
        ref_seq (:obj:`str`): aligned reference sequence; ``-`` marks a gap
        ref_nc_seq (:obj:`str`): BpForms string of the reference, parsed with
            ``bpforms.RnaForm().from_str``
        seq (:obj:`str`): aligned sequence, indexed position-by-position
            against ``ref_seq``; ``-`` marks a gap
        can_monomers: collection of monomers regarded as canonical (only
            membership is tested)

    Returns:
        :obj:`bpforms.RnaForm`: form assembled for ``seq``

    Raises:
        :obj:`AssertionError`: if the canonical sequence of the assembled form
            differs from ``seq`` with its gaps removed
        :obj:`IndexError`: if ``seq`` is shorter than ``ref_seq``
    """
    ref_form = bpforms.RnaForm().from_str(ref_nc_seq)
    form = bpforms.RnaForm()

    # index of the next unconsumed monomer of the reference form
    i_nc_nt = 0
    for i_nt, ref_monomer in enumerate(ref_seq):
        # reference monomer at the current cursor (None once the form is exhausted)
        ref_nc_monomer = ref_form.seq[i_nc_nt] if i_nc_nt < len(ref_form.seq) else None

        # only non-gap reference positions consume a monomer of the reference form
        if ref_monomer != '-':
            i_nc_nt += 1

        monomer = seq[i_nt]
        if monomer == '-':
            continue

        keeps_ref_monomer = ref_nc_monomer not in can_monomers and monomer == ref_monomer
        if keeps_ref_monomer:
            # same base as the reference at a non-canonical position: reuse the reference monomer
            lifted = ref_nc_monomer
        elif monomer == 'N':
            lifted = bpforms.Monomer(id='N')
        else:
            lifted = bpforms.rna_alphabet.monomers.get(monomer)
        form.seq.append(lifted)

    # verify non-canonical sequences are consistent with the canonical sequences
    assert seq.replace('-', '') == form.get_canonical_seq()

    return form
def analyze_form(rna_form, unsupported_codes, results_dict):
    """ Summarize an RNA form and store the summary in ``results_dict`` (modified in place).

    Always recorded: the BpForms and IUPAC sequences, the length, the total
    number of modified residues and the number of modified A/C/G/U residues.

    If ``unsupported_codes`` is non-empty, only an error message listing the
    codes is added. Otherwise the formula, molecular weight and charge of the
    form, of its canonical counterpart, and their differences are recorded,
    followed by the form's validation errors.

    Args:
        rna_form (:obj:`bpforms.RnaForm`): form to analyze
        unsupported_codes (:obj:`set` of :obj:`str`): codes in the source
            sequence that could not be mapped to a monomer
        results_dict (:obj:`dict`): dictionary that receives the results
    """
    results_dict['Sequence (BpForms)'] = str(rna_form)
    canonical_seq = rna_form.get_canonical_seq()
    results_dict['Sequence (IUPAC)'] = canonical_seq

    length = len(rna_form.seq)
    results_dict['Length'] = length

    # count each unmodified canonical monomer once and reuse the counts below
    alphabet = bpforms.rna_alphabet.monomers
    n_unmodified = {
        base: rna_form.seq.count(getattr(alphabet, base))
        for base in 'ACGU'
    }

    results_dict['Number of modifications'] = length - sum(n_unmodified.values())
    for base in 'ACGU':
        # residues whose canonical code is `base` minus those that are the plain monomer
        results_dict['Number of modified {}'.format(base)] = \
            canonical_seq.count(base) - n_unmodified[base]

    if unsupported_codes:
        # sorted so that the message is reproducible (set order is arbitrary)
        results_dict['BpForms errors'] = 'MODOMICS sequence uses monomeric forms {}'.format(
            ', '.join(sorted(unsupported_codes)))
        return

    formula = rna_form.get_formula()
    results_dict['Formula'] = str(formula)
    mol_wt = rna_form.get_mol_wt()
    results_dict['Molecular weight'] = mol_wt
    charge = rna_form.get_charge()
    results_dict['Charge'] = charge

    canonical_form = bpforms.RnaForm().from_str(canonical_seq)
    canonical_formula = canonical_form.get_formula()
    results_dict['Canonical formula'] = str(canonical_formula)
    canonical_mol_wt = canonical_form.get_mol_wt()
    results_dict['Canonical molecular weight'] = canonical_mol_wt
    canonical_charge = canonical_form.get_charge()
    results_dict['Canonical charge'] = canonical_charge

    # contribution of the modifications relative to the canonical form
    results_dict['Extra formula'] = str(formula - canonical_formula)
    results_dict['Extra molecular weight'] = mol_wt - canonical_mol_wt
    results_dict['Extra charge'] = charge - canonical_charge

    results_dict['BpForms errors'] = ' '.join(rna_form.validate())
def test_serialize_bpforms(self):
    """ A BpForms RNA form wrapped in a ChemicalStructure should serialize
    as ``bpforms/rna: <sequence>``.
    """
    rna_seq = 'ACGU'
    structure = obj_tables.chem.ChemicalStructure(
        bpforms.RnaForm().from_str(rna_seq))
    expected = '{}/{}: {}'.format('bpforms', 'rna', rna_seq)
    self.assertEqual(structure.serialize(), expected)
tot_rna = 375000
doubling_time = 45.  # min
half_life = 45.  # min

# Copy-number-weighted tallies: one keyed by non-canonical monomer, one keyed
# by the canonical code reported for that monomer.
monomer_freq = {}
can_monomer_freq = {}

# reverse lookup of the RNA alphabet: monomer -> code
monomer_codes = {
    alph_monomer: alph_code
    for alph_code, alph_monomer in bpforms.rna_alphabet.monomers.items()
}

tot_copies = 0
with open('examples/modomics_trna_copy_numbers.csv', 'r') as file:
    for rna in csv.DictReader(file, dialect='excel'):
        form = bpforms.RnaForm().from_str(rna['Sequence (BpForms)'])
        copies = float(rna['Copies per cell'])
        tot_copies += copies

        for monomer in form.seq:
            # canonical monomers contribute to neither tally
            # NOTE(review): assumes the canonical-code tally is restricted to
            # non-canonical monomers as well -- confirm
            if monomer in canonical_monomers:
                continue

            monomer_freq[monomer] = monomer_freq.get(monomer, 0) + copies

            can_code = monomer.get_canonical_code(monomer_codes)
            can_monomer_freq[can_code] = can_monomer_freq.get(can_code, 0) + copies
# verify non-canonical sequences are consistent with the canonical sequences assert seq.replace('-', '') == form.get_canonical_seq() return form ''' rRNA ''' rrna_types = ['5.8S', '18S', '28S'] for rrna_type in rrna_types: filename = 'examples/homo_sapiens_rna/{} all seqs.fasta'.format(rrna_type) seqs = [str(record.seq) for record in SeqIO.parse(filename, "fasta")] ref_nc_seq = seqs[0] ref_form = bpforms.RnaForm().from_str(ref_nc_seq) ref_seq = seqs[1] seqs = seqs[1:] # map curated modifications onto sequences forms = [] for seq in seqs: forms.append(liftover_mods(ref_seq, ref_nc_seq, seq)) # save non-canonical sequences with open( 'examples/homo_sapiens_rna/{} nc alignment.txt'.format(rrna_type), 'w') as file: for form in forms: file.write(str(form) + '\n') '''
def run_rrna(session, modomics_short_code_to_monomer, monomer_codes, out_filename):
    """ Fetch the MODOMICS rRNA sequence table, convert every row into a
    BpForms RNA form, analyze it, and save the results.

    Args:
        session: object whose ``get(url, params=...)`` returns a response with
            ``raise_for_status()`` and ``text``
        modomics_short_code_to_monomer (:obj:`dict`): maps MODOMICS codes to
            BpForms monomers
        monomer_codes (:obj:`dict`): updated in place with every code that was
            found in ``modomics_short_code_to_monomer``
        out_filename (:obj:`str`): passed to ``save_results``

    Returns:
        :obj:`list` of :obj:`dict`: one record per table row

    Raises:
        :obj:`Exception`: if the sequence cell holds a child other than text,
            ``span`` or ``a``
    """
    query = {
        'RNA_type': 'rRNA',
        'RNA_subtype': 'all',
        'organism': 'all species',
        'vis_type': 'Modomics symbols',
    }
    response = session.get(URL, params=query)
    response.raise_for_status()

    soup = bs4.BeautifulSoup(response.text, 'lxml')
    table_rows = soup.find('table', {'id': 'tseq'}).find('tbody').find_all('tr')

    rna_forms = []
    for row in table_rows:
        if not isinstance(row, bs4.element.Tag):
            continue

        cells = row.find_all('td')
        rna_form = bpforms.RnaForm()
        unsupported_codes = set()

        for node in cells[5].children:
            node_name = node.name
            if node_name == 'a':
                # a link carries a single code, taken from the end of its href
                codes = [node.get('href').replace('/modomics/modifications/', '')]
            elif node_name is None or node_name == 'span':
                # bare text / span: each remaining character is one code
                raw_text = str(node) if node_name is None else node.text
                codes = raw_text.strip().replace('-', '').replace('_', '')
            else:
                raise Exception('Unsupported child {}'.format(node_name))

            for code in codes:
                monomer = modomics_short_code_to_monomer.get(code, None)
                if monomer is not None:
                    monomer_codes[code] = monomer
                else:
                    # keep a placeholder so the sequence length stays correct
                    unsupported_codes.add(code)
                    monomer = bpforms.Monomer(id=code)
                rna_form.seq.append(monomer)

        record = {
            'GenBank': cells[0].find('a').text.strip(),
            'Organism': cells[3].text.strip(),
            'Organellum': cells[4].text.strip(),
            'Type': cells[2].text.strip(),
            'Sequence (MODOMICS)': cells[5].text.strip().replace('-', '').replace('_', ''),
        }
        rna_forms.append(record)
        analyze_form(rna_form, unsupported_codes, record)

    # save results to tab-separated file
    save_results(rna_forms, ['GenBank', 'Type'], out_filename)

    return rna_forms
def run_trna(session, modomics_short_code_to_monomer, monomer_codes, out_filename):
    """ Fetch the MODOMICS tRNA sequence table, convert every row into a
    BpForms RNA form, analyze it, and save the results and code frequencies.

    Besides the file written by ``save_results``, two tab-separated files are
    written under ``examples``: ``modomics.trna.canonical-code-freq.tsv`` and
    ``modomics.trna.code-freq.tsv``.

    Args:
        session: object whose ``get(url, params=...)`` returns a response with
            ``raise_for_status()`` and ``text``
        modomics_short_code_to_monomer (:obj:`dict`): maps MODOMICS codes to
            BpForms monomers
        monomer_codes (:obj:`dict`): updated in place with every code that was
            found in ``modomics_short_code_to_monomer``
        out_filename (:obj:`str`): passed to ``save_results``

    Returns:
        :obj:`tuple`:

            * :obj:`list` of :obj:`dict`: one record per table row
            * :obj:`dict`: for each of A, C, G and U, the number of residues
              with that canonical code that are not the plain monomer, summed
              over all rows
            * :obj:`dict`: number of occurrences of each MODOMICS code

    Raises:
        :obj:`Exception`: if the sequence cell holds a child other than text,
            ``span`` or ``a``
    """
    response = session.get(URL, params={
        'RNA_type': 'tRNA',
        'RNA_subtype': 'all',
        'organism': 'all species',
        'vis_type': 'Modomics symbols',
    })
    response.raise_for_status()

    doc = bs4.BeautifulSoup(response.text, 'lxml')
    table = doc.find('table', {'id': 'tseq'})
    tbody = table.find('tbody')
    rows = tbody.find_all('tr')

    rna_forms = []
    code_freq = {}
    canonical_code_freq = {'A': 0, 'C': 0, 'G': 0, 'U': 0}
    for row in rows:
        cells = row.find_all('td')
        rna_form = bpforms.RnaForm()
        unsupported_codes = set()

        for child in cells[5].children:
            if child.name is None or child.name == 'span':
                # bare text / span: each remaining character is one code
                text = str(child) if child.name is None else child.text
                codes = text.strip().replace('-', '').replace('_', '')
            elif child.name == 'a':
                # a link carries a single code, taken from the end of its href
                codes = [child.get('href').replace('/modomics/modifications/', '')]
            else:
                raise Exception('Unsupported child {}'.format(child.name))

            for code in codes:
                monomer = modomics_short_code_to_monomer.get(code, None)
                if monomer is None:
                    # keep a placeholder so the sequence length stays correct
                    unsupported_codes.add(code)
                    monomer = bpforms.Monomer(id=code)
                else:
                    monomer_codes[code] = monomer

                # NOTE(review): tallies supported and unsupported codes alike -- confirm
                code_freq[code] = code_freq.get(code, 0) + 1
                rna_form.seq.append(monomer)

        record = {
            'Amino acid type': cells[1].text.strip(),
            'Anticodon': cells[2].text.strip(),
            'Organism': cells[3].text.strip(),
            'Organellum': cells[4].text.strip(),
            'Sequence (MODOMICS)': cells[5].text.strip().replace('-', '').replace('_', ''),
        }
        rna_forms.append(record)
        analyze_form(rna_form, unsupported_codes, record)

        # residues with canonical code `base` minus those that are the plain monomer
        iupac_seq = record['Sequence (IUPAC)']
        for base in 'ACGU':
            canonical_code_freq[base] += \
                iupac_seq.count(base) \
                - rna_form.seq.count(getattr(bpforms.rna_alphabet.monomers, base))

    # save results to tab-separated file
    save_results(rna_forms, ['Amino acid type', 'Anticodon'], out_filename)

    # newline='' lets the csv module control line endings; without it, rows
    # end in '\r\r\n' on platforms that translate '\n' on write
    with open(os.path.join('examples', 'modomics.trna.canonical-code-freq.tsv'),
              'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['Code', 'Frequency'], dialect='excel-tab')
        writer.writeheader()
        for code, freq in canonical_code_freq.items():
            writer.writerow({'Code': code, 'Frequency': freq})

    with open(os.path.join('examples', 'modomics.trna.code-freq.tsv'),
              'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['Code', 'Frequency'], dialect='excel-tab')
        writer.writeheader()
        for code, freq in code_freq.items():
            writer.writerow({'Code': code, 'Frequency': freq})

    return rna_forms, canonical_code_freq, code_freq
def post(self):
    """ Parse a BcForm from the request payload, attach optional per-subunit
    properties, and return the properties that can be computed.

    Reads ``form`` (required) and ``subunits`` (optional) from
    ``bcform_ns.payload``. Each entry of ``subunits`` names a subunit of the
    form and supplies either an ``encoding``/``structure`` pair, or a
    ``formula`` or ``mol_wt`` and optionally a ``charge``.

    The request is aborted with status 400 if the form cannot be parsed or
    fails validation, if a subunit name is not in the form, if only one of
    ``encoding``/``structure`` is given, or if a supplied property cannot be
    set.

    Returns:
        :obj:`dict`: always contains ``form``; contains ``structure``,
            ``formula``, ``mol_wt`` and ``charge`` when they can be computed,
            and ``warnings`` when any were raised
    """
    ret = {}
    warnings = []
    args = bcform_ns.payload

    # get arguments
    form = args['form']
    arg_subunits = args.get('subunits', None)

    # validate form
    try:
        bc_form = bcforms.core.BcForm().from_str(form)
    except Exception as error:
        flask_restplus.abort(400, 'Form is invalid', errors={'form': str(error)})
    errors = bc_form.validate()
    if errors:
        flask_restplus.abort(400, 'Form is invalid', errors={'form': '. '.join(errors)})

    # validate input subunit properties
    # sum_length accumulates len(structure) * stoichiometry over the
    # bpforms-encoded subunits; it gates the structure export below
    sum_length = 0
    if arg_subunits is not None:
        for subunit in arg_subunits:
            # check if name is in the form
            subunit_id = subunit['name']
            if subunit_id in [subunit.id for subunit in bc_form.subunits]:
                # check if encoding and structure are present at the same time
                if ('encoding' in subunit) and ('structure' in subunit):
                    # if encoding and structure both present, check if encoding is known
                    # (an encoding that matches none of the branches below is ignored
                    # without an error)
                    encoding = subunit['encoding'].strip()
                    if encoding == 'bpforms.ProteinForm':
                        try:
                            subunit_structure = bpforms.ProteinForm(
                            ).from_str(subunit['structure'])
                            sum_length += len(
                                subunit_structure
                            ) * bc_form.get_subunit_attribute(
                                subunit_id, 'stoichiometry')
                            bc_form.set_subunit_attribute(
                                subunit_id, 'structure', subunit_structure)
                        except Exception as error:
                            flask_restplus.abort(
                                400,
                                'Unable to parse bpforms.ProteinForm',
                                errors={'structure': str(error)})
                    elif encoding == 'bpforms.DnaForm':
                        try:
                            subunit_structure = bpforms.DnaForm().from_str(
                                subunit['structure'])
                            sum_length += len(
                                subunit_structure
                            ) * bc_form.get_subunit_attribute(
                                subunit_id, 'stoichiometry')
                            bc_form.set_subunit_attribute(
                                subunit_id, 'structure', subunit_structure)
                        except Exception as error:
                            flask_restplus.abort(
                                400,
                                'Unable to parse bpforms.DnaForm',
                                errors={'structure': str(error)})
                    elif encoding == 'bpforms.RnaForm':
                        try:
                            subunit_structure = bpforms.RnaForm().from_str(
                                subunit['structure'])
                            sum_length += len(
                                subunit_structure
                            ) * bc_form.get_subunit_attribute(
                                subunit_id, 'stoichiometry')
                            bc_form.set_subunit_attribute(
                                subunit_id, 'structure', subunit_structure)
                        except Exception as error:
                            flask_restplus.abort(
                                400,
                                'Unable to parse bpforms.RnaForm',
                                errors={'structure': str(error)})
                    # SMILES structures are passed through as strings and do not
                    # count toward sum_length
                    elif encoding == 'smiles' or encoding == 'SMILES' or encoding == 'smi' or encoding == 'SMI':
                        try:
                            bc_form.set_subunit_attribute(
                                subunit_id, 'structure', subunit['structure'])
                        except Exception as error:
                            flask_restplus.abort(
                                400,
                                'Unable to parse SMILES string',
                                errors={'structure': str(error)})
                # else if one is present but not the other, report error
                elif ('encoding' in subunit) ^ ('structure' in subunit):
                    flask_restplus.abort(
                        400,
                        'One of encoding and structure is present but not both'
                    )
                # when neither encoding nor structure is present
                else:
                    # check formula (mol_wt is only considered when no formula is given)
                    if 'formula' in subunit:
                        try:
                            bc_form.set_subunit_attribute(
                                subunit_id, 'formula', subunit['formula'])
                        except Exception as error:
                            flask_restplus.abort(
                                400,
                                'Unable to parse formula',
                                errors={'formula': str(error)})
                    elif 'mol_wt' in subunit:
                        try:
                            bc_form.set_subunit_attribute(
                                subunit_id, 'mol_wt', subunit['mol_wt'])
                        except Exception as error:
                            flask_restplus.abort(
                                400,
                                'Unable to parse mol_wt',
                                errors={'mol_wt': str(error)})
                    # check charge
                    if 'charge' in subunit:
                        try:
                            bc_form.set_subunit_attribute(
                                subunit_id, 'charge', subunit['charge'])
                        except Exception as error:
                            flask_restplus.abort(
                                400,
                                'Unable to parse charge',
                                errors={'charge': str(error)})
            else:
                flask_restplus.abort(400,
                                     'Subunit name not in BcForm',
                                     errors={'subunit': subunit_id})

    ret['form'] = str(bc_form)

    # only attempt to export the structure when the total length is within the limit
    if sum_length <= max_len_get_structure:
        try:
            ret['structure'] = bc_form.export()
        except Exception:
            # best effort: 'structure' is left out of the response if export fails
            pass
    else:
        warnings.append(
            'The sum of length of bpforms-encoded subunits is {}, which exceeds the max length limit {}.'
            .format(sum_length, max_len_get_structure))
        ret['structure'] = None

    # each derived property is best effort: it is left out of the response if
    # it cannot be computed
    try:
        ret['formula'] = str(bc_form.get_formula())
    except Exception:
        pass
    try:
        ret['mol_wt'] = bc_form.get_mol_wt()
    except Exception:
        pass
    try:
        ret['charge'] = bc_form.get_charge()
    except Exception:
        pass

    if len(warnings) > 0:
        ret['warnings'] = ' '.join(warnings)

    return ret