예제 #1
0
파일: curate.py 프로젝트: bwbai/bpforms
def liftover_mods(ref_seq, ref_nc_seq, seq, can_monomers=can_monomers):
    """Carry the non-canonical monomers of a reference over to an aligned sequence.

    The alignment is walked column by column. ``'-'`` is treated as a gap:
    gap columns of ``seq`` add nothing to the result, and gap columns of
    ``ref_seq`` do not consume a monomer of the parsed ``ref_nc_seq``.

    Args:
        ref_seq: aligned reference sequence; ``'-'`` marks a gap
        ref_nc_seq: string parsed with ``bpforms.RnaForm().from_str``; its
            monomers are paired, in order, with the non-gap columns of
            ``ref_seq``
        seq: aligned sequence to build a form for; it is indexed with the
            column positions of ``ref_seq``
        can_monomers: container of the monomers regarded as canonical; a
            reference monomer is only carried over when it is *not* in this
            container

    Returns:
        the ``bpforms.RnaForm`` assembled for ``seq``

    Raises:
        AssertionError: if the canonical sequence of the assembled form is
            not equal to ``seq`` with its gaps removed
    """
    ref_nc_form = bpforms.RnaForm().from_str(ref_nc_seq)
    lifted = bpforms.RnaForm()

    nc_pos = 0  # index of the next unconsumed monomer of ref_nc_form
    for column, ref_char in enumerate(ref_seq):
        # monomer of the curated reference paired with this column, or None
        # once the curated reference has been used up
        if nc_pos < len(ref_nc_form.seq):
            aligned_nc_monomer = ref_nc_form.seq[nc_pos]
        else:
            aligned_nc_monomer = None

        # only non-gap reference columns consume a curated monomer
        if ref_char != '-':
            nc_pos += 1

        char = seq[column]
        if char == '-':
            continue

        if aligned_nc_monomer not in can_monomers and char == ref_char:
            # same residue as the reference and the reference is modified
            # here: reuse the reference's monomer
            lifted.seq.append(aligned_nc_monomer)
            continue

        if char == 'N':
            lifted.seq.append(bpforms.Monomer(id='N'))
        else:
            lifted.seq.append(bpforms.rna_alphabet.monomers.get(char))

    # verify non-canonical sequences are consistent with the canonical sequences
    assert seq.replace('-', '') == lifted.get_canonical_seq()

    return lifted
예제 #2
0
파일: modomics.py 프로젝트: bwbai/bpforms
def analyze_form(rna_form, unsupported_codes, results_dict):
    """Record summary statistics of ``rna_form`` in ``results_dict``.

    The sequence strings, the length and the modification counts are always
    written. When ``unsupported_codes`` is non-empty only an error message
    naming those codes is added; otherwise the formula, molecular weight and
    charge of the form, of its canonical counterpart and of their difference
    are added, together with the joined output of ``rna_form.validate()``.

    Args:
        rna_form: form to analyze
        unsupported_codes: iterable of code strings that could not be mapped
            to monomers; joined into the error message
        results_dict: dictionary that is updated in place

    Returns:
        None
    """
    monomers = bpforms.rna_alphabet.monomers

    results_dict['Sequence (BpForms)'] = str(rna_form)
    canonical_seq = rna_form.get_canonical_seq()
    results_dict['Sequence (IUPAC)'] = canonical_seq
    results_dict['Length'] = len(rna_form.seq)

    # occurrences of each plain A/C/G/U monomer in the form
    unmodified = {
        base: rna_form.seq.count(getattr(monomers, base))
        for base in 'ACGU'
    }

    # everything that is not one of the four plain monomers is a modification
    results_dict['Number of modifications'] = len(rna_form.seq) - sum(unmodified.values())
    for base in 'ACGU':
        # residues whose canonical letter is `base` minus the plain ones
        results_dict['Number of modified {}'.format(base)] = \
            canonical_seq.count(base) - unmodified[base]

    if unsupported_codes:
        results_dict['BpForms errors'] = 'MODOMICS sequence uses monomeric forms {}'.format(
            ', '.join(unsupported_codes))
        return

    results_dict['Formula'] = str(rna_form.get_formula())
    results_dict['Molecular weight'] = rna_form.get_mol_wt()
    results_dict['Charge'] = rna_form.get_charge()

    canonical_form = bpforms.RnaForm().from_str(canonical_seq)
    results_dict['Canonical formula'] = str(canonical_form.get_formula())
    results_dict['Canonical molecular weight'] = canonical_form.get_mol_wt()
    results_dict['Canonical charge'] = canonical_form.get_charge()

    # contribution of the modifications: form minus canonical counterpart
    extra_formula = rna_form.get_formula() - canonical_form.get_formula()
    results_dict['Extra formula'] = str(extra_formula)
    extra_mol_wt = rna_form.get_mol_wt() - canonical_form.get_mol_wt()
    results_dict['Extra molecular weight'] = extra_mol_wt
    extra_charge = rna_form.get_charge() - canonical_form.get_charge()
    results_dict['Extra charge'] = extra_charge

    results_dict['BpForms errors'] = ' '.join(rna_form.validate())
예제 #3
0
 def test_serialize_bpforms(self):
     """A structure wrapping an RNA form serializes as ``bpforms/rna: <sequence>``."""
     canonical = 'ACGU'
     structure = obj_tables.chem.ChemicalStructure(
         bpforms.RnaForm().from_str(canonical))

     serialized = structure.serialize()
     expected = '{}/{}: {}'.format('bpforms', 'rna', canonical)
     self.assertEqual(serialized, expected)
예제 #4
0
tot_rna = 375000
doubling_time = 45.  # min
half_life = 45.  # min

# copies per cell summed per non-canonical monomer
monomer_freq = {}
# copies per cell summed per canonical code of the non-canonical monomers
can_monomer_freq = {}
# reverse lookup of the RNA alphabet: monomer -> code
monomer_codes = {
    monomer: code
    for code, monomer in bpforms.rna_alphabet.monomers.items()
}
# copies per cell summed over every row of the table
tot_copies = 0

# newline='' hands line-ending handling to the csv module, as its
# documentation requires; without it, newlines embedded in quoted fields are
# not interpreted correctly
with open('examples/modomics_trna_copy_numbers.csv', 'r', newline='') as file:
    for rna in csv.DictReader(file, dialect='excel'):
        form = bpforms.RnaForm().from_str(rna['Sequence (BpForms)'])
        copies = float(rna['Copies per cell'])

        tot_copies += copies

        for monomer in form.seq:
            # only non-canonical monomers are tallied
            if monomer in canonical_monomers:
                continue

            # weight every occurrence by the copy number of its RNA
            monomer_freq[monomer] = monomer_freq.get(monomer, 0) + copies

            can_code = monomer.get_canonical_code(monomer_codes)
            can_monomer_freq[can_code] = can_monomer_freq.get(can_code, 0) + copies
예제 #5
0
파일: curate.py 프로젝트: bwbai/bpforms
    # verify non-canonical sequences are consistent with the canonical sequences
    assert seq.replace('-', '') == form.get_canonical_seq()

    return form


'''
rRNA
'''
# rRNA species to process; each one has its own FASTA alignment file
rrna_types = ['5.8S', '18S', '28S']

for rrna_type in rrna_types:
    # read every record of the alignment as a plain string
    filename = 'examples/homo_sapiens_rna/{} all seqs.fasta'.format(rrna_type)
    seqs = [str(record.seq) for record in SeqIO.parse(filename, "fasta")]

    # first record: reference given as a BpForms string (it is parsed below)
    ref_nc_seq = seqs[0]
    # NOTE(review): ref_form is not used again in this span; the parse
    # presumably only checks that the first record is valid -- confirm
    ref_form = bpforms.RnaForm().from_str(ref_nc_seq)
    # second record: reference sequence handed to liftover_mods as `ref_seq`
    ref_seq = seqs[1]
    # drop the first record only; the second record stays in the list, so it
    # is lifted over against itself as well
    seqs = seqs[1:]

    # map curated modifications onto sequences
    forms = []
    for seq in seqs:
        forms.append(liftover_mods(ref_seq, ref_nc_seq, seq))

    # save non-canonical sequences, one form per line
    with open(
            'examples/homo_sapiens_rna/{} nc alignment.txt'.format(rrna_type),
            'w') as file:
        for form in forms:
            file.write(str(form) + '\n')
'''
예제 #6
0
파일: modomics.py 프로젝트: bwbai/bpforms
def run_rrna(session, modomics_short_code_to_monomer, monomer_codes, out_filename):
    """Download the rRNA table, convert each sequence to a form and save the results.

    Args:
        session: object whose ``get(url, params=...)`` returns a response with
            ``raise_for_status()`` and ``text``
        modomics_short_code_to_monomer: dictionary that maps sequence codes to
            monomers
        monomer_codes: dictionary that is updated in place with every
            code -> monomer pair that was successfully looked up
        out_filename: passed to ``save_results`` as the output location

    Returns:
        list of dictionaries, one per table row, holding the row's metadata
        and the entries added by ``analyze_form``

    Raises:
        Exception: if the sequence cell contains a child that is neither
            text, a ``span`` nor an ``a`` element
    """
    query = {
        'RNA_type': 'rRNA',
        'RNA_subtype': 'all',
        'organism': 'all species',
        'vis_type': 'Modomics symbols',
    }
    response = session.get(URL, params=query)
    response.raise_for_status()

    doc = bs4.BeautifulSoup(response.text, 'lxml')
    rows = doc.find('table', {'id': 'tseq'}).find('tbody').find_all('tr')

    rna_forms = []
    for row in rows:
        if not isinstance(row, bs4.element.Tag):
            continue

        cells = row.find_all('td')
        seq_cell = cells[5]

        rna_form = bpforms.RnaForm()
        unsupported_codes = set()
        for child in seq_cell.children:
            # collect the codes this child contributes
            if child.name is None or child.name == 'span':
                # text: every character is a code; '-' and '_' are dropped
                text = str(child) if child.name is None else child.text
                codes = text.strip().replace('-', '').replace('_', '')
            elif child.name == 'a':
                # link: the code is what is left of the href after the prefix
                codes = [child.get('href').replace('/modomics/modifications/', '')]
            else:
                raise Exception('Unsupported child {}'.format(child.name))

            for code in codes:
                monomer = modomics_short_code_to_monomer.get(code, None)
                if monomer is not None:
                    monomer_codes[code] = monomer
                else:
                    # unknown code: remember it and use a placeholder monomer
                    unsupported_codes.add(code)
                    monomer = bpforms.Monomer(id=code)
                rna_form.seq.append(monomer)

        record = {
            'GenBank': cells[0].find('a').text.strip(),
            'Organism': cells[3].text.strip(),
            'Organellum': cells[4].text.strip(),
            'Type': cells[2].text.strip(),
            'Sequence (MODOMICS)': seq_cell.text.strip().replace('-', '').replace('_', ''),
        }
        rna_forms.append(record)
        analyze_form(rna_form, unsupported_codes, record)

    # save results to tab-separated file
    save_results(rna_forms, ['GenBank', 'Type'], out_filename)

    return rna_forms
예제 #7
0
파일: modomics.py 프로젝트: bwbai/bpforms
def _write_code_freq_tsv(filename, freqs):
    """Write a code -> frequency dictionary as a two-column tab-separated file."""
    # newline='' leaves line endings to the csv writer; without it the
    # '\r\n' terminator of the dialect is translated again on Windows and
    # every row ends in '\r\r\n'
    with open(filename, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['Code', 'Frequency'], dialect='excel-tab')
        writer.writeheader()
        for code, freq in freqs.items():
            writer.writerow({'Code': code, 'Frequency': freq})


def run_trna(session, modomics_short_code_to_monomer, monomer_codes, out_filename):
    """Download the tRNA table, convert each sequence to a form and save the results.

    Besides the per-sequence results, two frequency tables are written to the
    ``examples`` directory: ``modomics.trna.canonical-code-freq.tsv`` and
    ``modomics.trna.code-freq.tsv``.

    Args:
        session: object whose ``get(url, params=...)`` returns a response with
            ``raise_for_status()`` and ``text``
        modomics_short_code_to_monomer: dictionary that maps sequence codes to
            monomers
        monomer_codes: dictionary that is updated in place with every
            code -> monomer pair that was successfully looked up
        out_filename: passed to ``save_results`` as the output location

    Returns:
        tuple of

        * list of dictionaries, one per table row, holding the row's metadata
          and the entries added by ``analyze_form``
        * dictionary that maps each of ``A``, ``C``, ``G``, ``U`` to the number
          of residues with that canonical letter that are not the plain monomer
        * dictionary that maps each successfully looked-up code to the number
          of times it occurred

    Raises:
        Exception: if the sequence cell contains a child that is neither
            text, a ``span`` nor an ``a`` element
    """
    response = session.get(URL, params={
        'RNA_type': 'tRNA',
        'RNA_subtype': 'all',
        'organism': 'all species',
        'vis_type': 'Modomics symbols',
    })
    response.raise_for_status()

    doc = bs4.BeautifulSoup(response.text, 'lxml')
    table = doc.find('table', {'id': 'tseq'})
    tbody = table.find('tbody')
    rows = tbody.find_all('tr')
    rna_forms = []

    code_freq = {}
    canonical_code_freq = {'A': 0, 'C': 0, 'G': 0, 'U': 0}
    for row in rows:
        cells = row.find_all('td')

        rna_form = bpforms.RnaForm()
        unsupported_codes = set()
        for child in cells[5].children:
            # collect the codes this child contributes
            if child.name is None or child.name == 'span':
                # text: every character is a code; '-' and '_' are dropped
                text = str(child) if child.name is None else child.text
                codes = text.strip().replace('-', '').replace('_', '')
            elif child.name == 'a':
                # link: the code is what is left of the href after the prefix
                codes = [child.get('href').replace('/modomics/modifications/', '')]
            else:
                raise Exception('Unsupported child {}'.format(child.name))

            for code in codes:
                monomer = modomics_short_code_to_monomer.get(code, None)
                if monomer is None:
                    # unknown code: remember it and use a placeholder monomer;
                    # unknown codes are not counted in code_freq
                    unsupported_codes.add(code)
                    monomer = bpforms.Monomer(id=code)
                else:
                    monomer_codes[code] = monomer
                    code_freq[code] = code_freq.get(code, 0) + 1
                rna_form.seq.append(monomer)

        record = {
            'Amino acid type': cells[1].text.strip(),
            'Anticodon': cells[2].text.strip(),
            'Organism': cells[3].text.strip(),
            'Organellum': cells[4].text.strip(),
            'Sequence (MODOMICS)': cells[5].text.strip().replace('-', '').replace('_', ''),
        }
        rna_forms.append(record)
        analyze_form(rna_form, unsupported_codes, record)

        # residues whose canonical letter is `base` minus the plain monomers
        for base in 'ACGU':
            canonical_code_freq[base] += \
                record['Sequence (IUPAC)'].count(base) \
                - rna_form.seq.count(getattr(bpforms.rna_alphabet.monomers, base))

    # save results to tab-separated file
    save_results(rna_forms, ['Amino acid type', 'Anticodon'], out_filename)

    _write_code_freq_tsv(
        os.path.join('examples', 'modomics.trna.canonical-code-freq.tsv'),
        canonical_code_freq)
    _write_code_freq_tsv(
        os.path.join('examples', 'modomics.trna.code-freq.tsv'),
        code_freq)

    return rna_forms, canonical_code_freq, code_freq
예제 #8
0
파일: rest.py 프로젝트: KarrLab/bcforms
    def post(self):
        """Validate a BcForm request and report the properties of the complex.

        The payload is read from ``bcform_ns.payload``. It must contain
        ``form`` and may contain ``subunits``, a list of dictionaries that are
        each identified by ``name`` and that carry either

        * ``encoding`` together with ``structure``, or
        * ``formula`` or ``mol_wt``, and optionally ``charge``.

        Every validation or parsing failure ends the request through
        ``flask_restplus.abort`` with status 400.

        Returns:
            dictionary with ``form`` and, where they could be computed,
            ``structure``, ``formula``, ``mol_wt`` and ``charge``.
            ``structure`` is ``None`` and ``warnings`` is set when the summed
            length of the bpforms-encoded subunits exceeds
            ``max_len_get_structure``.
        """
        ret = {}
        warnings = []

        # get arguments
        args = bcform_ns.payload
        form = args['form']
        arg_subunits = args.get('subunits', None)

        # validate form
        try:
            bc_form = bcforms.core.BcForm().from_str(form)
        except Exception as error:
            flask_restplus.abort(400, 'Form is invalid',
                                 errors={'form': str(error)})

        errors = bc_form.validate()
        if errors:
            flask_restplus.abort(400, 'Form is invalid',
                                 errors={'form': '. '.join(errors)})

        # encoding name -> (form class, message reported when parsing fails)
        bpforms_encodings = {
            'bpforms.ProteinForm': (bpforms.ProteinForm,
                                    'Unable to parse bpforms.ProteinForm'),
            'bpforms.DnaForm': (bpforms.DnaForm,
                                'Unable to parse bpforms.DnaForm'),
            'bpforms.RnaForm': (bpforms.RnaForm,
                                'Unable to parse bpforms.RnaForm'),
        }
        smiles_encodings = ('smiles', 'SMILES', 'smi', 'SMI')

        def set_or_abort(subunit_id, subunit, attribute, message):
            # copy subunit[attribute] onto the form; abort with `message` on failure
            try:
                bc_form.set_subunit_attribute(subunit_id, attribute,
                                              subunit[attribute])
            except Exception as error:
                flask_restplus.abort(400, message,
                                     errors={attribute: str(error)})

        # validate input subunit properties
        sum_length = 0
        for subunit in (arg_subunits if arg_subunits is not None else ()):
            # check if name is in the form
            subunit_id = subunit['name']
            if subunit_id not in [member.id for member in bc_form.subunits]:
                flask_restplus.abort(400, 'Subunit name not in BcForm',
                                     errors={'subunit': subunit_id})
                # abort() is expected to raise; `continue` makes the skip explicit
                continue

            has_encoding = 'encoding' in subunit
            has_structure = 'structure' in subunit

            if has_encoding and has_structure:
                # both present: act on the encodings that are known; any other
                # encoding is left untouched
                encoding = subunit['encoding'].strip()
                if encoding in bpforms_encodings:
                    form_cls, parse_error = bpforms_encodings[encoding]
                    try:
                        subunit_structure = form_cls().from_str(
                            subunit['structure'])
                        # length counts once per copy of the subunit
                        length = len(subunit_structure)
                        stoichiometry = bc_form.get_subunit_attribute(
                            subunit_id, 'stoichiometry')
                        sum_length += length * stoichiometry
                        bc_form.set_subunit_attribute(
                            subunit_id, 'structure', subunit_structure)
                    except Exception as error:
                        flask_restplus.abort(
                            400, parse_error,
                            errors={'structure': str(error)})
                elif encoding in smiles_encodings:
                    set_or_abort(subunit_id, subunit, 'structure',
                                 'Unable to parse SMILES string')

            elif has_encoding or has_structure:
                # exactly one of the two is present
                flask_restplus.abort(
                    400,
                    'One of encoding and structure is present but not both'
                )

            else:
                # neither is present: formula takes precedence over mol_wt
                if 'formula' in subunit:
                    set_or_abort(subunit_id, subunit, 'formula',
                                 'Unable to parse formula')
                elif 'mol_wt' in subunit:
                    set_or_abort(subunit_id, subunit, 'mol_wt',
                                 'Unable to parse mol_wt')

                # charge is independent of formula / mol_wt
                if 'charge' in subunit:
                    set_or_abort(subunit_id, subunit, 'charge',
                                 'Unable to parse charge')

        ret['form'] = str(bc_form)

        if sum_length > max_len_get_structure:
            warnings.append(
                'The sum of length of bpforms-encoded subunits is {}, which exceeds the max length limit {}.'
                .format(sum_length, max_len_get_structure))
            ret['structure'] = None
        else:
            try:
                ret['structure'] = bc_form.export()
            except Exception:
                pass

        # best effort: a property that cannot be computed is left out
        properties = (
            ('formula', lambda: str(bc_form.get_formula())),
            ('mol_wt', lambda: bc_form.get_mol_wt()),
            ('charge', lambda: bc_form.get_charge()),
        )
        for key, compute in properties:
            try:
                ret[key] = compute()
            except Exception:
                pass

        if warnings:
            ret['warnings'] = ' '.join(warnings)

        return ret