def calc_bpforms_props_with_python_api(seqs):
    """ Calculate properties of protein sequences with the BpForms Python API

    Args:
        seqs (:obj:`list` of :obj:`dict`): sequence records; each record is read
            through its ``'id'`` and ``'seq'`` keys, and ``'seq'`` is parsed with
            :obj:`bpforms.ProteinForm.from_str`

    Returns:
        :obj:`list` of :obj:`dict`: one dictionary per input record, in input order,
            with the keys ``'Species'``, ``'Formula'``, ``'Molecular weight'``,
            ``'Charge'``, and ``'Length'``
    """
    def summarize(record):
        # parse the sequence once and read every property from the parsed form
        parsed = bpforms.ProteinForm().from_str(record['seq'])
        return {
            'Species': record['id'],
            'Formula': parsed.get_formula(),
            'Molecular weight': parsed.get_mol_wt(),
            'Charge': parsed.get_charge(),
            'Length': len(parsed.seq),
        }

    return [summarize(record) for record in seqs]
Beispiel #2
0
def _new_identified_monomer(template, namespace, identifier):
    """ Create a monomeric form that is labeled only by an external identifier

    Args:
        template (:obj:`bpforms.Monomer`): monomeric form to copy (through
            ``to_dict``/``from_dict`` with the protein alphabet), or :obj:`None`
            to start from an empty monomeric form
        namespace (:obj:`str`): namespace of the identifier (e.g., ``'chebi'``)
        identifier (:obj:`str`): id of the monomeric form within the namespace

    Returns:
        :obj:`bpforms.Monomer`: new monomeric form whose id, name, and synonyms
            are cleared and whose only identifier is :obj:`identifier`
    """
    if template is None:
        monomer = bpforms.Monomer()
    else:
        monomer = bpforms.Monomer().from_dict(
            template.to_dict(alphabet=bpforms.protein_alphabet),
            alphabet=bpforms.protein_alphabet)
    monomer.id = None
    monomer.name = None
    monomer.synonyms = []
    monomer.identifiers = [bpforms.Identifier(namespace, identifier)]
    return monomer


def gen_bpform(protein,
               pro_ids_to_bpform_monomers,
               monomer_codes,
               apply_processing=True,
               apply_modifications=True,
               include_annotations=True):
    """ Generate BpForm for a modified protein in PRO

    In addition to returning the form, this function writes results into
    :obj:`protein`:

    * ``protein['modified_errors']`` and ``protein['modified_concrete']`` are set
      when :obj:`apply_modifications` is :obj:`True`
    * ``protein['crosslinks']`` and ``protein['deletions']`` are set when
      ``protein['processing']`` is not empty

    Args:
        protein (:obj:`dict`): term for modified protein; read through its ``'seq'``,
            ``'processing'``, and ``'modifications'`` keys
        pro_ids_to_bpform_monomers (:obj:`dict`): dictionary which maps ids of monomeric forms
            used by PRO to monomeric forms in the BpForms protein alphabet
        monomer_codes (:obj:`dict`): dictionary that maps monomers to their codes in the alphabet
        apply_processing (:obj:`bool`, optional): if :obj:`True`, include processing in proteoform
        apply_modifications (:obj:`bool`, optional): if :obj:`True`, include modifications in proteoform
        include_annotations (:obj:`bool`, optional): if :obj:`True`, include metadata about modified monomers

    Returns:
        :obj:`bpforms.ProteinForm`: BpForm for a term in PRO

    Raises:
        :obj:`ValueError`: if the id of a modification does not start with one of
            ``CHEBI:``, ``MOD:``, ``PR:``, or ``UniCarbKB:``
    """
    form = bpforms.ProteinForm()
    monomers = bpforms.protein_alphabet.monomers

    # generate BpForm for unmodified sequence
    for base in protein['seq']:
        form.seq.append(monomers[base])

    # apply processing: keep only the 1-based, inclusive ranges listed in
    # `protein['processing']` and translate the positions of the modifications
    # into coordinates of the processed sequence
    modifications = copy.deepcopy(protein['modifications'])
    seq = protein['seq']
    if apply_processing and protein['processing']:
        processed_seq = []
        seq_parts = []
        for processing in protein['processing']:
            processed_seq.extend(
                form.seq[processing['start'] - 1:processing['end']])
            seq_parts.append(
                protein['seq'][processing['start'] - 1:processing['end']])
        form.seq = processed_seq
        seq = ''.join(seq_parts)

        for modification in modifications:
            processed_positions = []
            for position in modification['positions']:
                # length of the processed segments that precede `processing`
                seq_len = 0
                for processing in protein['processing']:
                    if processing['start'] <= position <= processing['end']:
                        processed_positions.append(
                            seq_len + position - processing['start'] + 1)
                        break
                    seq_len += processing['end'] - processing['start'] + 1
                # positions outside every processed range are dropped
            modification['processed_positions'] = processed_positions
    else:
        for modification in modifications:
            modification['processed_positions'] = modification['positions']

    # apply modifications
    concrete = True
    if apply_modifications:
        protein['modified_errors'] = []

        for modification in modifications:
            monomer_id = modification['monomer']
            monomer = pro_ids_to_bpform_monomers[monomer_id]['mod']
            origin = pro_ids_to_bpform_monomers[monomer_id]['origin']

            # namespace of the identifier, determined by its prefix
            for prefix, namespace in (('CHEBI:', 'chebi'),
                                      ('MOD:', 'mod'),
                                      ('PR:', 'pr'),
                                      ('UniCarbKB:', 'unicarbkb')):
                if monomer_id.startswith(prefix):
                    mod_ns = namespace
                    break
            else:
                raise ValueError(
                    'Unsupported identifier {}'.format(monomer_id))

            if monomer_id == 'PR:000026291' or monomer_id.startswith('CHEBI:'):
                # these ids are represented by a copy of the unmodified residue
                # (or by an empty monomeric form) that carries only the identifier
                monomer = _new_identified_monomer(
                    monomers[modification['residue']]
                    if include_annotations else None,
                    mod_ns, monomer_id)
                monomer.comments = None

            elif monomer is None:
                # no BpForms monomeric form is known for this id
                concrete = False

                monomer = bpforms.Monomer(
                    identifiers=[bpforms.Identifier(mod_ns, monomer_id)])

            if modification['positions']:
                # the positions of the modification are known
                for position in modification['processed_positions']:
                    if form.seq[position - 1] == monomers[seq[position - 1]]:
                        if monomer not in monomers.values():
                            monomer.base_monomers = [form.seq[position - 1]]
                        form.seq[position - 1] = monomer
                    else:
                        # the position no longer holds the unmodified residue
                        protein['modified_errors'].append(
                            'Unable to set monomeric form at position {}'.
                            format(position))

            elif modification['residue']:
                # only the modified residue is known; its position is uncertain
                concrete = False
                residue = modification['residue']

                monomer2 = _new_identified_monomer(
                    monomer if include_annotations else None,
                    mod_ns, monomer_id)
                # the base monomer is the residue being modified; `positions`
                # is empty in this branch and cannot serve as the lookup key
                monomer2.base_monomers = [monomers.get(residue)]

                start_position = seq.find(residue) + 1
                if start_position == 0:
                    # `str.find` returned -1: without this guard the loop below
                    # would index `form.seq[-1]` and modify the last residue
                    protein['modified_errors'].append(
                        'Sequence does not contain residue {} for modification {}'
                        .format(residue, monomer_id))
                else:
                    monomer2.start_position = start_position
                    monomer2.end_position = seq.rfind(residue) + 1

                    # place the monomeric form at the first position in the
                    # range that still holds its unmodified residue
                    set_monomer = False
                    for i_monomer in range(monomer2.start_position,
                                           monomer2.end_position + 1):
                        if form.seq[i_monomer - 1] == monomers[seq[i_monomer - 1]]:
                            set_monomer = True
                            form.seq[i_monomer - 1] = monomer2
                            break
                    if not set_monomer:
                        protein['modified_errors'].append(
                            'Unable to set monomeric form')

            else:
                # neither the positions nor the residue are known
                concrete = False

                canonical_code = monomer.get_canonical_code(monomer_codes)
                monomer2 = _new_identified_monomer(
                    monomer if include_annotations else None,
                    mod_ns, monomer_id)
                # `origin` may be falsy (see the `elif origin` test below)
                monomer2.monomers_position = [
                    monomers.get(code) for code in (origin or ())
                ]

                if canonical_code and canonical_code != '?':
                    start_position = seq.find(canonical_code) + 1
                    end_position = seq.rfind(canonical_code) + 1
                    if start_position == 0:
                        protein['modified_errors'].append(
                            'Sequence does not contain residue {} for modification {}'
                            .format(canonical_code, monomer_id))
                    else:
                        monomer2.start_position = start_position
                        monomer2.end_position = end_position

                elif origin:
                    # span from the first to the last occurrence of any of the
                    # residues from which the modification can originate
                    start_position = float('inf')
                    end_position = -float('inf')
                    for base in origin:
                        start_pos = seq.find(base) + 1
                        if start_pos > 0:
                            start_position = min(start_position, start_pos)

                        end_pos = seq.rfind(base) + 1
                        if end_pos > 0:
                            end_position = max(end_position, end_pos)

                    if numpy.isinf(start_position):
                        protein['modified_errors'].append(
                            'Sequence does not contain residues {} for modification {}'
                            .format(', '.join(origin), monomer_id))
                    else:
                        monomer2.start_position = start_position
                        monomer2.end_position = end_position

                else:
                    # nothing is known about the location: span the whole sequence
                    monomer2.start_position = 1
                    monomer2.end_position = len(seq)

                if monomer2.start_position:
                    set_monomer = False
                    for i_monomer in range(monomer2.start_position,
                                           monomer2.end_position + 1):
                        if form.seq[i_monomer - 1] == monomers[seq[i_monomer - 1]]:
                            monomer2.base_monomers = [
                                monomers.get(seq[i_monomer - 1])
                            ]
                            form.seq[i_monomer - 1] = monomer2
                            set_monomer = True
                            break
                    if not set_monomer:
                        protein['modified_errors'].append(
                            'Unable to set monomeric form')

    # crosslinks: consecutive processed segments that are adjacent in the
    # original sequence are recorded as crosslinks; gaps are recorded as deletions
    # NOTE(review): this runs even when `apply_processing` is False, although the
    # ranges in the bond comment are in processed coordinates -- confirm intent
    if protein['processing']:
        seq_len = 0
        protein['crosslinks'] = []
        protein['deletions'] = []
        for left, right in zip(protein['processing'][0:-1],
                               protein['processing'][1:]):
            left_len = left['end'] - left['start'] + 1
            right_len = right['end'] - right['start'] + 1
            seq_len += left_len

            if left['end'] + 1 != right['start']:
                protein['deletions'].append(
                    (left['end'] + 1, right['start'] - 1))
                continue

            protein['crosslinks'].append(
                ((left['end'], protein['seq'][left['end'] - 1]),
                 (right['start'], protein['seq'][right['start'] - 1])))

            concrete = False

            # ranges of the two segments in coordinates of the processed sequence
            i_left = '{}-{}'.format(seq_len - left_len + 1, seq_len)
            i_right = '{}-{}'.format(seq_len + 1, seq_len + right_len)
            if apply_modifications:
                # only a comment is attached to the bond; no bond or displaced
                # atoms are specified
                form.crosslinks.add(
                    bpforms.Bond(
                        comments=
                        'The polymer contains a disulfide bond between the ranges {} and {}'
                        .format(i_left, i_right), ))

    # validate
    if apply_modifications:
        protein['modified_concrete'] = concrete
        protein['modified_errors'].extend(form.validate())

    # return proteoform represented with BpForms
    return form
Beispiel #3
0
    def post(self):
        """ Validate a BcForm and calculate its properties

        Reads the form (``'form'``) and the optional list of subunit properties
        (``'subunits'``) from ``bcform_ns.payload``. Each subunit entry is
        identified by ``'name'`` and may provide either ``'encoding'`` together
        with ``'structure'``, or ``'formula'``/``'mol_wt'`` and ``'charge'``.
        Invalid input is reported through ``flask_restplus.abort`` with status 400.

        Returns:
            :obj:`dict`: dictionary with the key ``'form'`` and, when they could be
                calculated, ``'structure'``, ``'formula'``, ``'mol_wt'``, and
                ``'charge'``; ``'warnings'`` is included when warnings were raised
        """
        payload = bcform_ns.payload
        form_str = payload['form']
        subunit_specs = payload.get('subunits', None)

        # parse and validate the form
        try:
            bc_form = bcforms.core.BcForm().from_str(form_str)
        except Exception as error:
            flask_restplus.abort(400,
                                 'Form is invalid',
                                 errors={'form': str(error)})

        form_errors = bc_form.validate()
        if form_errors:
            flask_restplus.abort(400,
                                 'Form is invalid',
                                 errors={'form': '. '.join(form_errors)})

        def set_or_abort(name, attribute, value, failure_message):
            # set one attribute of a subunit; report a failure as a 400 response
            # whose error is keyed by the name of the attribute
            try:
                bc_form.set_subunit_attribute(name, attribute, value)
            except Exception as error:
                flask_restplus.abort(400,
                                     failure_message,
                                     errors={attribute: str(error)})

        # apply the properties of the subunits
        total_length = 0
        if subunit_specs is not None:
            # BpForms encodings: class used to parse the structure and the
            # message reported when parsing fails
            polymer_parsers = {
                'bpforms.ProteinForm': (bpforms.ProteinForm,
                                        'Unable to parse bpforms.ProteinForm'),
                'bpforms.DnaForm': (bpforms.DnaForm,
                                    'Unable to parse bpforms.DnaForm'),
                'bpforms.RnaForm': (bpforms.RnaForm,
                                    'Unable to parse bpforms.RnaForm'),
            }
            smiles_aliases = ('smiles', 'SMILES', 'smi', 'SMI')

            for spec in subunit_specs:
                name = spec['name']

                # the subunit must be part of the form
                if name not in [member.id for member in bc_form.subunits]:
                    flask_restplus.abort(400,
                                         'Subunit name not in BcForm',
                                         errors={'subunit': name})
                    continue

                has_encoding = 'encoding' in spec
                has_structure = 'structure' in spec

                # encoding and structure must be provided together
                if has_encoding != has_structure:
                    flask_restplus.abort(
                        400,
                        'One of encoding and structure is present but not both'
                    )
                    continue

                if has_encoding:
                    encoding = spec['encoding'].strip()
                    if encoding in polymer_parsers:
                        form_cls, failure_message = polymer_parsers[encoding]
                        try:
                            structure = form_cls().from_str(spec['structure'])
                            # accumulate the stoichiometry-weighted length of
                            # the BpForms-encoded subunits
                            total_length += len(
                                structure) * bc_form.get_subunit_attribute(
                                    name, 'stoichiometry')
                            bc_form.set_subunit_attribute(
                                name, 'structure', structure)
                        except Exception as error:
                            flask_restplus.abort(
                                400,
                                failure_message,
                                errors={'structure': str(error)})
                    elif encoding in smiles_aliases:
                        set_or_abort(name, 'structure', spec['structure'],
                                     'Unable to parse SMILES string')
                    # any other encoding is ignored
                    continue

                # neither encoding nor structure: formula takes precedence over
                # molecular weight; charge is independent of both
                if 'formula' in spec:
                    set_or_abort(name, 'formula', spec['formula'],
                                 'Unable to parse formula')
                elif 'mol_wt' in spec:
                    set_or_abort(name, 'mol_wt', spec['mol_wt'],
                                 'Unable to parse mol_wt')

                if 'charge' in spec:
                    set_or_abort(name, 'charge', spec['charge'],
                                 'Unable to parse charge')

        # assemble the response
        response = {'form': str(bc_form)}
        notices = []

        if total_length > max_len_get_structure:
            notices.append(
                'The sum of length of bpforms-encoded subunits is {}, which exceeds the max length limit {}.'
                .format(total_length, max_len_get_structure))
            response['structure'] = None
        else:
            try:
                response['structure'] = bc_form.export()
            except Exception:
                pass

        # best effort: a property that cannot be calculated is left out
        calculations = (
            ('formula', lambda: str(bc_form.get_formula())),
            ('mol_wt', lambda: bc_form.get_mol_wt()),
            ('charge', lambda: bc_form.get_charge()),
        )
        for key, calculate in calculations:
            try:
                response[key] = calculate()
            except Exception:
                pass

        if notices:
            response['warnings'] = ' '.join(notices)

        return response
Beispiel #4
0
# Import libraries
import bcforms
import bpforms

# Create complexes from their string representations

# form_1: parsed from '2 * subunit_a + 3 * subunit_b'; the structure of each
# subunit is then set to a protein sequence parsed by BpForms
form_1 = bcforms.BcForm().from_str('2 * subunit_a + 3 * subunit_b')
form_1.set_subunit_attribute('subunit_a', 'structure',
                             bpforms.ProteinForm().from_str('CAAAAAAAA'))
form_1.set_subunit_attribute('subunit_b', 'structure',
                             bpforms.ProteinForm().from_str('AAAAAAAAC'))

# form_2: two copies of subunit_a plus an `x-link` entry of type `disulfide`
# whose left and right sides are `subunit_a(1)-1` and `subunit_a(2)-1`
# (presumably position 1 of the first and second copy -- confirm against the
# BcForms grammar)
form_2 = bcforms.BcForm().from_str(
    '2 * subunit_a'
    '| x-link: [type: disulfide | l: subunit_a(1)-1 | r: subunit_a(2)-1]')
form_2.set_subunit_attribute('subunit_a', 'structure',
                             bpforms.ProteinForm().from_str('CAAAAAAAA'))

# Create complexes programmatically

# form_1_b: built by appending `Subunit` objects instead of parsing a string;
# the positional arguments mirror the id, count, and structure used for form_1
# (presumably id, stoichiometry, structure -- confirm against `Subunit`)
form_1_b = bcforms.BcForm()
form_1_b.subunits.append(
    bcforms.core.Subunit('subunit_a', 2,
                         bpforms.ProteinForm().from_str('CAAAAAAAA')))
form_1_b.subunits.append(
    bcforms.core.Subunit('subunit_b', 3,
                         bpforms.ProteinForm().from_str('AAAAAAAAC')))

# form_2_b: the subunit is kept in a variable before it is appended to the form
form_2_b = bcforms.BcForm()
subunit = bcforms.core.Subunit('subunit_a', 2,
                               bpforms.ProteinForm().from_str('CAAAAAAAA'))
form_2_b.subunits.append(subunit)
form_2_b.crosslinks.append(