def calc_bpforms_props_with_python_api(seqs):
    """ Calculate properties of protein sequences with the BpForms Python API

    Args:
        seqs (:obj:`list` of :obj:`dict`): sequences; the ``id`` and ``seq``
            keys of each entry are read

    Returns:
        :obj:`list` of :obj:`dict`: one dictionary per sequence with the keys
            ``Species``, ``Formula``, ``Molecular weight``, ``Charge``, and
            ``Length``
    """
    def describe(entry):
        # parse the sequence once and query every property from the parsed form
        proteoform = bpforms.ProteinForm().from_str(entry['seq'])
        return {
            'Species': entry['id'],
            'Formula': proteoform.get_formula(),
            'Molecular weight': proteoform.get_mol_wt(),
            'Charge': proteoform.get_charge(),
            'Length': len(proteoform.seq),
        }

    return [describe(entry) for entry in seqs]
def _get_identifier_namespace(pro_id):
    """ Get the BpForms identifier namespace for an id used by PRO

    Args:
        pro_id (:obj:`str`): id of a monomeric form used by PRO

    Returns:
        :obj:`str`: namespace (``'chebi'``, ``'mod'``, ``'pr'``, or ``'unicarbkb'``)

    Raises:
        :obj:`ValueError`: if the prefix of the id is not supported
    """
    for prefix, namespace in (('CHEBI:', 'chebi'), ('MOD:', 'mod'),
                              ('PR:', 'pr'), ('UniCarbKB:', 'unicarbkb')):
        if pro_id.startswith(prefix):
            return namespace
    raise ValueError('Unsupported identifier {}'.format(pro_id))


def _new_inline_monomer(template, namespace, identifier, include_annotations,
                        clear_comments=False):
    """ Create a monomeric form which is identified only by an external identifier

    NOTE(review): the id/name/synonyms/identifiers reset is applied to both the
    copied and the empty monomeric form -- confirm this matches the intended
    behavior when :obj:`include_annotations` is :obj:`True`.

    Args:
        template (:obj:`bpforms.Monomer`): monomeric form which is copied when
            :obj:`include_annotations` is :obj:`True`; not used otherwise
        namespace (:obj:`str`): namespace of the identifier
        identifier (:obj:`str`): id of the monomeric form used by PRO
        include_annotations (:obj:`bool`): if :obj:`True`, start from a copy of
            :obj:`template`; otherwise start from an empty monomeric form
        clear_comments (:obj:`bool`, optional): if :obj:`True`, also clear the
            comments of the new monomeric form

    Returns:
        :obj:`bpforms.Monomer`: new monomeric form
    """
    if include_annotations:
        monomer = bpforms.Monomer().from_dict(
            template.to_dict(alphabet=bpforms.protein_alphabet),
            alphabet=bpforms.protein_alphabet)
    else:
        monomer = bpforms.Monomer()
    monomer.id = None
    monomer.name = None
    monomer.synonyms = []
    monomer.identifiers = [bpforms.Identifier(namespace, identifier)]
    if clear_comments:
        monomer.comments = None
    return monomer


def _place_at_first_unmodified(form, seq, monomers, monomer, set_base_monomer):
    """ Put a monomeric form at the first still-unmodified position of its range

    The range is ``monomer.start_position`` to ``monomer.end_position``
    (1-based, inclusive).

    Args:
        form (:obj:`bpforms.ProteinForm`): proteoform which is edited in place
        seq (:obj:`str`): unmodified sequence that corresponds to :obj:`form`
        monomers (:obj:`dict`): monomeric forms of the protein alphabet
        monomer (:obj:`bpforms.Monomer`): monomeric form to place
        set_base_monomer (:obj:`bool`): if :obj:`True`, set the base monomer of
            :obj:`monomer` to the residue which it replaces

    Returns:
        :obj:`bool`: whether the monomeric form was placed
    """
    for i_monomer in range(monomer.start_position, monomer.end_position + 1):
        if form.seq[i_monomer - 1] == monomers[seq[i_monomer - 1]]:
            if set_base_monomer:
                monomer.base_monomers = [monomers.get(seq[i_monomer - 1])]
            form.seq[i_monomer - 1] = monomer
            return True
    return False


def gen_bpform(protein, pro_ids_to_bpform_monomers, monomer_codes,
               apply_processing=True, apply_modifications=True,
               include_annotations=True):
    """ Generate BpForm for a modified protein in PRO

    In addition to returning the proteoform, this records results in
    :obj:`protein`: ``modified_errors`` and ``modified_concrete`` are set when
    :obj:`apply_modifications` is :obj:`True`, and ``crosslinks`` and
    ``deletions`` are set when the protein has processing.

    Args:
        protein (:obj:`dict`): term for modified protein
        pro_ids_to_bpform_monomers (:obj:`dict`): dictionary which maps ids of
            monomeric forms used by PRO to monomeric forms in the BpForms
            protein alphabet
        monomer_codes (:obj:`dict`): dictionary that maps monomers to their
            codes in the alphabet
        apply_processing (:obj:`bool`, optional): if :obj:`True`, include
            processing in proteoform
        apply_modifications (:obj:`bool`, optional): if :obj:`True`, include
            modifications in proteoform
        include_annotations (:obj:`bool`, optional): if :obj:`True`, include
            metadata about modified monomers

    Returns:
        :obj:`bpforms.ProteinForm`: BpForm for a term in PRO

    Raises:
        :obj:`ValueError`: if a modification uses an unsupported identifier
    """
    form = bpforms.ProteinForm()
    monomers = bpforms.protein_alphabet.monomers

    # generate BpForm for unmodified sequence
    for base in protein['seq']:
        form.seq.append(monomers[base])

    # apply processing; work on a copy so the modifications of the term are
    # not annotated with processed positions
    modifications = copy.deepcopy(protein['modifications'])
    seq = protein['seq']
    if apply_processing and protein['processing']:
        processed_seq = []
        seq_parts = []
        for processing in protein['processing']:
            processed_seq.extend(
                form.seq[processing['start'] - 1:processing['end']])
            seq_parts.append(
                protein['seq'][processing['start'] - 1:processing['end']])
        form.seq = processed_seq
        seq = ''.join(seq_parts)

        # translate positions in the full sequence to positions in the
        # processed sequence; positions outside every retained segment are dropped
        for modification in modifications:
            modification['processed_positions'] = []
            for position in modification['positions']:
                offset = 0
                for processing in protein['processing']:
                    if processing['start'] <= position <= processing['end']:
                        modification['processed_positions'].append(
                            offset + position - processing['start'] + 1)
                        break
                    offset += processing['end'] - processing['start'] + 1
    else:
        for modification in modifications:
            modification['processed_positions'] = modification['positions']

    # apply modifications
    concrete = True
    if apply_modifications:
        protein['modified_errors'] = []

        for modification in modifications:
            pro_id = modification['monomer']
            monomer = pro_ids_to_bpform_monomers[pro_id]['mod']
            origin = pro_ids_to_bpform_monomers[pro_id]['origin']
            mod_ns = _get_identifier_namespace(pro_id)

            if pro_id == 'PR:000026291' or mod_ns == 'chebi':
                # describe the monomeric form by its identifier, optionally
                # starting from a copy of the residue
                if include_annotations:
                    template = monomers[modification['residue']]
                else:
                    template = None
                monomer = _new_inline_monomer(
                    template, mod_ns, pro_id, include_annotations,
                    clear_comments=True)
            elif monomer is None:
                concrete = False
                monomer = bpforms.Monomer(
                    identifiers=[bpforms.Identifier(mod_ns, pro_id)])

            if modification['positions']:
                # positions are known
                for position in modification['processed_positions']:
                    if form.seq[position - 1] == monomers[seq[position - 1]]:
                        if monomer not in monomers.values():
                            monomer.base_monomers = [form.seq[position - 1]]
                        form.seq[position - 1] = monomer
                    else:
                        protein['modified_errors'].append(
                            'Unable to set monomeric form at position {}'.format(
                                position))

            elif modification['residue']:
                # residue is known, but its position is not
                concrete = False
                residue = modification['residue']
                monomer2 = _new_inline_monomer(
                    monomer, mod_ns, pro_id, include_annotations)
                # the base monomer is the residue (positions are empty in this branch)
                monomer2.base_monomers = [monomers.get(residue)]

                start_position = seq.find(residue) + 1
                if start_position == 0:
                    # avoid scanning from position 0, which would wrap around
                    # to the last residue of the sequence
                    protein['modified_errors'].append(
                        'Sequence does not contain residue {} for modification {}'
                        .format(residue, pro_id))
                else:
                    monomer2.start_position = start_position
                    monomer2.end_position = seq.rfind(residue) + 1
                    if not _place_at_first_unmodified(
                            form, seq, monomers, monomer2, False):
                        protein['modified_errors'].append(
                            'Unable to set monomeric form')

            else:
                # neither the residue nor its position is known
                concrete = False
                canonical_code = monomer.get_canonical_code(monomer_codes)
                monomer2 = _new_inline_monomer(
                    monomer, mod_ns, pro_id, include_annotations)
                monomer2.monomers_position = [
                    monomers.get(code) for code in origin]

                if canonical_code and canonical_code != '?':
                    start_position = seq.find(canonical_code) + 1
                    if start_position == 0:
                        protein['modified_errors'].append(
                            'Sequence does not contain residue {} for modification {}'
                            .format(canonical_code, pro_id))
                    else:
                        monomer2.start_position = start_position
                        monomer2.end_position = seq.rfind(canonical_code) + 1

                elif origin:
                    # span from the first to the last occurrence of any
                    # possible origin residue
                    present = [base for base in origin if base in seq]
                    if present:
                        monomer2.start_position = min(
                            seq.find(base) for base in present) + 1
                        monomer2.end_position = max(
                            seq.rfind(base) for base in present) + 1
                    else:
                        protein['modified_errors'].append(
                            'Sequence does not contain residues {} for modification {}'
                            .format(', '.join(origin), pro_id))

                else:
                    monomer2.start_position = 1
                    monomer2.end_position = len(seq)

                if monomer2.start_position:
                    if not _place_at_first_unmodified(
                            form, seq, monomers, monomer2, True):
                        protein['modified_errors'].append(
                            'Unable to set monomeric form')

    # crosslinks
    if protein['processing']:
        seq_len = 0
        protein['crosslinks'] = []
        protein['deletions'] = []
        for left, right in zip(protein['processing'][0:-1],
                               protein['processing'][1:]):
            left_len = left['end'] - left['start'] + 1
            right_len = right['end'] - right['start'] + 1
            seq_len += left_len

            if left['end'] + 1 != right['start']:
                # residues between the two segments are removed
                protein['deletions'].append(
                    (left['end'] + 1, right['start'] - 1))
                continue

            protein['crosslinks'].append(
                ((left['end'], protein['seq'][left['end'] - 1]),
                 (right['start'], protein['seq'][right['start'] - 1])))
            concrete = False

            if apply_modifications:
                # the bond is described only by a comment; no bond or
                # displaced atoms are specified
                i_left = '{}-{}'.format(seq_len - left_len + 1, seq_len)
                i_right = '{}-{}'.format(seq_len + 1, seq_len + right_len)
                form.crosslinks.add(bpforms.Bond(
                    comments='The polymer contains a disulfide bond between the ranges {} and {}'
                    .format(i_left, i_right),
                ))

    # validate
    if apply_modifications:
        protein['modified_concrete'] = concrete
        protein['modified_errors'].extend(form.validate())

    # return proteoform represented with BpForms
    return form
def post(self):
    """ Validate a BcForm and calculate its properties

    The form and the optional properties of its subunits are read from the
    request payload. The request is aborted with status 400 if the form or
    a subunit property is invalid.

    Returns:
        :obj:`dict`: the form (``form``) and, when they can be calculated, its
            ``structure``, ``formula``, ``mol_wt``, and ``charge``; ``warnings``
            is included when the structure was not exported because the
            subunits are too long
    """
    ret = {}
    messages = []
    payload = bcform_ns.payload

    # get arguments
    form_str = payload['form']
    subunit_specs = payload.get('subunits', None)

    # validate form
    try:
        bc_form = bcforms.core.BcForm().from_str(form_str)
    except Exception as error:
        flask_restplus.abort(400, 'Form is invalid',
                             errors={'form': str(error)})
    form_errors = bc_form.validate()
    if form_errors:
        flask_restplus.abort(400, 'Form is invalid',
                             errors={'form': '. '.join(form_errors)})

    # parsers and failure messages for the BpForms encodings
    polymer_parsers = {
        'bpforms.ProteinForm': (bpforms.ProteinForm,
                                'Unable to parse bpforms.ProteinForm'),
        'bpforms.DnaForm': (bpforms.DnaForm,
                            'Unable to parse bpforms.DnaForm'),
        'bpforms.RnaForm': (bpforms.RnaForm,
                            'Unable to parse bpforms.RnaForm'),
    }
    smiles_encodings = ('smiles', 'SMILES', 'smi', 'SMI')

    def copy_attribute(subunit_id, spec, key, failure):
        # set attribute `key` of the subunit from the same key of the payload
        try:
            bc_form.set_subunit_attribute(subunit_id, key, spec[key])
        except Exception as error:
            flask_restplus.abort(400, failure, errors={key: str(error)})

    # validate input subunit properties
    total_length = 0
    if subunit_specs is not None:
        for spec in subunit_specs:
            # check if name is in the form
            subunit_id = spec['name']
            if subunit_id in [member.id for member in bc_form.subunits]:
                has_encoding = 'encoding' in spec
                has_structure = 'structure' in spec

                if has_encoding and has_structure:
                    # encoding and structure are both present
                    encoding = spec['encoding'].strip()
                    if encoding in polymer_parsers:
                        form_cls, failure = polymer_parsers[encoding]
                        try:
                            polymer = form_cls().from_str(spec['structure'])
                            total_length += len(polymer) * \
                                bc_form.get_subunit_attribute(
                                    subunit_id, 'stoichiometry')
                            bc_form.set_subunit_attribute(
                                subunit_id, 'structure', polymer)
                        except Exception as error:
                            flask_restplus.abort(
                                400, failure,
                                errors={'structure': str(error)})
                    elif encoding in smiles_encodings:
                        copy_attribute(subunit_id, spec, 'structure',
                                       'Unable to parse SMILES string')

                elif has_encoding or has_structure:
                    # exactly one of encoding and structure is present
                    flask_restplus.abort(
                        400,
                        'One of encoding and structure is present but not both'
                    )

                else:
                    # neither encoding nor structure is present:
                    # formula takes precedence over molecular weight
                    if 'formula' in spec:
                        copy_attribute(subunit_id, spec, 'formula',
                                       'Unable to parse formula')
                    elif 'mol_wt' in spec:
                        copy_attribute(subunit_id, spec, 'mol_wt',
                                       'Unable to parse mol_wt')

                    # charge is independent of formula and molecular weight
                    if 'charge' in spec:
                        copy_attribute(subunit_id, spec, 'charge',
                                       'Unable to parse charge')
            else:
                flask_restplus.abort(400, 'Subunit name not in BcForm',
                                     errors={'subunit': subunit_id})

    ret['form'] = str(bc_form)

    # the structure is only exported when the subunits are short enough
    if total_length <= max_len_get_structure:
        try:
            ret['structure'] = bc_form.export()
        except Exception:
            pass
    else:
        messages.append(
            'The sum of length of bpforms-encoded subunits is {}, which exceeds the max length limit {}.'
            .format(total_length, max_len_get_structure))
        ret['structure'] = None

    # properties which cannot be calculated are left out of the response
    properties = (
        ('formula', lambda: str(bc_form.get_formula())),
        ('mol_wt', lambda: bc_form.get_mol_wt()),
        ('charge', lambda: bc_form.get_charge()),
    )
    for key, calculate in properties:
        try:
            ret[key] = calculate()
        except Exception:
            pass

    if messages:
        ret['warnings'] = ' '.join(messages)

    return ret
# Import libraries import bcforms import bpforms # Create complexes from their string representations form_1 = bcforms.BcForm().from_str('2 * subunit_a + 3 * subunit_b') form_1.set_subunit_attribute('subunit_a', 'structure', bpforms.ProteinForm().from_str('CAAAAAAAA')) form_1.set_subunit_attribute('subunit_b', 'structure', bpforms.ProteinForm().from_str('AAAAAAAAC')) form_2 = bcforms.BcForm().from_str( '2 * subunit_a' '| x-link: [type: disulfide | l: subunit_a(1)-1 | r: subunit_a(2)-1]') form_2.set_subunit_attribute('subunit_a', 'structure', bpforms.ProteinForm().from_str('CAAAAAAAA')) # Create complexes programmatically form_1_b = bcforms.BcForm() form_1_b.subunits.append( bcforms.core.Subunit('subunit_a', 2, bpforms.ProteinForm().from_str('CAAAAAAAA'))) form_1_b.subunits.append( bcforms.core.Subunit('subunit_b', 3, bpforms.ProteinForm().from_str('AAAAAAAAC'))) form_2_b = bcforms.BcForm() subunit = bcforms.core.Subunit('subunit_a', 2, bpforms.ProteinForm().from_str('CAAAAAAAA')) form_2_b.subunits.append(subunit) form_2_b.crosslinks.append(