Example #1
0
def side_chain_placement(ag_to_place, current_reference_ag, rotamer_manager):
  """
  Copy the side chain of current_reference_ag onto ag_to_place (in place).

  Works with poly_gly truncated hierarchy.
  Also used in fix_rama_outliers.

  ag_to_place receives the residue name looked up for the reference atom
  group.  Unless that lookup yields 'G', a detached copy of the reference is
  superposed onto ag_to_place using the C, CA and N atoms, its coordinates
  are replaced by rotamer_manager.nearest_rotamer_sites_cart(...), and every
  atom other than N, CA, C and O is appended to ag_to_place with its
  anisotropic displacement erased.

  Raises Sorry if fewer than three of the C, CA, N atoms are found in
  current_reference_ag.
  """
  resname = current_reference_ag.resname.upper()
  one_letter = one_three.get(resname, None)
  # Residue names missing from the lookup are not rejected (this seems to
  # work with unusual residues); the original name is carried over instead.
  ag_to_place.resname = three_one.get(one_letter, resname)
  if one_letter == 'G':
    return

  # Gather the three anchor atoms of both atom groups for the superposition:
  # the already placed residue is the fixed target, the reference moves.
  anchor_names = ["C","CA", "N"]
  fixed_sites = flex.vec3_double()
  moving_sites = flex.vec3_double()
  for atom in ag_to_place.atoms():
    if atom.name.strip() in anchor_names:
      fixed_sites.append(atom.xyz)
  for atom in current_reference_ag.atoms():
    if atom.name.strip() in anchor_names:
      moving_sites.append(atom.xyz)
  assert len(fixed_sites) == 3
  if len(moving_sites) < 3:
    raise Sorry(
        "C, CA or N atoms are absent in secondary structure element." +
        "\nPlease add them to the model and try again.")
  assert len(moving_sites) == 3

  fit = superpose.least_squares_fit(reference_sites = fixed_sites,
                                    other_sites = moving_sites)
  placed_ag = current_reference_ag.detached_copy()
  # Apply the fitted rotation/translation to the copy ...
  superposed_sites = \
      fit.r.elems*placed_ag.atoms().extract_xyz() + fit.t.elems
  placed_ag.atoms().set_xyz(superposed_sites)
  # ... then take the coordinates the rotamer manager reports for it.
  rotamer_sites = rotamer_manager.nearest_rotamer_sites_cart(placed_ag)
  placed_ag.atoms().set_xyz(rotamer_sites)

  n_atoms = len(placed_ag.atoms())
  if n_atoms <= 4:
    # This means something wrong with input model, e.g. only 3 atoms in
    # the residue and they happened to be N, CA, C
    return
  ag_to_place.pre_allocate_atoms(number_of_additional_atoms=n_atoms-4)
  backbone_names = ["N","CA","C","O"]
  for atom in placed_ag.atoms():
    if atom.name.strip() in backbone_names:
      continue
    side_chain_atom = atom.detached_copy()
    side_chain_atom.uij_erase()
    ag_to_place.append_atom(atom=side_chain_atom)
def correct_sequence(pdb_hierarchy,
                     sequences,
                     truncate_to_cbeta=False,
                     out=sys.stdout):
    """
    Modify the sequence for the pdb hierarchy to match that of the aligned
    sequence.  This will remove incompatible atoms; the sidechains will still
    need to be extended separately.  For proteins only - mismatches in nucleic
    acids will only result in a warning.

    One summary line is written to ``out`` for every residue group that is
    renamed; the atom count on that line is the total over all atom groups
    of the residue group.

    :param pdb_hierarchy: iotbx.pdb.hierarchy.root object
    :param sequences: list of iotbx.bioinformatics.sequence objects
    :param truncate_to_cbeta: chop off entire sidechain to C-beta (default:
                              leave common atoms in place)
    :param out: output filehandle (default = stdout)
    :returns: number of atom_group objects renamed
    """
    from mmtbx.monomer_library import idealized_aa
    import mmtbx.validation.sequence
    from iotbx.pdb.amino_acid_codes import three_letter_given_one_letter
    seq_validation = mmtbx.validation.sequence.validation(
        pdb_hierarchy=pdb_hierarchy, sequences=sequences, log=out)

    # Nucleic acid mismatches are only reported, never corrected.
    for chain_seq in seq_validation.chains:
        if (chain_seq.chain_type == mmtbx.validation.sequence.NUCLEIC_ACID):
            if (len(chain_seq.mismatch) > 0):
                # Both conversion specifiers need a value: the mismatch count
                # and the chain id.
                print("  WARNING: will skip %d mismatches in nucleic acid chain '%s'" % \
                  (len(chain_seq.mismatch), chain_seq.chain_id), file=out)

    # Atom names allowed for each idealized residue; entries whose key
    # contains "_h" are skipped.
    res_dict = idealized_aa.residue_dict()
    expected_names = {}
    for resname in res_dict.keys():
        if ("_h" not in resname):
            ideal_res = res_dict[resname]
            expected_names[resname] = set([a.name for a in ideal_res.atoms()])

    n_changed = 0
    for chain in pdb_hierarchy.only_model().chains():
        if (not chain.is_protein()):
            continue
        for chain_seq in seq_validation.chains:
            if (chain.id != chain_seq.chain_id) or (len(chain_seq.mismatch) == 0):
                continue
            for residue_group in chain.residue_groups():
                resid = residue_group.resid()
                if (resid not in chain_seq.mismatch):
                    continue
                idx = chain_seq.mismatch.index(resid)
                new_code = chain_seq.actual_code[idx]
                new_resname = three_letter_given_one_letter.get(new_code)
                if (new_resname is None):
                    continue
                expected_atoms = expected_names[new_resname.lower()]
                if (truncate_to_cbeta):
                    expected_atoms = expected_names["ala"]
                atom_groups = residue_group.atom_groups()
                # Remember the name before it is overwritten so the summary
                # line shows "old --> new" instead of "new --> new".
                old_resname = atom_groups[0].resname
                n_removed = 0
                for atom_group in atom_groups:
                    n_changed += 1
                    atom_group.resname = new_resname
                    # NOTE(review): assumes atoms() returns a snapshot, so
                    # removing atoms while looping over it is safe - confirm.
                    for atom in atom_group.atoms():
                        if (atom.name not in expected_atoms):
                            atom_group.remove_atom(atom)
                            n_removed += 1
                print("  chain '%s' %s %s --> %s (%d atoms removed)" % \
                  (chain.id, resid, old_resname, new_resname, n_removed),
                  file=out)
    pdb_hierarchy.atoms().reset_i_seq()
    return n_changed
Example #3
0
def get_aa_parent(code):
    """
    Resolve a residue code through the modified amino acid lookup table.

    The upper-cased code is looked up in modified_aa_names.lookup.  When no
    (truthy) entry exists, the code is returned exactly as given.  Otherwise
    the entry is translated with three_letter_given_one_letter, yielding None
    if that table has no match.
    """
    one_letter = modified_aa_names.lookup.get(code.upper(), False)
    if one_letter:
        return three_letter_given_one_letter.get(one_letter, None)
    return code
Example #4
0
def exercise_pdb_hierarchy_sequence_as_cif_block():
    """
    Regression checks for exporting sequence information as mmCIF.

    Each case builds an mmtbx.model.manager from a small CA-only (or P-only)
    fragment, attaches one or more sequences with set_sequences() and then
    asserts on selected items of the resulting CIF block.  Most cases use
    model._sequence_validation.sequence_as_cif_block(); the 3tpy case checks
    model.get_hierarchy().as_cif_block() instead.
    """
    # Shared mmCIF preamble: data block name plus the _atom_site loop header
    # that is prepended to every mmCIF-formatted fragment below.
    pdb_atom_site_loop_header = """\
data_mmcif
  loop_
_atom_site.group_PDB
_atom_site.id
_atom_site.type_symbol
_atom_site.label_atom_id
_atom_site.label_alt_id
_atom_site.label_comp_id
_atom_site.label_asym_id
_atom_site.label_entity_id
_atom_site.label_seq_id
_atom_site.pdbx_PDB_ins_code
_atom_site.Cartn_x
_atom_site.Cartn_y
_atom_site.Cartn_z
_atom_site.occupancy
_atom_site.B_iso_or_equiv
_atom_site.Cartn_x_esd
_atom_site.Cartn_y_esd
_atom_site.Cartn_z_esd
_atom_site.occupancy_esd
_atom_site.B_iso_or_equiv_esd
_atom_site.pdbx_formal_charge
_atom_site.auth_seq_id
_atom_site.auth_comp_id
_atom_site.auth_asym_id
_atom_site.auth_atom_id
_atom_site.pdbx_PDB_model_num
"""

    # simple example with multiple copies of chain: chains A and B are checked
    # against one sequence and are expected to end up in a single entity
    input_4ehz = """\
ATOM   2    C CA  . GLU A 1 6   ? -35.647 65.380  -11.775 1.00 65.78  ? ? ? ? ? ? 858  GLU A CA  1
ATOM   11   C CA  . LYS A 1 7   ? -34.996 68.963  -10.712 1.00 89.52  ? ? ? ? ? ? 859  LYS A CA  1
ATOM   20   C CA  . LYS A 1 8   ? -31.415 68.325  -9.529  1.00 98.54  ? ? ? ? ? ? 860  LYS A CA  1
ATOM   29   C CA  . PRO A 1 9   ? -29.858 70.569  -6.813  1.00 103.45 ? ? ? ? ? ? 861  PRO A CA  1
ATOM   36   C CA  . ALA A 1 10  ? -26.545 72.463  -7.079  1.00 98.87  ? ? ? ? ? ? 862  ALA A CA  1
ATOM   41   C CA  . THR A 1 11  ? -23.410 70.412  -7.767  1.00 90.75  ? ? ? ? ? ? 863  THR A CA  1
ATOM   48   C CA  . GLU A 1 12  ? -21.306 71.534  -4.804  1.00 75.15  ? ? ? ? ? ? 864  GLU A CA  1
ATOM   57   C CA  . VAL A 1 13  ? -17.543 70.954  -4.809  1.00 49.52  ? ? ? ? ? ? 865  VAL A CA  1
ATOM   64   C CA  . ASP A 1 14  ? -16.048 68.671  -2.185  1.00 26.98  ? ? ? ? ? ? 866  ASP A CA  1
ATOM   72   C CA  . PRO A 1 15  ? -12.276 69.450  -2.061  1.00 27.34  ? ? ? ? ? ? 867  PRO A CA  1
ATOM   79   C CA  . THR A 1 16  ? -11.669 65.942  -0.699  1.00 23.73  ? ? ? ? ? ? 868  THR A CA  1
ATOM   86   C CA  . HIS A 1 17  ? -13.266 64.157  -3.671  1.00 23.80  ? ? ? ? ? ? 869  HIS A CA  1
ATOM   96   C CA  . PHE A 1 18  ? -10.664 63.252  -6.277  1.00 14.88  ? ? ? ? ? ? 870  PHE A CA  1
ATOM   107  C CA  . GLU A 1 19  ? -12.022 62.182  -9.666  1.00 23.47  ? ? ? ? ? ? 871  GLU A CA  1
ATOM   116  C CA  . LYS A 1 20  ? -10.351 59.111  -11.117 1.00 17.57  ? ? ? ? ? ? 872  LYS A CA  1
ATOM   125  C CA  . ARG A 1 21  ? -10.204 60.546  -14.661 1.00 19.09  ? ? ? ? ? ? 873  ARG A CA  1
ATOM   136  C CA  . PHE A 1 22  ? -7.912  63.384  -13.545 1.00 22.03  ? ? ? ? ? ? 874  PHE A CA  1
ATOM   147  C CA  . LEU A 1 23  ? -5.613  61.332  -11.271 1.00 18.20  ? ? ? ? ? ? 875  LEU A CA  1
ATOM   155  C CA  . LYS A 1 24  ? -2.583  60.745  -13.513 1.00 26.05  ? ? ? ? ? ? 876  LYS A CA  1
ATOM   2365 C CA  . VAL B 1 13  ? 38.084  -8.470  -5.157  1.00 57.98  ? ? ? ? ? ? 865  VAL B CA  1
ATOM   2372 C CA  . ASP B 1 14  ? 36.468  -6.229  -2.536  1.00 51.96  ? ? ? ? ? ? 866  ASP B CA  1
ATOM   2380 C CA  . PRO B 1 15  ? 32.749  -7.130  -2.340  1.00 48.96  ? ? ? ? ? ? 867  PRO B CA  1
ATOM   2387 C CA  . THR B 1 16  ? 31.935  -3.705  -0.847  1.00 26.72  ? ? ? ? ? ? 868  THR B CA  1
ATOM   2394 C CA  . HIS B 1 17  ? 33.519  -1.814  -3.754  1.00 33.15  ? ? ? ? ? ? 869  HIS B CA  1
ATOM   2404 C CA  . PHE B 1 18  ? 31.094  -0.811  -6.488  1.00 26.55  ? ? ? ? ? ? 870  PHE B CA  1
ATOM   2415 C CA  . GLU B 1 19  ? 32.359  0.467   -9.861  1.00 38.45  ? ? ? ? ? ? 871  GLU B CA  1
ATOM   2424 C CA  . LYS B 1 20  ? 30.409  3.510   -11.036 1.00 33.69  ? ? ? ? ? ? 872  LYS B CA  1
ATOM   2433 C CA  . ARG B 1 21  ? 30.400  2.430   -14.663 1.00 36.58  ? ? ? ? ? ? 873  ARG B CA  1
ATOM   2444 C CA  . PHE B 1 22  ? 28.294  -0.647  -13.791 1.00 38.39  ? ? ? ? ? ? 874  PHE B CA  1
ATOM   2455 C CA  . LEU B 1 23  ? 25.763  1.275   -11.703 1.00 32.87  ? ? ? ? ? ? 875  LEU B CA  1
ATOM   2463 C CA  . LYS B 1 24  ? 22.588  1.723   -13.713 1.00 30.22  ? ? ? ? ? ? 876  LYS B CA  1
"""
    import iotbx.bioinformatics
    from iotbx.pdb.amino_acid_codes import three_letter_given_one_letter
    from cctbx.array_family import flex
    sequence_4ehz = iotbx.bioinformatics.sequence(
        "GDIVSEKKPATEVDPTHFEKRFLK")  #RIRDLGEGHF"
    pdb_in = iotbx.pdb.input(lines=(pdb_atom_site_loop_header +
                                    input_4ehz).splitlines(),
                             source_info=None)
    model = mmtbx.model.manager(pdb_in)
    model.set_sequences([sequence_4ehz])
    cif_block = model._sequence_validation.sequence_as_cif_block()
    # one-letter codes are expected wrapped as a CIF text field: ';' + seq + '\n;'
    sequence = ';' + sequence_4ehz.sequence + '\n;'
    assert cif_block['_entity_poly.pdbx_seq_one_letter_code'][0] == sequence
    assert cif_block['_entity_poly.pdbx_seq_one_letter_code_can'][
        0] == sequence
    # both chains are listed for the entity
    assert cif_block['_entity_poly.pdbx_strand_id'] == 'A,B'
    # the full 24-residue sequence is numbered 1..24, not just modelled residues
    assert approx_equal(flex.int(cif_block['_entity_poly_seq.num']),
                        list(range(1, 25)))
    assert cif_block['_entity_poly_seq.entity_id'].all_eq('1')
    assert list(cif_block['_entity_poly_seq.mon_id']) == [
        three_letter_given_one_letter.get(i) for i in sequence_4ehz.sequence
    ]
    #
    # example with modified amino acid - PTR
    # The sequence has one more residue (R) than the fragment; the expected
    # mon_id list ends with ARG taken from the sequence.
    input_3zdi = """\
ATOM   1422 C  CA  . ASN A 1 179 ? -11.025 -26.833 -3.747  1.00 86.68  ? ? ? ? ? ? 213  ASN A CA  1
ATOM   1430 C  CA  . VAL A 1 180 ? -7.831  -26.493 -1.696  1.00 82.40  ? ? ? ? ? ? 214  VAL A CA  1
ATOM   1437 C  CA  . SER A 1 181 ? -8.142  -28.602 1.444   1.00 89.69  ? ? ? ? ? ? 215  SER A CA  1
ATOM   1443 C  CA  . PTR A 1 182 ? -5.406  -26.622 3.177   1.00 88.05  ? ? ? ? ? ? 216  PTR A CA  1
ATOM   1459 C  CA  . ILE A 1 183 ? -7.514  -23.621 4.117   1.00 83.90  ? ? ? ? ? ? 217  ILE A CA  1
ATOM   1467 C  CA  . CYS A 1 184 ? -8.907  -21.533 7.009   1.00 86.39  ? ? ? ? ? ? 218  CYS A CA  1
ATOM   1473 C  CA  . SER A 1 185 ? -6.795  -21.356 10.148  1.00 91.03  ? ? ? ? ? ? 219  SER A CA  1
"""
    sequence_3zdi = iotbx.bioinformatics.sequence("NVSYICSR")
    pdb_in = iotbx.pdb.input(lines=(pdb_atom_site_loop_header +
                                    input_3zdi).splitlines(),
                             source_info=None)
    model = mmtbx.model.manager(pdb_in)
    model.set_sequences([sequence_3zdi])
    cif_block = model._sequence_validation.sequence_as_cif_block()
    # the modified residue appears as (PTR) in the non-canonical code only
    assert cif_block['_entity_poly.pdbx_seq_one_letter_code'][0] == \
      ';NVS(PTR)ICSR\n;'
    assert cif_block['_entity_poly.pdbx_seq_one_letter_code_can'][0] == \
      ';' + sequence_3zdi.sequence + '\n;'
    assert approx_equal(flex.int(cif_block['_entity_poly_seq.num']),
                        list(range(1, 9)))
    assert list(cif_block['_entity_poly_seq.mon_id']) == [
        'ASN', 'VAL', 'SER', 'PTR', 'ILE', 'CYS', 'SER', 'ARG'
    ]
    #
    # example with two different chains (two sequences, two entities); the
    # first chain uses residue names such as DTH/DTY/DLY and contains one
    # residue with alternate locations A and B; waters are present as HETATM
    input_4gln = """\
ATOM   2    C CA  . DTH A 1 1   ? -2.916  5.861  2.629   1.00 16.39 ? ? ? ? ? ? 1   DTH D CA  1
ATOM   9    C CA  . DTY A 1 2   ? 0.533   4.844  3.866   1.00 10.74 ? ? ? ? ? ? 2   DTY D CA  1
ATOM   21   C CA  . DLY A 1 3   ? 3.161   3.111  1.736   1.00 8.24  ? ? ? ? ? ? 3   DLY D CA  1
ATOM   30   C CA  . DLE A 1 4   ? 6.958   3.293  1.625   1.00 7.95  ? ? ? ? ? ? 4   DLE D CA  1
ATOM   38   C CA  . DIL A 1 5   ? 9.053   0.443  0.257   1.00 8.44  ? ? ? ? ? ? 5   DIL D CA  1
ATOM   46   C CA  . DLE A 1 6   ? 12.622  1.402  -0.674  1.00 8.62  ? ? ? ? ? ? 6   DLE D CA  1
ATOM   54   C CA  A DSG A 1 7   ? 14.930  -1.609 -0.756  0.60 11.27 ? ? ? ? ? ? 7   DSG D CA  1
ATOM   55   C CA  B DSG A 1 7   ? 14.934  -1.617 -0.732  0.40 11.77 ? ? ? ? ? ? 7   DSG D CA  1
ATOM   67   C CA  . GLY A 1 8   ? 18.113  -0.249 -2.284  1.00 13.02 ? ? ? ? ? ? 8   GLY D CA  1
ATOM   71   C CA  . DLY A 1 9   ? 21.326  -1.954 -3.288  1.00 17.83 ? ? ? ? ? ? 9   DLY D CA  1
ATOM   80   C CA  . DTH A 1 10  ? 20.765  -0.934 -6.926  1.00 16.38 ? ? ? ? ? ? 10  DTH D CA  1
#
ATOM   472  C CA  . GLU B 2 6   ? 15.798  -6.874 23.843  1.00 31.74 ? ? ? ? ? ? 6   GLU E CA  1
ATOM   477  C CA  . VAL B 2 7   ? 16.644  -3.926 21.599  1.00 15.99 ? ? ? ? ? ? 7   VAL E CA  1
ATOM   484  C CA  . VAL B 2 8   ? 13.767  -1.465 21.234  1.00 10.37 ? ? ? ? ? ? 8   VAL E CA  1
ATOM   491  C CA  . LYS B 2 9   ? 12.953  -1.088 17.521  1.00 8.44  ? ? ? ? ? ? 9   LYS E CA  1
#
HETATM 2537 O O   . HOH E 3 .   ? 8.196   -3.708 8.277   1.00 15.02 ? ? ? ? ? ? 101 HOH D O   1
HETATM 2538 O O   . HOH E 3 .   ? 4.901   -4.298 5.515   1.00 13.08 ? ? ? ? ? ? 102 HOH D O   1
HETATM 2663 O O   . HOH F 3 .   ? 10.535  -2.721 20.049  1.00 15.44 ? ? ? ? ? ? 201 HOH E O   1
HETATM 2664 O O   . HOH F 3 .   ? 0.790   8.695  30.909  1.00 17.06 ? ? ? ? ? ? 202 HOH E O   1
HETATM 2795 O O   . HOH G 3 .   ? 11.265  2.914  43.878  1.00 13.92 ? ? ? ? ? ? 201 HOH F O   1
HETATM 2796 O O   . HOH G 3 .   ? 11.197  11.667 36.108  1.00 17.00 ? ? ? ? ? ? 202 HOH F O   1
"""
    sequence_4gln = [
        iotbx.bioinformatics.sequence("TYKLILNGKT"),
        iotbx.bioinformatics.sequence("GQNHHEVVK")
    ]
    pdb_in = iotbx.pdb.input(lines=(pdb_atom_site_loop_header +
                                    input_4gln).splitlines(),
                             source_info=None)
    model = mmtbx.model.manager(pdb_in)
    model.set_sequences(sequence_4gln)
    cif_block = model._sequence_validation.sequence_as_cif_block()
    assert list(cif_block['_entity.id']) == ['1', '2']
    # residue numbering restarts at 1 for the second entity
    assert approx_equal(flex.int(cif_block['_entity_poly_seq.num']),
                        list(range(1, 11)) + list(range(1, 10)))
    # modelled residues keep their own names; unmodelled positions of the
    # second sequence (GQNHH) are filled in from the sequence
    assert list(cif_block['_entity_poly_seq.mon_id']) == [
        'DTH', 'DTY', 'DLY', 'DLE', 'DIL', 'DLE', 'DSG', 'GLY', 'DLY', 'DTH',
        'GLY', 'GLN', 'ASN', 'HIS', 'HIS', 'GLU', 'VAL', 'VAL', 'LYS'
    ]
    assert list(cif_block['_entity_poly.pdbx_seq_one_letter_code']) == [
        ';(DTH)(DTY)(DLY)(DLE)(DIL)(DLE)(DSG)G(DLY)(DTH)\n;',
        ';' + sequence_4gln[1].sequence + '\n;'
    ]
    assert list(cif_block['_entity_poly.pdbx_seq_one_letter_code_can']) == [
        ';' + sequence_4gln[0].sequence + '\n;',
        ';' + sequence_4gln[1].sequence + '\n;'
    ]
    #
    # example where auth_seq_id jumps (734 -> 737) while label_seq_id and the
    # sequence are contiguous
    input_1ezu = """\
ATOM   3971 C  CA  . VAL D 2 16  ? 24.971  -4.493  -3.652  1.00 33.12  ? ? ? ? ? ? 731 VAL D CA  1
ATOM   3978 C  CA  . SER D 2 17  ? 27.194  -3.056  -0.946  1.00 35.47  ? ? ? ? ? ? 732 SER D CA  1
ATOM   3984 C  CA  . LEU D 2 18  ? 26.541  0.123   0.961   1.00 45.29  ? ? ? ? ? ? 733 LEU D CA  1
ATOM   3992 C  CA  . ASN D 2 19  ? 29.777  2.032   1.598   1.00 53.09  ? ? ? ? ? ? 734 ASN D CA  1
ATOM   4000 C  CA  . SER D 2 20  ? 30.737  4.963   3.775   1.00 61.92  ? ? ? ? ? ? 737 SER D CA  1
ATOM   4006 C  CA  . GLY D 2 21  ? 34.478  4.622   4.207   1.00 62.21  ? ? ? ? ? ? 738 GLY D CA  1
ATOM   4010 C  CA  . TYR D 2 22  ? 33.903  0.885   4.483   1.00 54.81  ? ? ? ? ? ? 739 TYR D CA  1
"""
    sequence_1ezu = iotbx.bioinformatics.sequence('VSLNSGY')
    pdb_in = iotbx.pdb.input(lines=(pdb_atom_site_loop_header +
                                    input_1ezu).splitlines(),
                             source_info=None)
    model = mmtbx.model.manager(pdb_in)
    model.set_sequences([sequence_1ezu])
    cif_block = model._sequence_validation.sequence_as_cif_block()
    assert list(cif_block['_entity_poly_seq.mon_id']) == [
        'VAL', 'SER', 'LEU', 'ASN', 'SER', 'GLY', 'TYR'
    ]
    assert cif_block['_entity_poly.pdbx_seq_one_letter_code'][0] == \
      ';' + sequence_1ezu.sequence + '\n;'
    assert cif_block['_entity_poly.pdbx_seq_one_letter_code_can'][0] == \
      ';' + sequence_1ezu.sequence + '\n;'

    # example with residues named C/U/G (phosphorus atoms only) and a gap in
    # the model (label_seq_id 17 -> 21); the expected output follows the full
    # nine-letter sequence
    input_2hok = """\
ATOM   301  P  P     . C   A 1 15 ? 15.802 44.045 80.094 1.00 59.36 ? ? ? ? ? ? 23  C   A P     1
ATOM   321  P  P     . C   A 1 16 ? 12.286 47.301 82.617 1.00 68.27 ? ? ? ? ? ? 24  C   A P     1
ATOM   341  P  P     . U   A 1 17 ? 6.815  51.648 82.739 1.00 78.03 ? ? ? ? ? ? 25  U   A P     1
ATOM   361  P  P     . G   A 1 21 ? 7.042  52.289 91.645 1.00 96.25 ? ? ? ? ? ? 29  G   A P     1
ATOM   384  P  P     . C   A 1 22 ? 7.024  46.751 90.841 1.00 84.69 ? ? ? ? ? ? 30  C   A P     1
ATOM   404  P  P     . G   A 1 23 ? 7.477  40.933 88.377 1.00 81.65 ? ? ? ? ? ? 31  G   A P     1
"""
    sequence_2hok = iotbx.bioinformatics.sequence("CCUUCUGCG")
    pdb_in = iotbx.pdb.input(lines=(pdb_atom_site_loop_header +
                                    input_2hok).splitlines(),
                             source_info=None)
    model = mmtbx.model.manager(pdb_in)
    model.set_sequences([sequence_2hok])
    cif_block = model._sequence_validation.sequence_as_cif_block()
    assert list(cif_block['_entity_poly_seq.mon_id']) == [
        'C', 'C', 'U', 'U', 'C', 'U', 'G', 'C', 'G'
    ]
    assert cif_block['_entity_poly.pdbx_seq_one_letter_code'][0] == \
      ';' + sequence_2hok.sequence + '\n;'
    assert cif_block['_entity_poly.pdbx_seq_one_letter_code_can'][0] == \
      ';' + sequence_2hok.sequence + '\n;'
    #
    # PDB-format example (no mmCIF header prepended) with HETATM records in
    # the same chain; here the hierarchy's own CIF output is checked: the six
    # ATOM records get label_seq_id 1..6 and the eight HETATM records get '.'
    input_3tpy = """\
ATOM      2  CA  GLN A  24       2.586  40.220  34.036  1.00 41.54           C
ATOM      8  CA  LYS A  25       1.265  43.698  34.904  1.00 25.47           C
ATOM     17  CA  GLN A  26       3.834  45.984  36.538  1.00 22.91           C
ATOM     26  CA  PRO A  27       2.835  48.614  39.135  1.00 19.20           C
ATOM     33  CA  ILE A  28       3.972  52.206  39.293  1.00 18.70           C
ATOM     41  CA  SER A  29       6.403  51.332  42.097  1.00 22.63           C
TER
HETATM  852 MG    MG A 999     -12.415  61.451  32.421  0.70 28.10          MG
HETATM  853  C   TRS A 153      -0.078  70.151  24.773  0.33 24.86           C
HETATM  877  PA BDUP A 777      -9.339  60.563  31.137  0.70 19.64           P
HETATM  881  PB BDUP A 777     -11.768  59.969  29.491  0.70 27.76           P
HETATM  885  PG BDUP A 777     -13.098  58.529  31.620  0.70 33.91           P
HETATM  905  P  AUMP A 154      -9.010  60.358  31.334  0.30 11.42           P
HETATM  909  O   HOH A 155      -0.197  60.723  27.343  1.00 17.17           O
HETATM  910  O   HOH A 156     -10.293  62.567  35.648  1.00 19.43           O
"""
    sequence_3tpy = iotbx.bioinformatics.sequence("QKQPIS")
    pdb_in = iotbx.pdb.input(lines=(input_3tpy).splitlines(), source_info=None)
    model = mmtbx.model.manager(pdb_in)
    model.set_sequences([sequence_3tpy])
    cif_block = model.get_hierarchy().as_cif_block()
    assert list(cif_block["_atom_site.label_seq_id"]) == [
        '1', '2', '3', '4', '5', '6', '.', '.', '.', '.', '.', '.', '.', '.'
    ]
    #
    # PDB-format example with a numbering gap in the model (460 -> 463)
    input_3tgr = """\
ATOM   2449  CA  GLY A 459     -17.536  10.137  41.979  1.00181.52           C
ATOM   2453  CA  GLN A 460     -15.862  12.780  44.128  1.00192.51           C
ATOM   2462  CA  ASN A 463     -19.198   8.054  50.455  1.00180.96           C
ATOM   2470  CA  ASP A 464     -19.235   4.661  52.197  1.00143.07           C
ATOM   2478  CA  THR A 465     -20.893   2.988  49.198  1.00 91.96           C
"""
    sequence_3tgr = iotbx.bioinformatics.sequence("DGGQSNETNDTET")
    pdb_in = iotbx.pdb.input(lines=(input_3tgr).splitlines(), source_info=None)
    model = mmtbx.model.manager(pdb_in)
    model.set_sequences([sequence_3tgr])
    cif_block = model._sequence_validation.sequence_as_cif_block()
    # NOTE(review): the expected string differs from the input sequence
    # "DGGQSNETNDTET" at one position (N instead of T, third from the end) -
    # confirm this is the intended alignment result and not a typo.
    assert cif_block["_entity_poly.pdbx_seq_one_letter_code"][0] == \
      ';DGGQSNETNDNET\n;'
    # PDB-format example with a numbering gap in the model (346 -> 349) and a
    # sequence much longer than the modelled fragment
    input_2im9 = """\
ATOM   2423  CA  PRO A 345       2.114  16.158   0.161  1.00 29.14           C
ATOM   2430  CA  VAL A 346      -1.223  17.837   0.938  1.00 31.05           C
ATOM   2437  CA  CYS A 349      -4.081  15.852   7.014  0.50 28.57           C
ATOM   2443  CA  GLN A 350      -6.176  14.041   9.639  0.50 30.62           C
ATOM   2452  CA  LEU A 351      -6.631  10.729   7.797  0.50 31.53           C
ATOM   2460  CA  PHE A 352      -5.220   9.172   4.620  0.50 31.95           C
"""
    sequence_2im9 = iotbx.bioinformatics.sequence(
        "SSPTIKGINIQVVLPEKPVSNGCQLFDIR")
    pdb_in = iotbx.pdb.input(lines=(input_2im9).splitlines(), source_info=None)
    model = mmtbx.model.manager(pdb_in)
    model.set_sequences([sequence_2im9])
    cif_block = model._sequence_validation.sequence_as_cif_block()
    # NOTE(review): relative to the input sequence (...NGCQLFDIR) the expected
    # list below has CYS twice and no PHE - confirm this is intended.
    assert list(cif_block["_entity_poly_seq.mon_id"]) == [
        'SER', 'SER', 'PRO', 'THR', 'ILE', 'LYS', 'GLY', 'ILE', 'ASN', 'ILE',
        'GLN', 'VAL', 'VAL', 'LEU', 'PRO', 'GLU', 'LYS', 'PRO', 'VAL', 'SER',
        'ASN', 'GLY', 'CYS', 'CYS', 'GLN', 'LEU', 'ASP', 'ILE', 'ARG'
    ]
Example #5
0
  def sequence_as_cif_block(self, custom_residues=None):
    """
    Export sequence information as mmCIF block
    Version 5.0 of mmCIF/PDBx dictionary

    Chains whose aligned sequence (chain.alignment.b) is identical share one
    entity: only the first such chain contributes residue information, later
    ones just add their chain id to the entity's strand list.

    Parameters
    ----------
    custom_residues: list of str
      List of custom 3-letter residues to keep in pdbx_one_letter_sequence
      The 3-letter residue must exist in the model. If None, the value
      from self.custom_residues is used.

    Returns
    -------
    cif_block: iotbx.cif.model.block
      Block holding the entity, entity_poly, entity_poly_seq, struct_ref and
      struct_ref_seq loops (added in that order).
    """

    if custom_residues is None:
      custom_residues = self.custom_residues
    if custom_residues is None:
      # nothing configured anywhere: behave as "no custom residues" so the
      # membership test further down cannot fail on None
      custom_residues = ()

    dna = set(['DA', 'DT', 'DC', 'DG', 'DI'])
    rna = set(['A', 'U', 'C', 'G'])
    rna_to_dna = {'A': 'DA', 'U': 'DT', 'T': 'DT', 'C': 'DC', 'G': 'DG',
                  'I': 'DI'}
    # split the modified nucleotide names by the kind of their parent
    modified_dna = set()
    modified_rna = set()
    for key in modified_rna_dna_names.lookup.keys():
      value = modified_rna_dna_names.lookup[key]
      if value in dna:
        modified_dna.add(key)
      elif value in rna:
        modified_rna.add(key)

    # http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/entity.html
    entity_loop = iotbx.cif.model.loop(header=(
      '_entity.id',
      '_entity.pdbx_description'
    ))

    # http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/entity_poly.html
    entity_poly_loop = iotbx.cif.model.loop(header=(
      '_entity_poly.entity_id',
      '_entity_poly.nstd_linkage',
      '_entity_poly.nstd_monomer',
      '_entity_poly.pdbx_seq_one_letter_code',
      '_entity_poly.pdbx_seq_one_letter_code_can',
      '_entity_poly.pdbx_strand_id',
      '_entity_poly.pdbx_target_identifier',
      '_entity_poly.type',
    ))

    # http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/entity_poly_seq.html
    entity_poly_seq_loop = iotbx.cif.model.loop(header=(
      '_entity_poly_seq.entity_id',
      '_entity_poly_seq.num',
      '_entity_poly_seq.mon_id',
      '_entity_poly_seq.hetero',
    ))

    # http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/struct_ref.html
    struct_ref_loop = iotbx.cif.model.loop(header=(
      '_struct_ref.id',
      '_struct_ref.db_code',
      '_struct_ref.db_name',
      '_struct_ref.entity_id',
      '_struct_ref.pdbx_align_begin',
      '_struct_ref.pdbx_db_accession',
      '_struct_ref.pdbx_db_isoform',
      '_struct_ref.pdbx_seq_one_letter_code',
    ))

    # http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/struct_ref_seq.html
    struct_ref_seq_loop = iotbx.cif.model.loop(header=(
      '_struct_ref_seq.align_id',
      '_struct_ref_seq.db_align_beg',
      '_struct_ref_seq.db_align_end',
      '_struct_ref_seq.pdbx_PDB_id_code',
      '_struct_ref_seq.pdbx_auth_seq_align_beg',
      '_struct_ref_seq.pdbx_auth_seq_align_end',
      '_struct_ref_seq.pdbx_db_accession',
      '_struct_ref_seq.pdbx_db_align_beg_ins_code',
      '_struct_ref_seq.pdbx_db_align_end_ins_code',
      '_struct_ref_seq.pdbx_seq_align_beg_ins_code',
      '_struct_ref_seq.pdbx_seq_align_end_ins_code',
      '_struct_ref_seq.pdbx_strand_id',
      '_struct_ref_seq.ref_id',
      '_struct_ref_seq.seq_align_beg',
      '_struct_ref_seq.seq_align_end',
    ))

    # per-entity accumulators, all keyed by entity id
    # entity_poly
    sequence_to_entity_id = dict()
    nstd_linkage = dict()
    nstd_monomer = dict()
    seq_one_letter_code = dict()
    seq_one_letter_code_can = dict()
    strand_id = dict()
    target_identifier = dict()
    sequence_type = dict()
    # entity_poly_seq
    num = dict()
    mon_id = dict()
    hetero = dict()
    # struct_ref (work in progress)
    chain_id = dict()
    db_code = '?'
    db_name = '?'
    align_begin = '?'
    db_accession = '?'
    db_isoform = '?'
    # struct_ref_seq (work in progress)
    db_align_beg = '?'
    db_align_end = '?'
    PDB_id_code = '?'
    align_beg_ins_code = '?'
    align_end_ins_code = '?'

    for i_chain, chain in enumerate(self.chains):
      seq_can = chain.alignment.b
      if seq_can in sequence_to_entity_id:
        # subsequent matches just add strand_id
        strand_id[sequence_to_entity_id[seq_can]].append(chain.chain_id)
        continue

      # New sequence -> new entity.  The id is derived from the number of
      # entities seen so far rather than from a running variable, so looking
      # up an existing entity above can never make a later new entity reuse
      # an id that is already taken.
      entity_id = len(sequence_to_entity_id) + 1
      sequence_to_entity_id[seq_can] = entity_id

      # entity_poly items
      # nstd_linkage (work in progress)
      nstd_linkage[entity_id] = 'no'
      # nstd_monomer
      nstd_monomer[entity_id] = 'no'
      # pdbx_seq_one_letter_code
      seq_one_letter_code[entity_id] = list()
      has_protein = False
      has_rna = False
      has_dna = False
      is_d = False
      # chain.alignment.a is the model
      # chain.alignment.b is the sequence
      for i_a, i_b in zip(chain.alignment.i_seqs_a, chain.alignment.i_seqs_b):
        # sequence does not have residue in model
        if i_b is None:
          continue
        # model does not have residue in sequence
        if i_a is None or chain.resnames[i_a] is None:
          letter = seq_can[i_b]
        else:
          resname = chain.resnames[i_a].strip()
          # check for modified residues
          if (resname in modified_aa_names.lookup or
              resname in modified_rna_dna_names.lookup or
              resname in custom_residues):
            letter = '({resname})'.format(resname=resname)
            nstd_monomer[entity_id] = 'yes'
          elif resname in three_letter_l_given_three_letter_d:
            letter = '({resname})'.format(resname=resname)
            nstd_monomer[entity_id] = 'yes'
          # check for nucleic acid
          elif resname in dna:
            letter = '({resname})'.format(resname=resname)
          elif resname in rna:
            letter = resname
          # regular protein
          else:
            letter = one_letter_given_three_letter.get(resname)
            if letter is None:
              letter = 'X'

          # check for protein
          if (resname in one_letter_given_three_letter or
              resname in modified_aa_names.lookup):
            has_protein = True
          # check for DNA
          # hybrid protein/DNA/RNA chains are not allowed
          if resname in dna or resname in modified_dna:
            has_dna = True
            has_protein = False
          # check for RNA
          # does not handle hybrid DNA/RNA chains
          if resname in rna or resname in modified_rna:
            has_rna = True
            has_dna = False
            has_protein = False
          # check chirality
          # hybrid D/L handed chains are not allowed
          if resname in three_letter_l_given_three_letter_d:
            is_d = True
        # pdbx_seq_one_letter_code
        seq_one_letter_code[entity_id].append(letter)

      # pdbx_seq_one_letter_code_can
      seq_one_letter_code_can[entity_id] = seq_can.replace('-', '')
      # strand_id
      strand_id[entity_id] = [chain.chain_id]
      # target_identifier (work in progress)
      target_identifier[entity_id] = '?'
      # type
      #   polypeptide(L)
      #   polypeptide(D)
      #   polydeoxyribonucleotide,
      #   polyribonucleotide
      # missing
      #   cyclic-psuedo-peptide
      #   other
      #   peptide nucleic acid
      #   polydeoxyribonucleotide/polyribonucleotide
      #   polysaccharide(D)
      #   polysaccahride(L)
      # Start from "unknown" so that a chain in which no polymer type was
      # recognised neither fails on an unbound name nor inherits the type of
      # the previously processed chain.
      choice = '?'
      if has_protein:
        choice = 'polypeptide'
        if is_d:
          choice += '(D)'
        else:
          choice += '(L)'
      if has_dna:
        choice = 'polydeoxyribonucleotide'
      if has_rna:
        choice = 'polyribonucleotide'
      sequence_type[entity_id] = choice

      # entity_poly_seq items
      mon_id[entity_id] = list()
      num[entity_id] = list()
      hetero[entity_id] = list()

      # struct_ref items
      chain_id[entity_id] = i_chain + 1

      for i_a, i_b in zip(chain.alignment.i_seqs_a, chain.alignment.i_seqs_b):
        # sequence does not have residue in model
        if i_b is None:
          continue
        # residue name implied by the sequence letter, by polymer type
        seq_resname = None
        if has_protein:
          seq_resname = three_letter_given_one_letter.get(seq_can[i_b])
        if has_dna:
          seq_resname = rna_to_dna.get(seq_can[i_b])
        if has_rna:
          seq_resname = seq_can[i_b]
        if seq_resname is None:
          seq_resname = 'UNK'
        # model does not have residue in sequence
        if i_a is None or chain.resnames[i_a] is None:
          resname = seq_resname
        else:
          resname = chain.resnames[i_a]
        mon_id[entity_id].append(resname.strip())
        # consecutive numbering starting at 1
        num[entity_id].append(len(num[entity_id]) + 1)
        hetero[entity_id].append('no')

    # build loops
    ids = sorted(sequence_to_entity_id.values())
    align_id = 1
    for entity_id in ids:
      # construct entity_poly loop
      chains = ','.join(strand_id[entity_id])
      entity_poly_loop.add_row((
        entity_id,
        nstd_linkage[entity_id],
        nstd_monomer[entity_id],
        ';' + ''.join(seq_one_letter_code[entity_id]) + '\n;',
        ';' + seq_one_letter_code_can[entity_id] + '\n;',
        chains,
        target_identifier[entity_id],
        sequence_type[entity_id]
      ))

      # construct entity loop
      entity_loop.add_row((
        entity_id,
        'Chains: ' + chains
      ))

      # construct entity_poly_seq loop
      chain_length = len(mon_id[entity_id])
      for i in range(chain_length):
        entity_poly_seq_loop.add_row((
          entity_id,
          num[entity_id][i],
          mon_id[entity_id][i],
          hetero[entity_id][i]
        ))

      # construct struct_ref loop
      struct_ref_loop.add_row((
        chain_id[entity_id],
        db_code,
        db_name,
        entity_id,
        align_begin,
        db_accession,
        db_isoform,
        ';' + seq_one_letter_code_can[entity_id] + '\n;'
      ))

      # construct struct_ref_seq loop
      # NOTE(review): with a begin of '1' the end value of len(...) - 1 looks
      # like it stops one residue short - left unchanged, confirm intent.
      for strand in strand_id[entity_id]:
        struct_ref_seq_loop.add_row((
          align_id,
          db_align_beg,
          db_align_end,
          PDB_id_code,
          '1',
          len(seq_one_letter_code_can[entity_id]) - 1,
          db_accession,
          align_beg_ins_code,
          align_end_ins_code,
          align_beg_ins_code,
          align_end_ins_code,
          strand,
          chain_id[entity_id],
          '1',
          len(seq_one_letter_code_can[entity_id]) - 1
        ))
        # align_id identifies the row, so every row needs its own value
        align_id += 1

    # construct block
    cif_block = iotbx.cif.model.block()
    cif_block.add_loop(entity_loop)
    cif_block.add_loop(entity_poly_loop)
    cif_block.add_loop(entity_poly_seq_loop)
    cif_block.add_loop(struct_ref_loop)
    cif_block.add_loop(struct_ref_seq_loop)

    return cif_block
def exercise_pdb_hierarchy_sequence_as_cif_block():
  """
  Regression test for pdb_hierarchy.as_cif_block_with_sequence().

  Each case builds a hierarchy from a small CA-only (or P-only) fixture,
  supplies the full sequence(s), and checks the entity, entity_poly,
  entity_poly_seq and atom_site columns of the resulting CIF block.
  Fixtures are given either as mmCIF _atom_site rows (prefixed with the
  shared loop header) or as plain PDB-format records.
  """
  # mmCIF preamble shared by the mmCIF-format fixtures below; each fixture
  # string only supplies the _atom_site rows.
  pdb_atom_site_loop_header = """\
data_mmcif
  loop_
_atom_site.group_PDB
_atom_site.id
_atom_site.type_symbol
_atom_site.label_atom_id
_atom_site.label_alt_id
_atom_site.label_comp_id
_atom_site.label_asym_id
_atom_site.label_entity_id
_atom_site.label_seq_id
_atom_site.pdbx_PDB_ins_code
_atom_site.Cartn_x
_atom_site.Cartn_y
_atom_site.Cartn_z
_atom_site.occupancy
_atom_site.B_iso_or_equiv
_atom_site.Cartn_x_esd
_atom_site.Cartn_y_esd
_atom_site.Cartn_z_esd
_atom_site.occupancy_esd
_atom_site.B_iso_or_equiv_esd
_atom_site.pdbx_formal_charge
_atom_site.auth_seq_id
_atom_site.auth_comp_id
_atom_site.auth_asym_id
_atom_site.auth_atom_id
_atom_site.pdbx_PDB_model_num
"""

  # simple example with multiple copies of chain
  # (chains A and B share one sequence; both start part-way into it)
  input_4ehz = """\
ATOM   2    C CA  . GLU A 1 6   ? -35.647 65.380  -11.775 1.00 65.78  ? ? ? ? ? ? 858  GLU A CA  1
ATOM   11   C CA  . LYS A 1 7   ? -34.996 68.963  -10.712 1.00 89.52  ? ? ? ? ? ? 859  LYS A CA  1
ATOM   20   C CA  . LYS A 1 8   ? -31.415 68.325  -9.529  1.00 98.54  ? ? ? ? ? ? 860  LYS A CA  1
ATOM   29   C CA  . PRO A 1 9   ? -29.858 70.569  -6.813  1.00 103.45 ? ? ? ? ? ? 861  PRO A CA  1
ATOM   36   C CA  . ALA A 1 10  ? -26.545 72.463  -7.079  1.00 98.87  ? ? ? ? ? ? 862  ALA A CA  1
ATOM   41   C CA  . THR A 1 11  ? -23.410 70.412  -7.767  1.00 90.75  ? ? ? ? ? ? 863  THR A CA  1
ATOM   48   C CA  . GLU A 1 12  ? -21.306 71.534  -4.804  1.00 75.15  ? ? ? ? ? ? 864  GLU A CA  1
ATOM   57   C CA  . VAL A 1 13  ? -17.543 70.954  -4.809  1.00 49.52  ? ? ? ? ? ? 865  VAL A CA  1
ATOM   64   C CA  . ASP A 1 14  ? -16.048 68.671  -2.185  1.00 26.98  ? ? ? ? ? ? 866  ASP A CA  1
ATOM   72   C CA  . PRO A 1 15  ? -12.276 69.450  -2.061  1.00 27.34  ? ? ? ? ? ? 867  PRO A CA  1
ATOM   79   C CA  . THR A 1 16  ? -11.669 65.942  -0.699  1.00 23.73  ? ? ? ? ? ? 868  THR A CA  1
ATOM   86   C CA  . HIS A 1 17  ? -13.266 64.157  -3.671  1.00 23.80  ? ? ? ? ? ? 869  HIS A CA  1
ATOM   96   C CA  . PHE A 1 18  ? -10.664 63.252  -6.277  1.00 14.88  ? ? ? ? ? ? 870  PHE A CA  1
ATOM   107  C CA  . GLU A 1 19  ? -12.022 62.182  -9.666  1.00 23.47  ? ? ? ? ? ? 871  GLU A CA  1
ATOM   116  C CA  . LYS A 1 20  ? -10.351 59.111  -11.117 1.00 17.57  ? ? ? ? ? ? 872  LYS A CA  1
ATOM   125  C CA  . ARG A 1 21  ? -10.204 60.546  -14.661 1.00 19.09  ? ? ? ? ? ? 873  ARG A CA  1
ATOM   136  C CA  . PHE A 1 22  ? -7.912  63.384  -13.545 1.00 22.03  ? ? ? ? ? ? 874  PHE A CA  1
ATOM   147  C CA  . LEU A 1 23  ? -5.613  61.332  -11.271 1.00 18.20  ? ? ? ? ? ? 875  LEU A CA  1
ATOM   155  C CA  . LYS A 1 24  ? -2.583  60.745  -13.513 1.00 26.05  ? ? ? ? ? ? 876  LYS A CA  1
ATOM   2365 C CA  . VAL B 1 13  ? 38.084  -8.470  -5.157  1.00 57.98  ? ? ? ? ? ? 865  VAL B CA  1
ATOM   2372 C CA  . ASP B 1 14  ? 36.468  -6.229  -2.536  1.00 51.96  ? ? ? ? ? ? 866  ASP B CA  1
ATOM   2380 C CA  . PRO B 1 15  ? 32.749  -7.130  -2.340  1.00 48.96  ? ? ? ? ? ? 867  PRO B CA  1
ATOM   2387 C CA  . THR B 1 16  ? 31.935  -3.705  -0.847  1.00 26.72  ? ? ? ? ? ? 868  THR B CA  1
ATOM   2394 C CA  . HIS B 1 17  ? 33.519  -1.814  -3.754  1.00 33.15  ? ? ? ? ? ? 869  HIS B CA  1
ATOM   2404 C CA  . PHE B 1 18  ? 31.094  -0.811  -6.488  1.00 26.55  ? ? ? ? ? ? 870  PHE B CA  1
ATOM   2415 C CA  . GLU B 1 19  ? 32.359  0.467   -9.861  1.00 38.45  ? ? ? ? ? ? 871  GLU B CA  1
ATOM   2424 C CA  . LYS B 1 20  ? 30.409  3.510   -11.036 1.00 33.69  ? ? ? ? ? ? 872  LYS B CA  1
ATOM   2433 C CA  . ARG B 1 21  ? 30.400  2.430   -14.663 1.00 36.58  ? ? ? ? ? ? 873  ARG B CA  1
ATOM   2444 C CA  . PHE B 1 22  ? 28.294  -0.647  -13.791 1.00 38.39  ? ? ? ? ? ? 874  PHE B CA  1
ATOM   2455 C CA  . LEU B 1 23  ? 25.763  1.275   -11.703 1.00 32.87  ? ? ? ? ? ? 875  LEU B CA  1
ATOM   2463 C CA  . LYS B 1 24  ? 22.588  1.723   -13.713 1.00 30.22  ? ? ? ? ? ? 876  LYS B CA  1
"""
  import iotbx.bioinformatics
  from iotbx.pdb.amino_acid_codes import three_letter_given_one_letter
  from cctbx.array_family import flex
  sequence_4ehz = iotbx.bioinformatics.sequence("GDIVSEKKPATEVDPTHFEKRFLK")#RIRDLGEGHF"
  pdb_in = iotbx.pdb.input(
    lines=(pdb_atom_site_loop_header+input_4ehz).splitlines(),
    source_info=None)
  pdb_hierarchy = pdb_in.construct_hierarchy()
  cif_block = pdb_hierarchy.as_cif_block_with_sequence(
    sequences=[sequence_4ehz],
    crystal_symmetry=pdb_in.crystal_symmetry())
  # both chains are expected to collapse into a single polymer entity
  assert cif_block['_entity.id'][0] == '1'
  assert cif_block['_entity.type'][0] == 'polymer'
  assert cif_block['_entity.pdbx_number_of_molecules'][0] == '2'
  assert cif_block['_entity_poly.pdbx_seq_one_letter_code'][0] == sequence_4ehz.sequence
  assert cif_block['_entity_poly.pdbx_seq_one_letter_code_can'][0] == sequence_4ehz.sequence
  # NOTE(review): unlike the neighbouring assertions this compares the column
  # itself (no [0]) against a string - confirm this is intended.
  assert cif_block['_entity_poly.pdbx_strand_id'] == 'A,B'
  # entity_poly_seq covers the full supplied sequence (24 residues), not only
  # the residues present in the model
  assert approx_equal(flex.int(cif_block['_entity_poly_seq.num']), range(1, 25))
  assert cif_block['_entity_poly_seq.entity_id'].all_eq('1')
  assert list(cif_block['_entity_poly_seq.mon_id']) == [
    three_letter_given_one_letter.get(i) for i in sequence_4ehz.sequence]
  #
  # example with modified amino acid - PTR
  input_3zdi = """\
ATOM   1422 C  CA  . ASN A 1 179 ? -11.025 -26.833 -3.747  1.00 86.68  ? ? ? ? ? ? 213  ASN A CA  1
ATOM   1430 C  CA  . VAL A 1 180 ? -7.831  -26.493 -1.696  1.00 82.40  ? ? ? ? ? ? 214  VAL A CA  1
ATOM   1437 C  CA  . SER A 1 181 ? -8.142  -28.602 1.444   1.00 89.69  ? ? ? ? ? ? 215  SER A CA  1
ATOM   1443 C  CA  . PTR A 1 182 ? -5.406  -26.622 3.177   1.00 88.05  ? ? ? ? ? ? 216  PTR A CA  1
ATOM   1459 C  CA  . ILE A 1 183 ? -7.514  -23.621 4.117   1.00 83.90  ? ? ? ? ? ? 217  ILE A CA  1
ATOM   1467 C  CA  . CYS A 1 184 ? -8.907  -21.533 7.009   1.00 86.39  ? ? ? ? ? ? 218  CYS A CA  1
ATOM   1473 C  CA  . SER A 1 185 ? -6.795  -21.356 10.148  1.00 91.03  ? ? ? ? ? ? 219  SER A CA  1
"""
  sequence_3zdi = iotbx.bioinformatics.sequence("NVSYICSR")
  pdb_in = iotbx.pdb.input(
    lines=(pdb_atom_site_loop_header+input_3zdi).splitlines(),
    source_info=None)
  pdb_hierarchy = pdb_in.construct_hierarchy()
  cif_block = pdb_hierarchy.as_cif_block_with_sequence(
    sequences=[sequence_3zdi],
    crystal_symmetry=pdb_in.crystal_symmetry())
  # the modified residue appears as "(PTR)" in the one-letter code, while the
  # canonical code keeps the supplied sequence (Y at that position)
  assert cif_block['_entity_poly.pdbx_seq_one_letter_code'][0] == 'NVS(PTR)ICSR'
  assert cif_block['_entity_poly.pdbx_seq_one_letter_code_can'][0] == sequence_3zdi.sequence
  assert approx_equal(flex.int(cif_block['_entity_poly_seq.num']), range(1, 9))
  assert list(cif_block['_entity_poly_seq.mon_id']) == [
    'ASN', 'VAL', 'SER', 'PTR', 'ILE', 'CYS', 'SER', 'ARG']
  #
  # example with two different polymer entities plus waters; the first chain
  # uses non-standard residue names (DTH, DTY, ...) and has one residue (DSG 7)
  # with alternate conformations A and B
  input_4gln = """\
ATOM   2    C CA  . DTH A 1 1   ? -2.916  5.861  2.629   1.00 16.39 ? ? ? ? ? ? 1   DTH D CA  1
ATOM   9    C CA  . DTY A 1 2   ? 0.533   4.844  3.866   1.00 10.74 ? ? ? ? ? ? 2   DTY D CA  1
ATOM   21   C CA  . DLY A 1 3   ? 3.161   3.111  1.736   1.00 8.24  ? ? ? ? ? ? 3   DLY D CA  1
ATOM   30   C CA  . DLE A 1 4   ? 6.958   3.293  1.625   1.00 7.95  ? ? ? ? ? ? 4   DLE D CA  1
ATOM   38   C CA  . DIL A 1 5   ? 9.053   0.443  0.257   1.00 8.44  ? ? ? ? ? ? 5   DIL D CA  1
ATOM   46   C CA  . DLE A 1 6   ? 12.622  1.402  -0.674  1.00 8.62  ? ? ? ? ? ? 6   DLE D CA  1
ATOM   54   C CA  A DSG A 1 7   ? 14.930  -1.609 -0.756  0.60 11.27 ? ? ? ? ? ? 7   DSG D CA  1
ATOM   55   C CA  B DSG A 1 7   ? 14.934  -1.617 -0.732  0.40 11.77 ? ? ? ? ? ? 7   DSG D CA  1
ATOM   67   C CA  . GLY A 1 8   ? 18.113  -0.249 -2.284  1.00 13.02 ? ? ? ? ? ? 8   GLY D CA  1
ATOM   71   C CA  . DLY A 1 9   ? 21.326  -1.954 -3.288  1.00 17.83 ? ? ? ? ? ? 9   DLY D CA  1
ATOM   80   C CA  . DTH A 1 10  ? 20.765  -0.934 -6.926  1.00 16.38 ? ? ? ? ? ? 10  DTH D CA  1
#
ATOM   472  C CA  . GLU B 2 6   ? 15.798  -6.874 23.843  1.00 31.74 ? ? ? ? ? ? 6   GLU E CA  1
ATOM   477  C CA  . VAL B 2 7   ? 16.644  -3.926 21.599  1.00 15.99 ? ? ? ? ? ? 7   VAL E CA  1
ATOM   484  C CA  . VAL B 2 8   ? 13.767  -1.465 21.234  1.00 10.37 ? ? ? ? ? ? 8   VAL E CA  1
ATOM   491  C CA  . LYS B 2 9   ? 12.953  -1.088 17.521  1.00 8.44  ? ? ? ? ? ? 9   LYS E CA  1
#
HETATM 2537 O O   . HOH E 3 .   ? 8.196   -3.708 8.277   1.00 15.02 ? ? ? ? ? ? 101 HOH D O   1
HETATM 2538 O O   . HOH E 3 .   ? 4.901   -4.298 5.515   1.00 13.08 ? ? ? ? ? ? 102 HOH D O   1
HETATM 2663 O O   . HOH F 3 .   ? 10.535  -2.721 20.049  1.00 15.44 ? ? ? ? ? ? 201 HOH E O   1
HETATM 2664 O O   . HOH F 3 .   ? 0.790   8.695  30.909  1.00 17.06 ? ? ? ? ? ? 202 HOH E O   1
HETATM 2795 O O   . HOH G 3 .   ? 11.265  2.914  43.878  1.00 13.92 ? ? ? ? ? ? 201 HOH F O   1
HETATM 2796 O O   . HOH G 3 .   ? 11.197  11.667 36.108  1.00 17.00 ? ? ? ? ? ? 202 HOH F O   1
"""
  sequence_4gln = [iotbx.bioinformatics.sequence("TYKLILNGKT"),
                   iotbx.bioinformatics.sequence("GQNHHEVVK")]
  pdb_in = iotbx.pdb.input(
    lines=(pdb_atom_site_loop_header+input_4gln).splitlines(),
    source_info=None)
  pdb_hierarchy = pdb_in.construct_hierarchy()
  cif_block = pdb_hierarchy.as_cif_block_with_sequence(
    sequences=sequence_4gln,
    crystal_symmetry=pdb_in.crystal_symmetry())
  assert list(cif_block['_entity.id']) == ['1', '2', '3']
  assert list(cif_block['_entity.type']) == ['polymer', 'polymer', 'water']
  # numbering restarts at 1 for the second entity
  assert approx_equal(flex.int(cif_block['_entity_poly_seq.num']),
                      range(1, 11)+range(1, 10))
  assert list(cif_block['_entity_poly_seq.mon_id']) == [
    'DTH', 'DTY', 'DLY', 'DLE', 'DIL', 'DLE', 'DSG', 'GLY', 'DLY', 'DTH',
    'GLY', 'GLN', 'ASN', 'HIS', 'HIS', 'GLU', 'VAL', 'VAL', 'LYS']
  assert list(cif_block['_entity_poly.pdbx_seq_one_letter_code']) == [
    '(DTH)(DTY)(DLY)(DLE)(DIL)(DLE)(DSG)G(DLY)(DTH)', sequence_4gln[1].sequence]
  assert list(cif_block['_entity_poly.pdbx_seq_one_letter_code_can']) == [
    sequence_4gln[0].sequence, sequence_4gln[1].sequence]
  # 11 atoms in entity 1 (the two DSG alternates both count), 4 in entity 2,
  # 6 waters in entity 3
  assert approx_equal(flex.int(cif_block['_atom_site.label_entity_id']),
                      [1]*11 + [2]*4 + [3]*6)
  # both alternates of DSG share label_seq_id 7; waters get '.'
  assert list(cif_block['_atom_site.label_seq_id']) == [
    '1', '2', '3', '4', '5', '6', '7', '7', '8', '9', '10', '6', '7', '8', '9',
     '.', '.', '.', '.', '.', '.']
  #
  # example where the author numbering jumps (734 -> 737) although the
  # residues are consecutive in the sequence; no crystal symmetry is passed
  input_1ezu = """\
ATOM   3971 C  CA  . VAL D 2 16  ? 24.971  -4.493  -3.652  1.00 33.12  ? ? ? ? ? ? 731 VAL D CA  1
ATOM   3978 C  CA  . SER D 2 17  ? 27.194  -3.056  -0.946  1.00 35.47  ? ? ? ? ? ? 732 SER D CA  1
ATOM   3984 C  CA  . LEU D 2 18  ? 26.541  0.123   0.961   1.00 45.29  ? ? ? ? ? ? 733 LEU D CA  1
ATOM   3992 C  CA  . ASN D 2 19  ? 29.777  2.032   1.598   1.00 53.09  ? ? ? ? ? ? 734 ASN D CA  1
ATOM   4000 C  CA  . SER D 2 20  ? 30.737  4.963   3.775   1.00 61.92  ? ? ? ? ? ? 737 SER D CA  1
ATOM   4006 C  CA  . GLY D 2 21  ? 34.478  4.622   4.207   1.00 62.21  ? ? ? ? ? ? 738 GLY D CA  1
ATOM   4010 C  CA  . TYR D 2 22  ? 33.903  0.885   4.483   1.00 54.81  ? ? ? ? ? ? 739 TYR D CA  1
"""
  sequence_1ezu = iotbx.bioinformatics.sequence('VSLNSGY')
  pdb_in = iotbx.pdb.input(
    lines=(pdb_atom_site_loop_header+input_1ezu).splitlines(),
    source_info=None)
  pdb_hierarchy = pdb_in.construct_hierarchy()
  cif_block = pdb_hierarchy.as_cif_block_with_sequence(
    sequences=[sequence_1ezu])
  assert list(cif_block['_entity_poly_seq.mon_id']) == [
    'VAL', 'SER', 'LEU', 'ASN', 'SER', 'GLY', 'TYR']
  assert cif_block['_entity_poly.pdbx_seq_one_letter_code'][0] == sequence_1ezu.sequence
  assert cif_block['_entity_poly.pdbx_seq_one_letter_code_can'][0] == sequence_1ezu.sequence
  # auth_seq_id keeps the author numbering, label_seq_id follows the sequence
  assert list(cif_block['_atom_site.auth_seq_id']) == [
    '731', '732', '733', '734', '737', '738', '739']
  assert list(cif_block['_atom_site.label_seq_id']) == [
    '1', '2', '3', '4', '5', '6', '7']
  # example with C/U/G residues (P atoms only) where three residues of the
  # supplied sequence are missing from the model
  input_2hok = """\
ATOM   301  P  P     . C   A 1 15 ? 15.802 44.045 80.094 1.00 59.36 ? ? ? ? ? ? 23  C   A P     1
ATOM   321  P  P     . C   A 1 16 ? 12.286 47.301 82.617 1.00 68.27 ? ? ? ? ? ? 24  C   A P     1
ATOM   341  P  P     . U   A 1 17 ? 6.815  51.648 82.739 1.00 78.03 ? ? ? ? ? ? 25  U   A P     1
ATOM   361  P  P     . G   A 1 21 ? 7.042  52.289 91.645 1.00 96.25 ? ? ? ? ? ? 29  G   A P     1
ATOM   384  P  P     . C   A 1 22 ? 7.024  46.751 90.841 1.00 84.69 ? ? ? ? ? ? 30  C   A P     1
ATOM   404  P  P     . G   A 1 23 ? 7.477  40.933 88.377 1.00 81.65 ? ? ? ? ? ? 31  G   A P     1
"""
  sequence_2hok = iotbx.bioinformatics.sequence("CCUUCUGCG")
  pdb_in = iotbx.pdb.input(
    lines=(pdb_atom_site_loop_header+input_2hok).splitlines(),
    source_info=None)
  pdb_hierarchy = pdb_in.construct_hierarchy()
  cif_block = pdb_hierarchy.as_cif_block_with_sequence(
    sequences=[sequence_2hok])
  assert list(cif_block['_entity_poly_seq.mon_id']) == [
    'C', 'C', 'U', 'U', 'C', 'U', 'G', 'C', 'G']
  assert cif_block['_entity_poly.pdbx_seq_one_letter_code'][0] == sequence_2hok.sequence
  assert cif_block['_entity_poly.pdbx_seq_one_letter_code_can'][0] == sequence_2hok.sequence
  assert list(cif_block['_atom_site.auth_seq_id']) == [
    '23', '24', '25', '29', '30', '31']
  # label_seq_id skips 4-6, the sequence positions absent from the model
  assert list(cif_block['_atom_site.label_seq_id']) == [
    '1', '2', '3', '7', '8', '9']
  #
  # PDB-format example (no mmCIF header) with a polymer followed by several
  # HETATM groups and waters in the same chain
  input_3tpy = """\
ATOM      2  CA  GLN A  24       2.586  40.220  34.036  1.00 41.54           C
ATOM      8  CA  LYS A  25       1.265  43.698  34.904  1.00 25.47           C
ATOM     17  CA  GLN A  26       3.834  45.984  36.538  1.00 22.91           C
ATOM     26  CA  PRO A  27       2.835  48.614  39.135  1.00 19.20           C
ATOM     33  CA  ILE A  28       3.972  52.206  39.293  1.00 18.70           C
ATOM     41  CA  SER A  29       6.403  51.332  42.097  1.00 22.63           C
TER
HETATM  852 MG    MG A 999     -12.415  61.451  32.421  0.70 28.10          MG
HETATM  853  C   TRS A 153      -0.078  70.151  24.773  0.33 24.86           C
HETATM  877  PA BDUP A 777      -9.339  60.563  31.137  0.70 19.64           P
HETATM  881  PB BDUP A 777     -11.768  59.969  29.491  0.70 27.76           P
HETATM  885  PG BDUP A 777     -13.098  58.529  31.620  0.70 33.91           P
HETATM  905  P  AUMP A 154      -9.010  60.358  31.334  0.30 11.42           P
HETATM  909  O   HOH A 155      -0.197  60.723  27.343  1.00 17.17           O
HETATM  910  O   HOH A 156     -10.293  62.567  35.648  1.00 19.43           O
"""
  sequence_3tpy = iotbx.bioinformatics.sequence("QKQPIS")
  pdb_in = iotbx.pdb.input(lines=(input_3tpy).splitlines(), source_info=None)
  pdb_hierarchy = pdb_in.construct_hierarchy()
  cif_block = pdb_hierarchy.as_cif_block_with_sequence(
    sequences=[sequence_3tpy])
  # one entity per distinct hetero group (MG, TRS, DUP, UMP), then water
  assert list(cif_block["_entity.type"]) == [
    'polymer', 'non-polymer', 'non-polymer', 'non-polymer', 'non-polymer', 'water']
  assert list(cif_block["_atom_site.label_entity_id"]) == [
    '1', '1', '1', '1', '1', '1', '2', '3', '4', '4', '4', '5', '6', '6']
  # only polymer atoms receive a label_seq_id; everything else gets '.'
  assert list(cif_block["_atom_site.label_seq_id"]) == [
    '1', '2', '3', '4', '5', '6', '.', '.', '.', '.', '.', '.', '.', '.']
  #
  # PDB-format example where the model is missing residues at both ends and in
  # the middle of the supplied sequence
  input_3tgr = """\
ATOM   2449  CA  GLY A 459     -17.536  10.137  41.979  1.00181.52           C
ATOM   2453  CA  GLN A 460     -15.862  12.780  44.128  1.00192.51           C
ATOM   2462  CA  ASN A 463     -19.198   8.054  50.455  1.00180.96           C
ATOM   2470  CA  ASP A 464     -19.235   4.661  52.197  1.00143.07           C
ATOM   2478  CA  THR A 465     -20.893   2.988  49.198  1.00 91.96           C
"""
  sequence_3tgr = iotbx.bioinformatics.sequence("DGGQSNETNDTET")
  pdb_in = iotbx.pdb.input(lines=(input_3tgr).splitlines(), source_info=None)
  pdb_hierarchy = pdb_in.construct_hierarchy()
  cif_block = pdb_hierarchy.as_cif_block_with_sequence(
    sequences=[sequence_3tgr])
  assert list(cif_block["_entity_poly_seq.mon_id"]) == [
    'ASP', 'GLY', 'GLY', 'GLN', 'SER', 'ASN', 'GLU', 'THR', 'ASN', 'ASP', 'THR',
    'GLU', 'THR']
  assert list(cif_block["_atom_site.label_comp_id"]) == [
    'GLY', 'GLN', 'ASN', 'ASP', 'THR']
  # label_seq_id is the position in the supplied sequence, not the PDB resseq
  assert list(cif_block["_atom_site.label_seq_id"]) == ['3', '4', '9', '10', '11']
  assert cif_block["_entity_poly.pdbx_seq_one_letter_code"][0] == 'DGGQSNETNDTET'
  # PDB-format example with a short model fragment (including a numbering
  # gap, 346 -> 349) placed within a much longer supplied sequence
  input_2im9 = """\
ATOM   2423  CA  PRO A 345       2.114  16.158   0.161  1.00 29.14           C
ATOM   2430  CA  VAL A 346      -1.223  17.837   0.938  1.00 31.05           C
ATOM   2437  CA  CYS A 349      -4.081  15.852   7.014  0.50 28.57           C
ATOM   2443  CA  GLN A 350      -6.176  14.041   9.639  0.50 30.62           C
ATOM   2452  CA  LEU A 351      -6.631  10.729   7.797  0.50 31.53           C
ATOM   2460  CA  PHE A 352      -5.220   9.172   4.620  0.50 31.95           C
"""
  sequence_2im9 = iotbx.bioinformatics.sequence("SSPTIKGINIQVVLPEKPVSNGCQLFDIR")
  pdb_in = iotbx.pdb.input(lines=(input_2im9).splitlines(), source_info=None)
  pdb_hierarchy = pdb_in.construct_hierarchy()
  cif_block = pdb_hierarchy.as_cif_block_with_sequence(
    sequences=[sequence_2im9])
  assert list(cif_block["_entity_poly_seq.mon_id"]) == [
    'SER', 'SER', 'PRO', 'THR', 'ILE', 'LYS', 'GLY', 'ILE', 'ASN', 'ILE', 'GLN',
    'VAL', 'VAL', 'LEU', 'PRO', 'GLU', 'LYS', 'PRO', 'VAL', 'SER', 'ASN', 'GLY',
    'CYS', 'GLN', 'LEU', 'PHE', 'ASP', 'ILE', 'ARG']
  assert list(cif_block["_atom_site.label_seq_id"]) == [
    '18', '19', '23', '24', '25', '26']
def correct_sequence (pdb_hierarchy,
    sequences,
    truncate_to_cbeta=False,
    out=sys.stdout) :
  """
  Modify the sequence for the pdb hierarchy to match that of the aligned
  sequence.  This will remove incompatible atoms; the sidechains will still
  need to be extended separately.  For proteins only - mismatches in nucleic
  acids will only result in a warning.

  The hierarchy is modified in place and its atom i_seq values are reset
  before returning.

  :param pdb_hierarchy: iotbx.pdb.hierarchy.root object
  :param sequences: list of iotbx.bioinformatics.sequence objects
  :param truncate_to_cbeta: keep only the atoms whose names occur in the
                            idealized alanine, i.e. chop off the sidechain
                            (default: leave atoms common to the old and new
                            residue in place)
  :param out: output filehandle (default = stdout)
  :returns: number of atom_group objects renamed
  """
  from mmtbx.monomer_library import idealized_aa
  import mmtbx.validation.sequence
  from iotbx.pdb.amino_acid_codes import three_letter_given_one_letter
  seq_validation = mmtbx.validation.sequence.validation(
    pdb_hierarchy=pdb_hierarchy,
    sequences=sequences,
    log=out)
  # Nucleic acid chains are never modified; just report how many mismatches
  # are being left alone.
  for chain_seq in seq_validation.chains :
    if (chain_seq.chain_type == mmtbx.validation.sequence.NUCLEIC_ACID) :
      if (len(chain_seq.mismatch) > 0) :
        # both placeholders need a value (count and chain id)
        print >> out, \
          "  WARNING: will skip %d mismatches in nucleic acid chain '%s'" % \
          (len(chain_seq.mismatch), chain_seq.chain_id)
  # Allowed atom names for each idealized residue; entries whose key contains
  # "_h" are skipped.
  res_dict = idealized_aa.residue_dict()
  expected_names = {}
  for resname in res_dict.keys() :
    if (not "_h" in resname) :
      ideal_res = res_dict[resname]
      expected_names[resname] = set([ a.name for a in ideal_res.atoms() ])
  n_changed = 0
  for chain in pdb_hierarchy.only_model().chains() :
    if (not chain.is_protein()) :
      continue
    for chain_seq in seq_validation.chains :
      if (chain.id == chain_seq.chain_id) and (len(chain_seq.mismatch) > 0) :
        for residue_group in chain.residue_groups() :
          resid = residue_group.resid()
          if (resid in chain_seq.mismatch) :
            # mismatch and actual_code are indexed in parallel
            idx = chain_seq.mismatch.index(resid)
            new_code = chain_seq.actual_code[idx]
            new_resname = three_letter_given_one_letter.get(new_code)
            if (new_resname is not None) :
              expected_atoms = expected_names[new_resname.lower()]
              if (truncate_to_cbeta) :
                expected_atoms = expected_names["ala"]
              # Remember the original name before any atom_group is renamed,
              # otherwise the log line below would report "NEW --> NEW".
              old_resname = residue_group.atom_groups()[0].resname
              # Count removed atoms over all alternate conformers of this
              # residue rather than only the last one.
              n_removed = 0
              for atom_group in residue_group.atom_groups() :
                n_changed += 1
                atom_group.resname = new_resname
                for atom in atom_group.atoms() :
                  if (not atom.name in expected_atoms) :
                    atom_group.remove_atom(atom)
                    n_removed += 1
              print >> out, "  chain '%s' %s %s --> %s (%d atoms removed)" % \
                (chain.id, resid, old_resname, new_resname, n_removed)
  # atoms may have been removed, so renumber the remaining ones
  pdb_hierarchy.atoms().reset_i_seq()
  return n_changed