Esempio n. 1
0
  def __init__ (self,
      model,
      pdb_hierarchy=None,   # keep for mmtbx.validation_summary (multiple models)
      fmodel=None,
      fmodel_neutron=None,
      sequences=None,
      flags=None,
      header_info=None,
      raw_data=None,
      unmerged_data=None,
      keep_hydrogens=True,
      nuclear=False,
      save_probe_unformatted_file=None,
      show_hydrogen_outliers=False,
      min_cc_two_fofc=0.8,
      n_bins_data=10,
      count_anomalous_pairs_separately=False,
      use_internal_variance=True,
      outliers_only=True,
      use_pdb_header_resolution_cutoffs=False,
      file_name=None,
      ligand_selection=None,
      rotamer_library="8000",
      map_params=None) :
    """
    Run the validation analyses selected by `flags` and store each result as
    an attribute of this object.

    Every name in self.__slots__ is first set to None, so an analysis that is
    skipped leaves its attribute as None.

    Parameters
    ----------
    model : object or None
        When not None, the hierarchy, xray structure, geometry restraints
        manager, crystal symmetry and all_chain_proxies are all taken from it
        (get_hierarchy(), get_xray_structure(), get_restraints_manager(),
        crystal_symmetry(), all_chain_proxies).
    pdb_hierarchy : object or None
        Only used when `model` is None, in which case it is required.
    fmodel : object or None
        Fallback source of the xray structure and crystal symmetry; also used
        for data statistics, real-space correlation, waters, merging
        statistics, xtriage and the Wilson B value.
    fmodel_neutron : object or None
        Used for `neutron_stats` (when flags.rfactors) and as the Wilson B
        fallback when `fmodel` is None.
    sequences : object or None
        Passed to sequence.validation when not None and flags.seq is set.
    flags : object or None
        Object whose attributes (real_space, waters, nqh, rna, model_stats,
        restraints, seq, rfactors, xtriage) switch individual analyses on.
        Defaults to molprobity_flags().
    header_info : object or None
        Stored on the instance; its d_min / d_max are used to filter `fmodel`
        when `use_pdb_header_resolution_cutoffs` is true.
    raw_data, unmerged_data : object or None
        Forwarded to the data statistics, merging statistics and xtriage.
    keep_hydrogens, nuclear : bool
        Forwarded to the geometry statistics as use_hydrogens / use_nuclear;
        `nuclear` also determines ignore_hd and is passed to validate_H.
    save_probe_unformatted_file : str or None
        If not None, the unformatted probe output is written to this path.
    show_hydrogen_outliers, file_name
        Accepted for interface compatibility; not referenced in this body.
    min_cc_two_fofc : float
        Passed as cc_min to experimental.real_space.
    n_bins_data : int
        Number of bins passed to the data / merging statistics.
    count_anomalous_pairs_separately, use_internal_variance : bool
        Forwarded to the data / merging statistics.
    outliers_only : bool
        Forwarded to the RNA validation.
    ligand_selection : object or None
        Forwarded to model_properties.model_statistics.
    rotamer_library : str
        Must be "8000".
    map_params : object or None
        When it names a map file, or both a map coefficients file and label
        (map_params.input.maps.*), real-space and water analyses use the map
        instead of `fmodel`.

    Raises
    ------
    AssertionError
        If `rotamer_library` is not "8000", or if both `model` and
        `pdb_hierarchy` are None.
    Sorry
        If the probe file cannot be written.
    """
    assert rotamer_library == "8000", "data_version given to RotamerEval not recognized."
    for name in self.__slots__ :
      setattr(self, name, None)

    # Resolve the default flags before anything reads them: the map-based
    # analyses below access flags.real_space / flags.waters, which previously
    # failed with AttributeError when flags was left as None.
    if (flags is None) :
      flags = molprobity_flags()

    # use objects from model
    self.model = model
    if (self.model is not None):
      pdb_hierarchy = self.model.get_hierarchy()
      xray_structure = self.model.get_xray_structure()
      geometry_restraints_manager = self.model.get_restraints_manager().geometry
      crystal_symmetry = self.model.crystal_symmetry()
      all_chain_proxies = self.model.all_chain_proxies
    else:
      assert (pdb_hierarchy is not None)
      xray_structure = None
      geometry_restraints_manager = None
      crystal_symmetry = None
      all_chain_proxies = None

    # very important - the i_seq attributes may be extracted later
    pdb_hierarchy.atoms().reset_i_seq()
    self.pdb_hierarchy = pdb_hierarchy
    if (xray_structure is None) :
      if (fmodel is not None) :
        xray_structure = fmodel.xray_structure
      elif (crystal_symmetry is not None) :
        xray_structure = pdb_hierarchy.extract_xray_structure(
          crystal_symmetry=crystal_symmetry)
    self.crystal_symmetry = crystal_symmetry
    if (crystal_symmetry is None) and (fmodel is not None) :
      self.crystal_symmetry = fmodel.f_obs().crystal_symmetry()

    # use maps (fmodel is not used)
    # run earlier since pdb_hierarchy gets modified
    use_maps = False
    if (map_params is not None):
      use_maps = ( (map_params.input.maps.map_file_name) or
                   ( (map_params.input.maps.map_coefficients_file_name) and
                     (map_params.input.maps.map_coefficients_label) ) )
    if (use_maps):
      if (flags.real_space):
        self.real_space = experimental.real_space(
          fmodel=None,
          model=self.model,
          cc_min=min_cc_two_fofc,
          molprobity_map_params=map_params.input.maps)
      if (flags.waters):
        self.waters = waters.waters(
          pdb_hierarchy=pdb_hierarchy,
          xray_structure=xray_structure,
          fmodel=None,
          collect_all=True,
          molprobity_map_params=map_params.input.maps)

    self.header_info = header_info
    import mmtbx.model.statistics
    self.model_statistics_geometry = mmtbx.model.statistics.geometry(
      pdb_hierarchy               = pdb_hierarchy,
      geometry_restraints_manager = geometry_restraints_manager,
      use_hydrogens               = keep_hydrogens,
      use_nuclear                 = nuclear)
    self.model_statistics_geometry_result = \
      self.model_statistics_geometry.result()
    # expose the individual validation objects held by the geometry result
    self.ramalyze  = self.model_statistics_geometry_result.ramachandran.ramalyze
    self.omegalyze = self.model_statistics_geometry_result.omega.omegalyze
    self.rotalyze  = self.model_statistics_geometry_result.rotamer.rotalyze
    self.cbetadev  = self.model_statistics_geometry_result.c_beta.cbetadev
    self.clashes   = self.model_statistics_geometry_result.clash.clashes
    if pdb_hierarchy.contains_protein() :
      self.find_missing_atoms(out=null_out())
      if (flags.nqh) :
        self.nqh_flips = clashscore.nqh_flips(
          pdb_hierarchy=pdb_hierarchy)
    if (pdb_hierarchy.contains_rna() and flags.rna and
        libtbx.env.has_module(name="suitename")) :
      if (geometry_restraints_manager is not None) :
        self.rna = rna_validate.rna_validation(
          pdb_hierarchy=pdb_hierarchy,
          geometry_restraints_manager=geometry_restraints_manager,
          outliers_only=outliers_only,
          params=None)
    if (flags.model_stats) and (xray_structure is not None) :
      self.model_stats = model_properties.model_statistics(
        pdb_hierarchy=pdb_hierarchy,
        xray_structure=xray_structure,
        all_chain_proxies=all_chain_proxies,
        ignore_hd=(not nuclear),
        ligand_selection=ligand_selection)
    if (geometry_restraints_manager is not None) and (flags.restraints) :
      assert (xray_structure is not None)
      self.restraints = restraints.combined(
        pdb_hierarchy=pdb_hierarchy,
        xray_structure=xray_structure,
        geometry_restraints_manager=geometry_restraints_manager,
        ignore_hd=(not nuclear),
        cdl=getattr(all_chain_proxies, "use_cdl", None))
    if (sequences is not None) and (flags.seq) :
      self.sequence = sequence.validation(
        pdb_hierarchy=pdb_hierarchy,
        sequences=sequences,
        log=null_out(),
        include_secondary_structure=True,
        extract_coordinates=True)

    if (fmodel is not None) :
      if (use_pdb_header_resolution_cutoffs) and (header_info is not None) :
        fmodel = fmodel.resolution_filter(
          d_min=header_info.d_min,
          d_max=header_info.d_max)
      if (flags.rfactors) :
        self.data_stats = experimental.data_statistics(fmodel,
          raw_data=raw_data,
          n_bins=n_bins_data,
          count_anomalous_pairs_separately=count_anomalous_pairs_separately)

      if (not use_maps): # if maps are used, keep previous results
        if (flags.real_space):
          self.real_space = experimental.real_space(
            model=model,
            fmodel=fmodel,
            cc_min=min_cc_two_fofc)
        if (flags.waters) :
          self.waters = waters.waters(
            pdb_hierarchy=pdb_hierarchy,
            xray_structure=xray_structure,
            fmodel=fmodel,
            collect_all=True)

      if (unmerged_data is not None) :
        self.merging = experimental.merging_and_model_statistics(
          f_obs=fmodel.f_obs(),
          f_model=fmodel.f_model(),
          r_free_flags=fmodel.r_free_flags(),
          unmerged_i_obs=unmerged_data,
          anomalous=count_anomalous_pairs_separately,
          use_internal_variance=use_internal_variance,
          n_bins=n_bins_data)
      if (flags.xtriage) :
        import mmtbx.scaling.xtriage
        f_model = abs(fmodel.f_model()).set_observation_type_xray_amplitude()
        if (raw_data is not None) :
          # restrict both arrays to the reflections they have in common
          f_model, obs = f_model.common_sets(other=raw_data)
        else :
          obs = fmodel.f_obs()
        self.xtriage = mmtbx.scaling.xtriage.xtriage_analyses(
          miller_obs=obs,
          miller_calc=f_model,
          unmerged_obs=unmerged_data, # XXX some redundancy here...
          text_out=null_out())
    if (fmodel_neutron is not None) and (flags.rfactors) :
      self.neutron_stats = experimental.data_statistics(fmodel_neutron,
        n_bins=n_bins_data,
        count_anomalous_pairs_separately=False)
    if (pdb_hierarchy.models_size() == 1) :
      self._multi_criterion = multi_criterion_view(pdb_hierarchy)

    # wilson B
    self.wilson_b = None
    if (fmodel is not None):
      self.wilson_b = fmodel.wilson_b()
    elif (fmodel_neutron is not None):
      self.wilson_b = fmodel_neutron.wilson_b()

    # validate hydrogens
    self.hydrogens = None
    if self.model is not None and self.model.has_hd():
      # import here to avoid circular import issues
      from mmtbx.hydrogens.validate_H import validate_H, validate_H_results
      hydrogens = validate_H(model, nuclear)
      hydrogens.validate_inputs()
      hydrogens.run()
      self.hydrogens = validate_H_results(hydrogens.get_results())

    # write probe file if needed (CLI and GUI)
    if (save_probe_unformatted_file is not None):
      pcm = self.clashes.probe_clashscore_manager
      try:
        with open(save_probe_unformatted_file, 'w') as f:
          f.write(pcm.probe_unformatted)
        self.clashes.probe_file = save_probe_unformatted_file
      except IOError as err:
        raise Sorry('%s could not be written correctly.\n%s' %
                    (save_probe_unformatted_file, err))
Esempio n. 2
0
  def __init__ (self,
      pdb_hierarchy,
      xray_structure=None,
      fmodel=None,
      fmodel_neutron=None,
      geometry_restraints_manager=None,
      crystal_symmetry=None,
      sequences=None,
      flags=None,
      header_info=None,
      raw_data=None,
      unmerged_data=None,
      all_chain_proxies=None,
      keep_hydrogens=True,
      nuclear=False,
      save_probe_unformatted_file=None,
      show_hydrogen_outliers=False,
      min_cc_two_fofc=0.8,
      n_bins_data=10,
      count_anomalous_pairs_separately=False,
      use_internal_variance=True,
      outliers_only=True,
      use_pdb_header_resolution_cutoffs=False,
      file_name=None,
      ligand_selection=None,
      rotamer_library="8000",
      map_params=None) :
    """
    Run the validation analyses enabled in `flags` on `pdb_hierarchy` and
    keep each result as an attribute of this object.

    All names in self.__slots__ are reset to None first, so any analysis that
    is not run leaves its attribute as None.  When `xray_structure` is not
    given it is taken from `fmodel`, or else extracted from the hierarchy
    using `crystal_symmetry`; when `crystal_symmetry` is not given it is taken
    from the observations of `fmodel`.  `flags` defaults to
    molprobity_flags().

    If `map_params` names a map file (or a map coefficients file together
    with a label), the real-space and water analyses are run against that map
    and are not repeated with `fmodel`.

    `show_hydrogen_outliers` and `file_name` are accepted but not referenced
    in this body.  `rotamer_library` must be "8000" (AssertionError
    otherwise).
    """
    assert rotamer_library == "8000", "data_version given to RotamerEval not recognized."
    # blank out every slot so skipped analyses read back as None
    for slot_name in self.__slots__ :
      setattr(self, slot_name, None)
    # very important - the i_seq attributes may be extracted later
    pdb_hierarchy.atoms().reset_i_seq()
    self.pdb_hierarchy = pdb_hierarchy

    have_fmodel = (fmodel is not None)
    have_restraints = (geometry_restraints_manager is not None)
    ignore_hd = (not nuclear)

    # fill in the xray structure and symmetry from whatever was supplied
    if (xray_structure is None) :
      if (have_fmodel) :
        xray_structure = fmodel.xray_structure
      elif (crystal_symmetry is not None) :
        xray_structure = pdb_hierarchy.extract_xray_structure(
          crystal_symmetry=crystal_symmetry)
    if (crystal_symmetry is None) and (have_fmodel) :
      self.crystal_symmetry = fmodel.f_obs().crystal_symmetry()
    else :
      self.crystal_symmetry = crystal_symmetry
    self.header_info = header_info
    if (flags is None) :
      flags = molprobity_flags()

    # protein-only geometry checks
    if pdb_hierarchy.contains_protein() :
      if (flags.ramalyze) :
        self.ramalyze = ramalyze.ramalyze(
          pdb_hierarchy=pdb_hierarchy, outliers_only=outliers_only,
          out=null_out(), quiet=True)
      if (flags.omegalyze) :
        self.omegalyze = omegalyze.omegalyze(
          pdb_hierarchy=pdb_hierarchy, nontrans_only=outliers_only,
          out=null_out(), quiet=True)
      if (flags.rotalyze) :
        self.rotalyze = rotalyze.rotalyze(
          pdb_hierarchy=pdb_hierarchy, data_version=rotamer_library,
          outliers_only=outliers_only, out=null_out(), quiet=True)
      if (flags.cbetadev) :
        self.cbetadev = cbetadev.cbetadev(
          pdb_hierarchy=pdb_hierarchy, outliers_only=outliers_only,
          out=null_out(), quiet=True)
      if (flags.nqh) :
        self.nqh_flips = clashscore.nqh_flips(pdb_hierarchy=pdb_hierarchy)

    # RNA validation needs the suitename module and geometry restraints
    if (pdb_hierarchy.contains_rna() and flags.rna and
        libtbx.env.has_module(name="suitename") and have_restraints) :
      self.rna = rna_validate.rna_validation(
        pdb_hierarchy=pdb_hierarchy,
        geometry_restraints_manager=geometry_restraints_manager,
        outliers_only=outliers_only, params=None)
    if (flags.clashscore) :
      self.clashes = clashscore.clashscore(
        pdb_hierarchy=pdb_hierarchy,
        save_probe_unformatted_file=save_probe_unformatted_file,
        nuclear=nuclear, keep_hydrogens=keep_hydrogens,
        out=null_out(), verbose=False)
    if (flags.model_stats) and (xray_structure is not None) :
      self.model_stats = model_properties.model_statistics(
        pdb_hierarchy=pdb_hierarchy, xray_structure=xray_structure,
        all_chain_proxies=all_chain_proxies, ignore_hd=ignore_hd,
        ligand_selection=ligand_selection)
    if (have_restraints) and (flags.restraints) :
      assert (xray_structure is not None)
      self.restraints = restraints.combined(
        pdb_hierarchy=pdb_hierarchy, xray_structure=xray_structure,
        geometry_restraints_manager=geometry_restraints_manager,
        ignore_hd=ignore_hd,
        cdl=getattr(all_chain_proxies, "use_cdl", None))
    if (sequences is not None) and (flags.seq) :
      self.sequence = sequence.validation(
        pdb_hierarchy=pdb_hierarchy, sequences=sequences, log=null_out(),
        include_secondary_structure=True, extract_coordinates=True)

    # use maps (fmodel is not used)
    map_inputs = None
    use_maps = False
    if (map_params is not None) :
      map_inputs = map_params.input.maps
      use_maps = ( (map_inputs.map_file_name) or
                   ( (map_inputs.map_coefficients_file_name) and
                     (map_inputs.map_coefficients_label) ) )
    if (use_maps) :
      if (flags.real_space) :
        self.real_space = experimental.real_space(
          fmodel=None, pdb_hierarchy=pdb_hierarchy, cc_min=min_cc_two_fofc,
          molprobity_map_params=map_inputs)
      if (flags.waters) :
        self.waters = waters.waters(
          pdb_hierarchy=pdb_hierarchy, xray_structure=xray_structure,
          fmodel=None, collect_all=True,
          molprobity_map_params=map_inputs)

    # analyses that need experimental data
    if (have_fmodel) :
      if (use_pdb_header_resolution_cutoffs) and (header_info is not None) :
        fmodel = fmodel.resolution_filter(
          d_min=header_info.d_min, d_max=header_info.d_max)
      if (flags.rfactors) :
        self.data_stats = experimental.data_statistics(fmodel,
          raw_data=raw_data, n_bins=n_bins_data,
          count_anomalous_pairs_separately=count_anomalous_pairs_separately)

      if (not use_maps) : # if maps are used, keep previous results
        if (flags.real_space) :
          self.real_space = experimental.real_space(
            fmodel=fmodel, pdb_hierarchy=pdb_hierarchy,
            cc_min=min_cc_two_fofc)
        if (flags.waters) :
          self.waters = waters.waters(
            pdb_hierarchy=pdb_hierarchy, xray_structure=xray_structure,
            fmodel=fmodel, collect_all=True)

      if (unmerged_data is not None) :
        self.merging = experimental.merging_and_model_statistics(
          f_obs=fmodel.f_obs(), f_model=fmodel.f_model(),
          r_free_flags=fmodel.r_free_flags(),
          unmerged_i_obs=unmerged_data,
          anomalous=count_anomalous_pairs_separately,
          use_internal_variance=use_internal_variance,
          n_bins=n_bins_data)
      if (flags.xtriage) :
        import mmtbx.scaling.xtriage
        f_calc = abs(fmodel.f_model()).set_observation_type_xray_amplitude()
        if (raw_data is None) :
          xtriage_obs = fmodel.f_obs()
        else :
          # keep only the reflections shared with the raw data
          f_calc, xtriage_obs = f_calc.common_sets(other=raw_data)
        self.xtriage = mmtbx.scaling.xtriage.xtriage_analyses(
          miller_obs=xtriage_obs, miller_calc=f_calc,
          unmerged_obs=unmerged_data, # XXX some redundancy here...
          text_out=null_out())
    if (fmodel_neutron is not None) and (flags.rfactors) :
      self.neutron_stats = experimental.data_statistics(fmodel_neutron,
        n_bins=n_bins_data, count_anomalous_pairs_separately=False)
    # the multi-criterion view is only built for single-model hierarchies
    if (pdb_hierarchy.models_size() == 1) :
      self._multi_criterion = multi_criterion_view(pdb_hierarchy)
Esempio n. 3
0
def exercise():
    import libtbx.utils
    if (libtbx.utils.detect_multiprocessing_problem() is not None):
        print("multiprocessing not available, skipping this test")
        return
    if (os.name == "nt"):
        print(
            "easy_mp fixed_func not supported under Windows, skipping this test"
        )
        return
    from mmtbx.validation.sequence import validation, get_sequence_n_copies, \
      get_sequence_n_copies_from_files
    import iotbx.bioinformatics
    import iotbx.pdb
    from iotbx import file_reader
    import libtbx.load_env  # import dependency
    from libtbx.test_utils import Exception_expected, contains_lines, approx_equal
    from six.moves import cStringIO as StringIO
    pdb_in = iotbx.pdb.input(source_info=None,
                             lines="""\
ATOM      2  CA  ARG A  10      -6.299  36.344   7.806  1.00 55.20           C
ATOM     25  CA  TYR A  11      -3.391  33.962   7.211  1.00 40.56           C
ATOM     46  CA  ALA A  12      -0.693  34.802   4.693  1.00 67.95           C
ATOM     56  CA  ALA A  13       0.811  31.422   3.858  1.00 57.97           C
ATOM     66  CA  GLY A  14       4.466  31.094   2.905  1.00 49.24           C
ATOM     73  CA  ALA A  15       7.163  28.421   2.671  1.00 54.70           C
ATOM     83  CA  ILE A  16       6.554  24.685   2.957  1.00 51.79           C
ATOM    102  CA  LEU A  17       7.691  23.612   6.406  1.00 42.30           C
ATOM    121  CA  PTY A  18       7.292  19.882   5.861  1.00 36.68           C
ATOM    128  CA  PHE A  19       5.417  16.968   4.327  1.00 44.99           C
ATOM    148  CA  GLY A  20       3.466  14.289   6.150  1.00 41.99           C
ATOM    155  CA  GLY A  21       1.756  11.130   4.965  1.00 35.77           C
ATOM    190  CA  ALA A  24       1.294  19.658   3.683  1.00 47.02           C
ATOM    200  CA  VAL A  24A      2.361  22.009   6.464  1.00 37.13           C
ATOM    216  CA  HIS A  25       2.980  25.633   5.535  1.00 42.52           C
ATOM    234  CA  LEU A  26       4.518  28.425   7.577  1.00 47.63           C
ATOM    253  CA  ALA A  27       2.095  31.320   7.634  1.00 38.61           C
ATOM    263  CA  ARG A  28       1.589  34.719   9.165  1.00 37.04           C
END""")
    seq1 = iotbx.bioinformatics.sequence(
        "MTTPSHLSDRYELGEILGFGGMSEVHLARD".lower())
    v = validation(pdb_hierarchy=pdb_in.construct_hierarchy(),
                   sequences=[seq1],
                   log=null_out(),
                   nproc=1)
    out = StringIO()
    v.show(out=out)
    assert contains_lines(
        out.getvalue(), """\
  sequence identity: 76.47%
  13 residue(s) missing from PDB chain (9 at start, 1 at end)
  2 gap(s) in chain
  4 mismatches to sequence
    residue IDs:  12 13 15 24""")
    cif_block = v.sequence_as_cif_block()
    assert list(cif_block['_struct_ref.pdbx_seq_one_letter_code']) == [
        ';MTTPSHLSDRYELGEILGFGGMSEVHLARD\n;'
    ]
    # assert approx_equal(cif_block['_struct_ref_seq.pdbx_auth_seq_align_beg'],
    #                     ['10', '14', '16', '19', '24'])
    # assert approx_equal(cif_block['_struct_ref_seq.pdbx_auth_seq_align_end'],
    #                     ['11', '14', '17', '21', '28'])
    # assert approx_equal(cif_block['_struct_ref_seq.db_align_beg'],
    #                     ['10', '14', '16', '19', '25'])
    # assert approx_equal(cif_block['_struct_ref_seq.db_align_end'],
    #                     ['11', '14', '17', '21', '29'])
    # assert cif_block['_struct_ref_seq.pdbx_seq_align_beg_ins_code'][4] == 'A'
    seq2 = iotbx.bioinformatics.sequence("MTTPSHLSDRYELGEILGFGGMSEVHLA")
    v = validation(pdb_hierarchy=pdb_in.construct_hierarchy(),
                   sequences=[seq2],
                   log=null_out(),
                   nproc=1)
    out = StringIO()
    v.show(out=out)
    assert contains_lines(
        out.getvalue(), """\
  1 residues not found in sequence
    residue IDs:  28""")
    try:
        v = validation(pdb_hierarchy=pdb_in.construct_hierarchy(),
                       sequences=[],
                       log=null_out(),
                       nproc=1)
    except AssertionError:
        pass
    else:
        raise Exception_expected
    cif_block = v.sequence_as_cif_block()
    print(list(cif_block['_struct_ref.pdbx_seq_one_letter_code']))
    assert list(cif_block['_struct_ref.pdbx_seq_one_letter_code']) == [
        ';MTTPSHLSDRYELGEILGFGGMSEVHLA\n;'
    ]
    # assert approx_equal(cif_block['_struct_ref_seq.pdbx_auth_seq_align_end'],
    #                     ['11', '14', '17', '21', '27'])
    # assert approx_equal(cif_block['_struct_ref_seq.db_align_end'],
    #                     ['11', '14', '17', '21', '28'])
    #
    pdb_in2 = iotbx.pdb.input(source_info=None,
                              lines="""\
ATOM      2  CA  ARG A  10      -6.299  36.344   7.806  1.00 55.20           C
ATOM     25  CA  TYR A  11      -3.391  33.962   7.211  1.00 40.56           C
ATOM     46  CA  ALA A  12      -0.693  34.802   4.693  1.00 67.95           C
ATOM     56  CA  ALA A  13       0.811  31.422   3.858  1.00 57.97           C
ATOM     66  CA  GLY A  14       4.466  31.094   2.905  1.00 49.24           C
ATOM     73  CA  ALA A  15       7.163  28.421   2.671  1.00 54.70           C
ATOM     83  CA  ILE A  16       6.554  24.685   2.957  1.00 51.79           C
ATOM    102  CA  LEU A  17       7.691  23.612   6.406  1.00 42.30           C
TER
ATOM   1936  P     G B   2     -22.947 -23.615  15.323  1.00123.20           P
ATOM   1959  P     C B   3     -26.398 -26.111  19.062  1.00110.06           P
ATOM   1979  P     U B   4     -29.512 -30.638  21.164  1.00101.06           P
ATOM   1999  P     C B   5     -30.524 -36.109  21.527  1.00 92.76           P
ATOM   2019  P     U B   6     -28.684 -41.458  21.223  1.00 87.42           P
ATOM   2062  P     G B   8     -18.396 -45.415  21.903  1.00 80.35           P
ATOM   2085  P     A B   9     -13.852 -43.272  24.156  1.00 77.76           P
ATOM   2107  P     G B  10      -8.285 -44.242  26.815  1.00 79.86           P
END
""")
    seq3 = iotbx.bioinformatics.sequence("AGCUUUGGAG")
    v = validation(pdb_hierarchy=pdb_in2.construct_hierarchy(),
                   sequences=[seq2, seq3],
                   log=null_out(),
                   nproc=1,
                   extract_coordinates=True)
    out = StringIO()
    v.show(out=out)
    cif_block = v.sequence_as_cif_block()
    assert approx_equal(cif_block['_struct_ref.pdbx_seq_one_letter_code'],
                        [';MTTPSHLSDRYELGEILGFGGMSEVHLA\n;', ';AGCUUUGGAG\n;'])
    # assert approx_equal(cif_block['_struct_ref_seq.pdbx_auth_seq_align_beg'],
    #                     ['10', '14', '16', '2', '6', '8'])
    # assert approx_equal(cif_block['_struct_ref_seq.pdbx_auth_seq_align_end'],
    #                     ['11', '14', '17', '4', '6', '10'])
    assert (len(v.chains[0].get_outliers_table()) == 3)
    assert (len(v.get_table_data()) == 4)
    assert approx_equal(
        v.chains[0].get_mean_coordinate_for_alignment_range(11, 11),
        (-0.693, 34.802, 4.693))
    assert approx_equal(
        v.chains[0].get_mean_coordinate_for_alignment_range(11, 14),
        (2.93675, 31.43475, 3.53175))
    assert (v.chains[0].get_highlighted_residues() == [11, 12, 14])
    assert contains_lines(
        out.getvalue(), """\
  3 mismatches to sequence
    residue IDs:  12 13 15""")
    assert contains_lines(
        out.getvalue(), """\
  sequence identity: 87.50%
  2 residue(s) missing from PDB chain (1 at start, 0 at end)
  1 gap(s) in chain
  1 mismatches to sequence
    residue IDs:  5""")
    s = easy_pickle.dumps(v)
    seq4 = iotbx.bioinformatics.sequence("")
    try:
        v = validation(pdb_hierarchy=pdb_in2.construct_hierarchy(),
                       sequences=[seq4],
                       log=null_out(),
                       nproc=1,
                       extract_coordinates=True)
    except AssertionError:
        pass
    else:
        raise Exception_expected
    # check that nucleic acid chain doesn't get aligned against protein sequence
    pdb_in = iotbx.pdb.input(source_info=None,
                             lines="""\
ATOM  18932  P  B DG D   1     -12.183  60.531  25.090  0.50364.79           P
ATOM  18963  P  B DG D   2      -9.738  55.258  20.689  0.50278.77           P
ATOM  18994  P  B DA D   3     -10.119  47.855  19.481  0.50355.17           P
ATOM  19025  P  B DT D   4     -13.664  42.707  21.119  0.50237.06           P
ATOM  19056  P  B DG D   5     -19.510  39.821  21.770  0.50255.45           P
ATOM  19088  P  B DA D   6     -26.096  40.001  21.038  0.50437.49           P
ATOM  19120  P  B DC D   7     -31.790  41.189  18.413  0.50210.00           P
ATOM  19149  P  B DG D   8     -34.639  41.306  12.582  0.50313.99           P
ATOM  19179  P  B DA D   9     -34.987  38.244   6.813  0.50158.92           P
ATOM  19210  P  B DT D  10     -32.560  35.160   1.082  0.50181.38           P
HETATM19241  P  BTSP D  11     -27.614  30.137   0.455  0.50508.17           P
""")
    sequences, _ = iotbx.bioinformatics.fasta_sequence_parse.parse(
        """>4GFH:A|PDBID|CHAIN|SEQUENCE
MSTEPVSASDKYQKISQLEHILKRPDTYIGSVETQEQLQWIYDEETDCMIEKNVTIVPGLFKIFDEILVNAADNKVRDPS
MKRIDVNIHAEEHTIEVKNDGKGIPIEIHNKENIYIPEMIFGHLLTSSNYDDDEKKVTGGRNGYGAKLCNIFSTEFILET
ADLNVGQKYVQKWENNMSICHPPKITSYKKGPSYTKVTFKPDLTRFGMKELDNDILGVMRRRVYDINGSVRDINVYLNGK
SLKIRNFKNYVELYLKSLEKKRQLDNGEDGAAKSDIPTILYERINNRWEVAFAVSDISFQQISFVNSIATTMGGTHVNYI
TDQIVKKISEILKKKKKKSVKSFQIKNNMFIFINCLIENPAFTSQTKEQLTTRVKDFGSRCEIPLEYINKIMKTDLATRM
FEIADANEENALKKSDGTRKSRITNYPKLEDANKAGTKEGYKCTLVLTEGDSALSLAVAGLAVVGRDYYGCYPLRGKMLN
VREASADQILKNAEIQAIKKIMGLQHRKKYEDTKSLRYGHLMIMTDQDHDGSHIKGLIINFLESSFPGLLDIQGFLLEFI
TPIIKVSITKPTKNTIAFYNMPDYEKWREEESHKFTWKQKYYKGLGTSLAQEVREYFSNLDRHLKIFHSLQGNDKDYIDL
AFSKKKADDRKEWLRQYEPGTVLDPTLKEIPISDFINKELILFSLADNIRSIPNVLDGFKPGQRKVLYGCFKKNLKSELK
VAQLAPYVSECTAYHHGEQSLAQTIIGLAQNFVGSNNIYLLLPNGAFGTRATGGKDAAAARYIYTELNKLTRKIFHPADD
PLYKYIQEDEKTVEPEWYLPILPMILVNGAEGIGTGWSTYIPPFNPLEIIKNIRHLMNDEELEQMHPWFRGWTGTIEEIE
PLRYRMYGRIEQIGDNVLEITELPARTWTSTIKEYLLLGLSGNDKIKPWIKDMEEQHDDNIKFIITLSPEEMAKTRKIGF
YERFKLISPISLMNMVAFDPHGKIKKYNSVNEILSEFYYVRLEYYQKRKDHMSERLQWEVEKYSFQVKFIKMIIEKELTV
TNKPRNAIIQELENLGFPRFNKEGKPYYGSPNDEIAEQINDVKGATSDEEDEESSHEDTENVINGPEELYGTYEYLLGMR
IWSLTKERYQKLLKQKQEKETELENLLKLSAKDIWNTDLKAFEVGYQEFLQRDAEAR
>4GFH:D|PDBID|CHAIN|SEQUENCE
GGATGACGATX
""")
    v = validation(
        pdb_hierarchy=pdb_in.construct_hierarchy(),
        sequences=sequences,
        log=null_out(),
        nproc=1,
    )
    out = StringIO()
    v.show(out=out)
    assert v.chains[0].n_missing == 0
    assert v.chains[0].n_missing_end == 0
    assert v.chains[0].n_missing_start == 0
    assert len(v.chains[0].alignment.matches()) == 11
    #
    pdb_in = iotbx.pdb.input(source_info=None,
                             lines="""\
ATOM      2  CA  GLY A   1       1.367   0.551   0.300  1.00  7.71           C
ATOM      6  CA  CYS A   2       2.782   3.785   1.683  1.00  5.18           C
ATOM     12  CA  CYS A   3      -0.375   5.128   3.282  1.00  5.21           C
ATOM     18  CA  SER A   4      -0.870   2.048   5.492  1.00  7.19           C
ATOM     25  CA  LEU A   5       2.786   2.056   6.642  1.00  6.78           C
ATOM     33  CA  PRO A   6       3.212   4.746   9.312  1.00  7.03           C
ATOM     40  CA  PRO A   7       6.870   5.690   8.552  1.00  7.97           C
ATOM     47  CA  CYS A   8       6.021   6.070   4.855  1.00  6.48           C
ATOM     53  CA  ALA A   9       2.812   8.041   5.452  1.00  7.15           C
ATOM     58  CA  LEU A  10       4.739  10.382   7.748  1.00  8.36           C
ATOM     66  CA  SER A  11       7.292  11.200   5.016  1.00  7.00           C
ATOM     73  CA  ASN A  12       4.649  11.435   2.264  1.00  5.40           C
ATOM     81  CA  PRO A  13       1.879  13.433   3.968  1.00  5.97           C
ATOM     88  CA  ASP A  14       0.485  15.371   0.986  1.00  7.70           C
ATOM     96  CA  TYR A  15       0.565  12.245  -1.180  1.00  6.55           C
ATOM    108  CA  CYS A  16      -1.466  10.260   1.363  1.00  7.32           C
ATOM    113  N   NH2 A  17      -2.612  12.308   2.058  1.00  8.11           N
""")
    seq = iotbx.bioinformatics.sequence("GCCSLPPCALSNPDYCX")
    # match last residue
    v = validation(
        pdb_hierarchy=pdb_in.construct_hierarchy(),
        sequences=[seq],
        log=null_out(),
        nproc=1,
    )
    out = StringIO()
    v.show(out=out)
    assert v.chains[0].n_missing == 0
    assert v.chains[0].n_missing_end == 0
    assert v.chains[0].n_missing_start == 0
    assert len(v.chains[0].alignment.matches()) == 17
    # ignore non-protein residue
    v = validation(pdb_hierarchy=pdb_in.construct_hierarchy(),
                   sequences=[seq],
                   log=null_out(),
                   nproc=1,
                   ignore_hetatm=True)
    out = StringIO()
    v.show(out=out)
    assert v.chains[0].n_missing == 1
    assert v.chains[0].n_missing_end == 1
    assert v.chains[0].n_missing_start == 0
    assert len(v.chains[0].alignment.matches()) == 17
    #
    pdb_in = iotbx.pdb.input(source_info=None,
                             lines="""\
ATOM   2518  CA  PRO C   3      23.450  -5.848  45.723  1.00 85.24           C
ATOM   2525  CA  GLY C   4      20.066  -4.416  44.815  1.00 79.25           C
ATOM   2529  CA  PHE C   5      19.408  -0.913  46.032  1.00 77.13           C
ATOM   2540  CA  GLY C   6      17.384  -1.466  49.208  1.00 83.44           C
ATOM   2544  CA  GLN C   7      17.316  -5.259  49.606  1.00 89.25           C
ATOM   2553  CA  GLY C   8      19.061  -6.829  52.657  1.00 90.67           C
""")
    sequences, _ = iotbx.bioinformatics.fasta_sequence_parse.parse(
        """>1JN5:A|PDBID|CHAIN|SEQUENCE
MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLVWNGNAVSGQESLSEFFEMLPSSEFQISVVDCQPV
HDEATPSQTTVLVVICGSVKFEGNKQRDFNQNFILTAQASPSNTVWKIASDCFRFQDWAS
>1JN5:B|PDBID|CHAIN|SEQUENCE
APPCKGSYFGTENLKSLVLHFLQQYYAIYDSGDRQGLLDAYHDGACCSLSIPFIPQNPARSSLAEYFKDSRNVKKLKDPT
LRFRLLKHTRLNVVAFLNELPKTQHDVNSFVVDISAQTSTLLCFSVNGVFKEVDGKSRDSLRAFTRTFIAVPASNSGLCI
VNDELFVRNASSEEIQRAFAMPAPTPSSSPVPTLSPEQQEMLQAFSTQSGMNLEWSQKCLQDNNWDYTRSAQAFTHLKAK
GEIPEVAFMK
>1JN5:C|PDBID|CHAIN|SEQUENCE
GQSPGFGQGGSV
""")
    v = validation(
        pdb_hierarchy=pdb_in.construct_hierarchy(),
        sequences=sequences,
        log=null_out(),
        nproc=1,
    )
    out = StringIO()
    v.show(out=out)
    assert v.chains[0].n_missing_start == 3
    assert v.chains[0].n_missing_end == 3
    assert v.chains[0].identity == 1.0
    assert v.chains[0].alignment.match_codes == 'iiimmmmmmiii'
    #
    pdb_in = iotbx.pdb.input(source_info=None,
                             lines="""\
ATOM      2  CA  ALA A   2      -8.453  57.214 -12.754  1.00 52.95           C
ATOM      7  CA  LEU A   3      -8.574  59.274  -9.471  1.00 24.33           C
ATOM     15  CA  ARG A   4     -12.178  60.092  -8.575  1.00 28.40           C
ATOM     26  CA  GLY A   5     -14.170  61.485  -5.667  1.00 26.54           C
ATOM     30  CA  THR A   6     -17.784  60.743  -4.783  1.00 31.78           C
ATOM     37  CA  VAL A   7     -19.080  64.405  -4.464  1.00 21.31           C
""")
    seq = iotbx.bioinformatics.sequence("XALRGTV")
    v = validation(
        pdb_hierarchy=pdb_in.construct_hierarchy(),
        sequences=[seq],
        log=null_out(),
        nproc=1,
    )
    out = StringIO()
    v.show(out=out)
    assert v.chains[0].n_missing_start == 1
    assert v.chains[0].n_missing_end == 0
    assert v.chains[0].identity == 1.0
    assert v.chains[0].alignment.match_codes == 'immmmmm'
    #
    pdb_in = iotbx.pdb.input(source_info=None,
                             lines="""\
ATOM   2171  CA  ASP I 355       5.591 -11.903   1.133  1.00 41.60           C
ATOM   2175  CA  PHE I 356       7.082  -8.454   0.828  1.00 39.82           C
ATOM   2186  CA  GLU I 357       5.814  -6.112  -1.877  1.00 41.12           C
ATOM   2195  CA  GLU I 358       8.623  -5.111  -4.219  1.00 42.70           C
ATOM   2199  CA  ILE I 359      10.346  -1.867  -3.363  1.00 43.32           C
ATOM   2207  CA  PRO I 360      11.658   0.659  -5.880  1.00 44.86           C
ATOM   2214  CA  GLU I 361      14.921  -0.125  -7.592  1.00 44.32           C
ATOM   2219  CA  GLU I 362      15.848   3.489  -6.866  1.00 44.27           C
HETATM 2224  CA  TYS I 363      16.482   2.005  -3.448  1.00 44.52           C
""")
    seq = iotbx.bioinformatics.sequence("NGDFEEIPEEYL")
    v = validation(
        pdb_hierarchy=pdb_in.construct_hierarchy(),
        sequences=[seq],
        log=null_out(),
        nproc=1,
    )
    out = StringIO()
    v.show(out=out)
    assert v.chains[0].n_missing_start == 2
    assert v.chains[0].n_missing_end == 1
    assert v.chains[0].identity == 1.0
    pdb_in = iotbx.pdb.input(source_info=None,
                             lines="""\
ATOM    450  CA  ASN A   1      37.242  41.665  44.160  1.00 35.89           C
ATOM    458  CA  GLY A   2      37.796  38.269  42.523  1.00 30.13           C
HETATM  463  CA AMSE A   3      35.878  39.005  39.326  0.54 22.83           C
HETATM  464  CA BMSE A   3      35.892  39.018  39.323  0.46 22.96           C
ATOM    478  CA  ILE A   4      37.580  38.048  36.061  1.00 22.00           C
ATOM    486  CA  SER A   5      37.593  40.843  33.476  1.00 18.73           C
ATOM    819  CA  ALA A   8      25.982  34.781  27.220  1.00 18.43           C
ATOM    824  CA  ALA A   9      23.292  32.475  28.614  1.00 19.60           C
HETATM  830  CA BMSE A  10      22.793  30.814  25.223  0.41 22.60           C
HETATM  831  CA CMSE A  10      22.801  30.850  25.208  0.59 22.54           C
ATOM    845  CA  GLU A  11      26.504  30.054  24.966  1.00 25.19           C
ATOM    854  CA  GLY A  12      25.907  28.394  28.320  1.00 38.88           C
""")
    seq = iotbx.bioinformatics.sequence("NGMISAAAAMEG")
    v = validation(
        pdb_hierarchy=pdb_in.construct_hierarchy(),
        sequences=[seq],
        log=null_out(),
        nproc=1,
    )
    out = StringIO()
    v.show(out=out)
    assert v.chains[0].alignment.a == 'NGMISXXAAMEG'
    assert v.chains[0].alignment.b == 'NGMISAAAAMEG'
    pdb_in = iotbx.pdb.input(source_info=None,
                             lines="""\
ATOM   4615  CA  ALA C   1       1.000   1.000   1.000  1.00 10.00
ATOM   4622  CA  ALA C   2       1.000   1.000   1.000  1.00 10.00
ATOM   4627  CA  ALA C   3       1.000   1.000   1.000  1.00 10.00
ATOM   4634  CA  ALA C   4       1.000   1.000   1.000  1.00 10.00
ATOM   4646  CA  ALA C   5       1.000   1.000   1.000  1.00 10.00
ATOM   4658  CA  ALA C   6       1.000   1.000   1.000  1.00 10.00
ATOM   4664  CA  ALA C   7       1.000   1.000   1.000  1.00 10.00
ATOM   4669  CA  ALA C   8       1.000   1.000   1.000  1.00 10.00
ATOM   4680  CA  ARG C   9       1.000   1.000   1.000  1.00 10.00
ATOM   4690  CA  GLY C  10       1.000   1.000   1.000  1.00 10.00
ATOM   4698  CA  PRO C  11       1.000   1.000   1.000  1.00 10.00
ATOM   4705  CA  LYS C  12       1.000   1.000   1.000  1.00 10.00
ATOM   4712  CA  TRP C  13       1.000   1.000   1.000  1.00 10.00
ATOM   4726  CA  GLU C  14       1.000   1.000   1.000  1.00 10.00
ATOM   4738  CA  SER C  15       1.000   1.000   1.000  1.00 10.00
ATOM   4744  CA  THR C  16       1.000   1.000   1.000  1.00 10.00
ATOM   4751  CA  GLY C  17       1.000   1.000   1.000  1.00 10.00
ATOM   4755  CA  TYR C  18       1.000   1.000   1.000  1.00 10.00
ATOM   4767  CA  PHE C  19       1.000   1.000   1.000  1.00 10.00
ATOM   4778  CA  ALA C  20       1.000   1.000   1.000  1.00 10.00
ATOM   4786  CA  ALA C  21       1.000   1.000   1.000  1.00 10.00
ATOM   4798  CA  TRP C  22       1.000   1.000   1.000  1.00 10.00
ATOM   4812  CA  GLY C  23       1.000   1.000   1.000  1.00 10.00
ATOM   4816  CA  GLN C  24       1.000   1.000   1.000  1.00 10.00
ATOM   4822  CA  GLY C  25       1.000   1.000   1.000  1.00 10.00
ATOM   4826  CA  THR C  26       1.000   1.000   1.000  1.00 10.00
ATOM   4833  CA  LEU C  27       1.000   1.000   1.000  1.00 10.00
ATOM   4841  CA  VAL C  28       1.000   1.000   1.000  1.00 10.00
ATOM   4848  CA  THR C  29       1.000   1.000   1.000  1.00 10.00
ATOM   4855  CA  VAL C  30       1.000   1.000   1.000  1.00 10.00
ATOM   4862  CA  SER C  31       1.000   1.000   1.000  1.00 10.00
ATOM   4868  CA  SER C  32       1.000   1.000   1.000  1.00 10.00
END
""")
    seq = iotbx.bioinformatics.sequence(
        "AAAAAAAARGKWESPAALLKKAAWCSGTLVTVSSASAPKWKSTSGCYFAAPWNKRALRVTVLQSS")
    v = validation(
        pdb_hierarchy=pdb_in.construct_hierarchy(),
        sequences=[seq],
        log=null_out(),
        nproc=1,
    )
    out = StringIO()
    v.show(out=out)

    # check that shortest matching sequence is chosen
    # example from 6H4N, chain a, and I
    sequences, _ = iotbx.bioinformatics.fasta_sequence_parse.parse("""\
>6H4N:a|PDBID|CHAIN|SEQUENCE
AAUUGAAGAGUUUGAUCAUGGCUCAGAUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAACGGUAACAGGAAGAA
GCUUGCUUCUUUGCUGACGAGUGGCGGACGGGUGAGUAAUGUCUGGGAAACUGCCUGAUGGAGGGGGAUAACUACUGGAA
ACGGUAGCUAAUACCGCAUAACGUCGCAAGACCAAAGAGGGGGACCUUCGGGCCUCUUGCCAUCGGAUGUGCCCAGAUGG
GAUUAGCUAGUAGGUGGGGUAACGGCUCACCUAGGCGACGAUCCCUAGCUGGUCUGAGAGGAUGACCAGCCACACUGGAA
CUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGCACAAUGGGCGCAAGCCUGAUGCAGCCAUGCC
GCGUGUAUGAAGAAGGCCUUCGGGUUGUAAAGUACUUUCAGCGGGGAGGAAGGGAGUAAAGUUAAUACCUUUGCUCAUUG
ACGUUACCCGCAGAAGAAGCACCGGCUAACUCCGUGCCAGCAGCCGCGGUAAUACGGAGGGUGCAAGCGUUAAUCGGAAU
UACUGGGCGUAAAGCGCACGCAGGCGGUUUGUUAAGUCAGAUGUGAAAUCCCCGGGCUCAACCUGGGAACUGCAUCUGAU
ACUGGCAAGCUUGAGUCUCGUAGAGGGGGGUAGAAUUCCAGGUGUAGCGGUGAAAUGCGUAGAGAUCUGGAGGAAUACCG
GUGGCGAAGGCGGCCCCCUGGACGAAGACUGACGCUCAGGUGCGAAAGCGUGGGGAGCAAACAGGAUUAGAUACCCUGGU
AGUCCACGCCGUAAACGAUGUCGACUUGGAGGUUGUGCCCUUGAGGCGUGGCUUCCGGAGCUAACGCGUUAAGUCGACCG
CCUGGGGAGUACGGCCGCAAGGUUAAAACUCAAAUGAAUUGACGGGGGCCCGCACAAGCGGUGGAGCAUGUGGUUUAAUU
CGAUGCAACGCGAAGAACCUUACCUGGUCUUGACAUCCACGGAAGUUUUCAGAGAUGAGAAUGUGCCUUCGGGAACCGUG
AGACAGGUGCUGCAUGGCUGUCGUCAGCUCGUGUUGUGAAAUGUUGGGUUAAGUCCCGCAACGAGCGCAACCCUUAUCCU
UUGUUGCCAGCGGUCCGGCCGGGAACUCAAAGGAGACUGCCAGUGAUAAACUGGAGGAAGGUGGGGAUGACGUCAAGUCA
UCAUGGCCCUUACGACCAGGGCUACACACGUGCUACAAUGGCGCAUACAAAGAGAAGCGACCUCGCGAGAGCAAGCGGAC
CUCAUAAAGUGCGUCGUAGUCCGGAUUGGAGUCUGCAACUCGACUCCAUGAAGUCGGAAUCGCUAGUAAUCGUGGAUCAG
AAUGCCACGGUGAAUACGUUCCCGGGCCUUGUACACACCGCCCGUCACACCAUGGGAGUGGGUUGCAAAAGAAGUAGGUA
GCUUAACCUUCGGGAGGGCGCUUACCACUUUGUGAUUCAUGACUGGGGUGAAGUCGUAACAAGGUAACCGUAGGGGAACC
UGCGGUUGGAUCAC
>6H4N:I|PDBID|CHAIN|SEQUENCE
CUCCU
""")
    pdb_in = iotbx.pdb.input(source_info=None,
                             lines="""\
ATOM  95502  P     C I1536     211.989 143.717 147.208  1.00 16.47           P
ATOM  95503  OP1   C I1536     213.292 143.696 146.494  1.00 16.47           O
ATOM  95504  OP2   C I1536     211.250 144.996 147.359  1.00 16.47           O
ATOM  95505  O5'   C I1536     211.021 142.666 146.541  1.00 16.47           O
ATOM  95506  C5'   C I1536     211.671 141.536 146.021  1.00 16.47           C
ATOM  95507  C4'   C I1536     211.059 140.260 146.502  1.00 16.47           C
ATOM  95508  O4'   C I1536     209.764 140.432 147.128  1.00 16.47           O
ATOM  95509  C3'   C I1536     210.818 139.353 145.303  1.00 16.47           C
ATOM  95510  O3'   C I1536     211.011 137.993 145.604  1.00 16.47           O
ATOM  95511  C2'   C I1536     209.372 139.646 144.938  1.00 16.47           C
ATOM  95512  O2'   C I1536     208.735 138.572 144.276  1.00 16.47           O
ATOM  95513  C1'   C I1536     208.757 139.866 146.316  1.00 16.47           C
ATOM  95514  N1    C I1536     207.618 140.788 146.322  1.00 16.47           N
ATOM  95515  C2    C I1536     206.610 140.626 145.378  1.00 16.47           C
ATOM  95516  O2    C I1536     206.712 139.721 144.535  1.00 16.47           O
ATOM  95517  N3    C I1536     205.560 141.463 145.396  1.00 16.47           N
ATOM  95518  C4    C I1536     205.492 142.420 146.320  1.00 16.47           C
ATOM  95519  N4    C I1536     204.429 143.227 146.302  1.00 16.47           N
ATOM  95520  C5    C I1536     206.496 142.595 147.306  1.00 16.47           C
ATOM  95521  C6    C I1536     207.522 141.754 147.283  1.00 16.47           C
ATOM  95522  P     U I1537     212.458 137.366 145.505  1.00 11.96           P
ATOM  95523  OP1   U I1537     212.292 135.894 145.567  1.00 11.96           O
ATOM  95524  OP2   U I1537     213.344 138.045 146.479  1.00 11.96           O
ATOM  95525  O5'   U I1537     212.962 137.720 144.038  1.00 11.96           O
ATOM  95526  C5'   U I1537     214.363 137.934 143.772  1.00 11.96           C
ATOM  95527  C4'   U I1537     214.522 138.678 142.472  1.00 11.96           C
ATOM  95528  O4'   U I1537     213.714 137.951 141.515  1.00 11.96           O
ATOM  95529  C3'   U I1537     213.970 140.098 142.549  1.00 11.96           C
ATOM  95530  O3'   U I1537     214.924 141.159 142.799  1.00 11.96           O
ATOM  95531  C2'   U I1537     212.939 140.210 141.413  1.00 11.96           C
ATOM  95532  O2'   U I1537     212.980 141.292 140.508  1.00 11.96           O
ATOM  95533  C1'   U I1537     212.990 138.848 140.714  1.00 11.96           C
ATOM  95534  N1    U I1537     211.632 138.324 140.509  1.00 11.96           N
ATOM  95535  C2    U I1537     211.212 138.082 139.216  1.00 11.96           C
ATOM  95536  O2    U I1537     211.943 138.228 138.252  1.00 11.96           O
ATOM  95537  N3    U I1537     209.897 137.730 139.076  1.00 11.96           N
ATOM  95538  C4    U I1537     208.966 137.602 140.074  1.00 11.96           C
ATOM  95539  O4    U I1537     207.834 137.203 139.798  1.00 11.96           O
ATOM  95540  C5    U I1537     209.473 137.843 141.382  1.00 11.96           C
ATOM  95541  C6    U I1537     210.749 138.206 141.544  1.00 11.96           C
ATOM  95542  P     C I1538     216.031 141.722 141.738  1.00 11.10           P
ATOM  95543  OP1   C I1538     216.814 142.772 142.428  1.00 11.10           O
ATOM  95544  OP2   C I1538     215.385 142.057 140.453  1.00 11.10           O
ATOM  95545  O5'   C I1538     217.081 140.541 141.538  1.00 11.10           O
ATOM  95546  C5'   C I1538     218.494 140.848 141.429  1.00 11.10           C
ATOM  95547  C4'   C I1538     218.962 140.916 139.986  1.00 11.10           C
ATOM  95548  O4'   C I1538     218.034 140.280 139.091  1.00 11.10           O
ATOM  95549  C3'   C I1538     219.276 142.298 139.408  1.00 11.10           C
ATOM  95550  O3'   C I1538     220.629 142.126 139.044  1.00 11.10           O
ATOM  95551  C2'   C I1538     218.657 142.315 138.005  1.00 11.10           C
ATOM  95552  O2'   C I1538     219.358 142.774 136.857  1.00 11.10           O
ATOM  95553  C1'   C I1538     218.164 140.883 137.832  1.00 11.10           C
ATOM  95554  N1    C I1538     216.943 140.702 137.064  1.00 11.10           N
ATOM  95555  C2    C I1538     217.041 140.096 135.813  1.00 11.10           C
ATOM  95556  O2    C I1538     218.163 139.770 135.401  1.00 11.10           O
ATOM  95557  N3    C I1538     215.932 139.850 135.093  1.00 11.10           N
ATOM  95558  C4    C I1538     214.748 140.195 135.580  1.00 11.10           C
ATOM  95559  N4    C I1538     213.670 139.968 134.827  1.00 11.10           N
ATOM  95560  C5    C I1538     214.617 140.827 136.842  1.00 11.10           C
ATOM  95561  C6    C I1538     215.722 141.024 137.566  1.00 11.10           C
ATOM  95562  P     C I1539     221.798 142.624 139.940  1.00 17.77           P
ATOM  95563  OP1   C I1539     221.300 143.669 140.865  1.00 17.77           O
ATOM  95564  OP2   C I1539     222.961 142.899 139.061  1.00 17.77           O
ATOM  95565  O5'   C I1539     222.148 141.341 140.812  1.00 17.77           O
ATOM  95566  C5'   C I1539     223.493 140.934 140.997  1.00 17.77           C
ATOM  95567  C4'   C I1539     223.633 139.444 140.845  1.00 17.77           C
ATOM  95568  O4'   C I1539     222.661 138.972 139.877  1.00 17.77           O
ATOM  95569  C3'   C I1539     224.967 138.959 140.300  1.00 17.77           C
ATOM  95570  O3'   C I1539     225.970 138.853 141.295  1.00 17.77           O
ATOM  95571  C2'   C I1539     224.602 137.629 139.658  1.00 17.77           C
ATOM  95572  O2'   C I1539     224.482 136.616 140.642  1.00 17.77           O
ATOM  95573  C1'   C I1539     223.209 137.924 139.109  1.00 17.77           C
ATOM  95574  N1    C I1539     223.219 138.333 137.681  1.00 17.77           N
ATOM  95575  C2    C I1539     223.353 137.370 136.683  1.00 17.77           C
ATOM  95576  O2    C I1539     223.476 136.178 136.982  1.00 17.77           O
ATOM  95577  N3    C I1539     223.342 137.742 135.392  1.00 17.77           N
ATOM  95578  C4    C I1539     223.202 139.017 135.059  1.00 17.77           C
ATOM  95579  N4    C I1539     223.202 139.332 133.762  1.00 17.77           N
ATOM  95580  C5    C I1539     223.059 140.033 136.041  1.00 17.77           C
ATOM  95581  C6    C I1539     223.067 139.642 137.318  1.00 17.77           C
ATOM  95582  P     U I1540     227.517 139.071 140.915  1.00 25.44           P
ATOM  95583  OP1   U I1540     228.321 138.910 142.156  1.00 25.44           O
ATOM  95584  OP2   U I1540     227.626 140.309 140.102  1.00 25.44           O
ATOM  95585  O5'   U I1540     227.868 137.833 139.978  1.00 25.44           O
ATOM  95586  C5'   U I1540     228.014 136.524 140.520  1.00 25.44           C
ATOM  95587  C4'   U I1540     228.308 135.503 139.447  1.00 25.44           C
ATOM  95588  O4'   U I1540     227.513 135.808 138.268  1.00 25.44           O
ATOM  95589  C3'   U I1540     229.761 135.445 138.980  1.00 25.44           C
ATOM  95590  O3'   U I1540     230.104 134.098 138.659  1.00 25.44           O
ATOM  95591  C2'   U I1540     229.740 136.281 137.705  1.00 25.44           C
ATOM  95592  O2'   U I1540     230.767 135.976 136.785  1.00 25.44           O
ATOM  95593  C1'   U I1540     228.360 135.950 137.145  1.00 25.44           C
ATOM  95594  N1    U I1540     227.809 136.996 136.268  1.00 25.44           N
ATOM  95595  C2    U I1540     227.053 136.589 135.186  1.00 25.44           C
ATOM  95596  O2    U I1540     226.815 135.418 134.956  1.00 25.44           O
ATOM  95597  N3    U I1540     226.574 137.600 134.393  1.00 25.44           N
ATOM  95598  C4    U I1540     226.781 138.951 134.566  1.00 25.44           C
ATOM  95599  O4    U I1540     226.286 139.746 133.765  1.00 25.44           O
ATOM  95600  C5    U I1540     227.583 139.293 135.701  1.00 25.44           C
ATOM  95601  C6    U I1540     228.061 138.329 136.493  1.00 25.44           C
END
""")
    v = validation(
        pdb_hierarchy=pdb_in.construct_hierarchy(),
        sequences=sequences,
        log=null_out(),
        nproc=1,
    )
    assert (v.chains[0].get_alignment() == ['CUCCU', 'CUCCU'])

    # all tests below here have additional dependencies
    if (not libtbx.env.has_module("ksdssp")):
        print("Skipping advanced tests (require ksdssp module)")
        return
    pdb_file = libtbx.env.find_in_repositories(
        relative_path="phenix_regression/pdb/1ywf.pdb", test=os.path.isfile)
    if (pdb_file is not None):
        seq = iotbx.bioinformatics.sequence(
            "MGSSHHHHHHSSGLVPRGSHMAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRRLGITDVADLRSSREVARRGPGRVPDGIDVHLLPFPDLADDDADDSAPHETAFKRLLTNDGSNGESGESSQSINDAATRYMTDEYRQFPTRNGAQRALHRVVTLLAAGRPVLTHCFAGKDRTGFVVALVLEAVGLDRDVIVADYLRSNDSVPQLRARISEMIQQRFDTELAPEVVTFTKARLSDGVLGVRAEYLAAARQTIDETYGSLGGYLRDAGISQATVNRMRGVLLG"
        )
        pdb_in = file_reader.any_file(pdb_file, force_type="pdb")
        hierarchy = pdb_in.file_object.hierarchy
        v = validation(pdb_hierarchy=hierarchy,
                       sequences=[seq],
                       log=null_out(),
                       nproc=1,
                       include_secondary_structure=True,
                       extract_coordinates=True)
        out = StringIO()
        v.show(out=out)
        aln1, aln2, ss = v.chains[0].get_alignment(include_sec_str=True)
        assert ("HHH" in ss) and ("LLL" in ss) and ("---" in ss)
        cif_block = v.sequence_as_cif_block()
        assert cif_block[
            '_struct_ref.pdbx_seq_one_letter_code'] == seq.sequence
        # assert list(
        #   cif_block['_struct_ref_seq.pdbx_auth_seq_align_beg']) == ['4', '117']
        # assert list(
        #   cif_block['_struct_ref_seq.pdbx_auth_seq_align_end']) == ['85', '275']
        # assert list(cif_block['_struct_ref_seq.seq_align_beg']) == ['1', '114']
        # assert list(cif_block['_struct_ref_seq.seq_align_end']) == ['82', '272']
        # determine relative counts of sequences and chains
        n_seq = get_sequence_n_copies(pdb_hierarchy=hierarchy,
                                      sequences=[seq] * 4,
                                      copies_from_xtriage=4,
                                      out=null_out())
        assert (n_seq == 1)
        hierarchy = hierarchy.deep_copy()
        chain2 = hierarchy.only_model().chains()[0].detached_copy()
        hierarchy.only_model().append_chain(chain2)
        n_seq = get_sequence_n_copies(pdb_hierarchy=hierarchy,
                                      sequences=[seq] * 4,
                                      copies_from_xtriage=2,
                                      out=null_out())
        assert (n_seq == 1)
        n_seq = get_sequence_n_copies(pdb_hierarchy=hierarchy,
                                      sequences=[seq],
                                      copies_from_xtriage=2,
                                      out=null_out())
        assert (n_seq == 4)
        try:
            n_seq = get_sequence_n_copies(pdb_hierarchy=hierarchy,
                                          sequences=[seq] * 3,
                                          copies_from_xtriage=2,
                                          out=null_out())
        except Sorry as s:
            assert ("round number" in str(s))
        else:
            raise Exception_expected
        n_seq = get_sequence_n_copies(pdb_hierarchy=hierarchy,
                                      sequences=[seq] * 3,
                                      copies_from_xtriage=2,
                                      force_accept_composition=True,
                                      out=null_out())
        assert (n_seq == 1)
        try:
            n_seq = get_sequence_n_copies(pdb_hierarchy=hierarchy,
                                          sequences=[seq] * 4,
                                          copies_from_xtriage=1,
                                          out=null_out())
        except Sorry as s:
            assert ("less than" in str(s))
        else:
            raise Exception_expected
        n_seq = get_sequence_n_copies(
            pdb_hierarchy=hierarchy,
            sequences=[seq] * 4,
            copies_from_xtriage=1,
            assume_xtriage_copies_from_sequence_file=True,
            out=null_out())
        assert (n_seq == 0.5)
        hierarchy = hierarchy.deep_copy()
        chain2 = hierarchy.only_model().chains()[0].detached_copy()
        hierarchy.only_model().append_chain(chain2)
        try:
            n_seq = get_sequence_n_copies(pdb_hierarchy=hierarchy,
                                          sequences=[seq] * 2,
                                          copies_from_xtriage=2,
                                          out=null_out())
        except Sorry as s:
            assert ("round number" in str(s))
        else:
            raise Exception_expected
        n_seq = get_sequence_n_copies(pdb_hierarchy=hierarchy,
                                      sequences=[seq],
                                      copies_from_xtriage=1,
                                      out=null_out())
        assert (n_seq == 3)
        hierarchy = hierarchy.deep_copy()
        chain2 = hierarchy.only_model().chains()[0].detached_copy()
        hierarchy.only_model().append_chain(chain2)
        n_seq = get_sequence_n_copies(pdb_hierarchy=hierarchy,
                                      sequences=[seq] * 2,
                                      copies_from_xtriage=2,
                                      out=null_out())
        assert (n_seq == 4)
        # now with files as input
        seq_file = "tmp_mmtbx_validation_sequence.fa"
        open(seq_file, "w").write(">1ywf\n%s" % seq.sequence)
        n_seq = get_sequence_n_copies_from_files(pdb_file=pdb_file,
                                                 seq_file=seq_file,
                                                 copies_from_xtriage=4,
                                                 out=null_out())
        try:
            assert (n_seq == 4)
        finally:
            os.remove(seq_file)
def exercise () :
  import libtbx.utils
  if (libtbx.utils.detect_multiprocessing_problem() is not None) :
    print "multiprocessing not available, skipping this test"
    return
  if (os.name == "nt"):
    print "easy_mp fixed_func not supported under Windows, skipping this test"
    return
  from mmtbx.validation.sequence import validation, get_sequence_n_copies, \
    get_sequence_n_copies_from_files
  import iotbx.bioinformatics
  import iotbx.pdb
  from iotbx import file_reader
  import libtbx.load_env # import dependency
  from libtbx.test_utils import Exception_expected, contains_lines, approx_equal
  from cStringIO import StringIO
  pdb_in = iotbx.pdb.input(source_info=None, lines="""\
ATOM      2  CA  ARG A  10      -6.299  36.344   7.806  1.00 55.20           C
ATOM     25  CA  TYR A  11      -3.391  33.962   7.211  1.00 40.56           C
ATOM     46  CA  ALA A  12      -0.693  34.802   4.693  1.00 67.95           C
ATOM     56  CA  ALA A  13       0.811  31.422   3.858  1.00 57.97           C
ATOM     66  CA  GLY A  14       4.466  31.094   2.905  1.00 49.24           C
ATOM     73  CA  ALA A  15       7.163  28.421   2.671  1.00 54.70           C
ATOM     83  CA  ILE A  16       6.554  24.685   2.957  1.00 51.79           C
ATOM    102  CA  LEU A  17       7.691  23.612   6.406  1.00 42.30           C
ATOM    121  CA  PTY A  18       7.292  19.882   5.861  1.00 36.68           C
ATOM    128  CA  PHE A  19       5.417  16.968   4.327  1.00 44.99           C
ATOM    148  CA  GLY A  20       3.466  14.289   6.150  1.00 41.99           C
ATOM    155  CA  GLY A  21       1.756  11.130   4.965  1.00 35.77           C
ATOM    190  CA  ALA A  24       1.294  19.658   3.683  1.00 47.02           C
ATOM    200  CA  VAL A  24A      2.361  22.009   6.464  1.00 37.13           C
ATOM    216  CA  HIS A  25       2.980  25.633   5.535  1.00 42.52           C
ATOM    234  CA  LEU A  26       4.518  28.425   7.577  1.00 47.63           C
ATOM    253  CA  ALA A  27       2.095  31.320   7.634  1.00 38.61           C
ATOM    263  CA  ARG A  28       1.589  34.719   9.165  1.00 37.04           C
END""")
  seq1 = iotbx.bioinformatics.sequence("MTTPSHLSDRYELGEILGFGGMSEVHLARD".lower())
  v = validation(
    pdb_hierarchy=pdb_in.construct_hierarchy(),
    sequences=[seq1],
    log=null_out(),
    nproc=1)
  out = StringIO()
  v.show(out=out)
  assert contains_lines(out.getvalue(), """\
  sequence identity: 76.47%
  13 residue(s) missing from PDB chain (9 at start, 1 at end)
  2 gap(s) in chain
  4 mismatches to sequence
    residue IDs:  12 13 15 24""")
  cif_block = v.as_cif_block()
  assert list(cif_block['_struct_ref.pdbx_seq_one_letter_code']) == [
    'MTTPSHLSDRYELGEILGFGGMSEVHLARD']
  assert approx_equal(cif_block['_struct_ref_seq.pdbx_auth_seq_align_beg'],
                      ['10', '14', '16', '19', '24'])
  assert approx_equal(cif_block['_struct_ref_seq.pdbx_auth_seq_align_end'],
                      ['11', '14', '17', '21', '28'])
  assert approx_equal(cif_block['_struct_ref_seq.db_align_beg'],
                      ['10', '14', '16', '19', '25'])
  assert approx_equal(cif_block['_struct_ref_seq.db_align_end'],
                      ['11', '14', '17', '21', '29'])
  assert cif_block['_struct_ref_seq.pdbx_seq_align_beg_ins_code'][4] == 'A'
  seq2 = iotbx.bioinformatics.sequence("MTTPSHLSDRYELGEILGFGGMSEVHLA")
  v = validation(
    pdb_hierarchy=pdb_in.construct_hierarchy(),
    sequences=[seq2],
    log=null_out(),
    nproc=1)
  out = StringIO()
  v.show(out=out)
  assert contains_lines(out.getvalue(), """\
  1 residues not found in sequence
    residue IDs:  28""")
  try :
    v = validation(
      pdb_hierarchy=pdb_in.construct_hierarchy(),
      sequences=[],
      log=null_out(),
      nproc=1)
  except AssertionError :
    pass
  else :
    raise Exception_expected
  cif_block = v.as_cif_block()
  assert list(cif_block['_struct_ref.pdbx_seq_one_letter_code']) == [
    'MTTPSHLSDRYELGEILGFGGMSEVHLA-']
  assert approx_equal(cif_block['_struct_ref_seq.pdbx_auth_seq_align_end'],
                      ['11', '14', '17', '21', '27'])
  assert approx_equal(cif_block['_struct_ref_seq.db_align_end'],
                      ['11', '14', '17', '21', '28'])
  #
  pdb_in2 = iotbx.pdb.input(source_info=None, lines="""\
ATOM      2  CA  ARG A  10      -6.299  36.344   7.806  1.00 55.20           C
ATOM     25  CA  TYR A  11      -3.391  33.962   7.211  1.00 40.56           C
ATOM     46  CA  ALA A  12      -0.693  34.802   4.693  1.00 67.95           C
ATOM     56  CA  ALA A  13       0.811  31.422   3.858  1.00 57.97           C
ATOM     66  CA  GLY A  14       4.466  31.094   2.905  1.00 49.24           C
ATOM     73  CA  ALA A  15       7.163  28.421   2.671  1.00 54.70           C
ATOM     83  CA  ILE A  16       6.554  24.685   2.957  1.00 51.79           C
ATOM    102  CA  LEU A  17       7.691  23.612   6.406  1.00 42.30           C
TER
ATOM   1936  P     G B   2     -22.947 -23.615  15.323  1.00123.20           P
ATOM   1959  P     C B   3     -26.398 -26.111  19.062  1.00110.06           P
ATOM   1979  P     U B   4     -29.512 -30.638  21.164  1.00101.06           P
ATOM   1999  P     C B   5     -30.524 -36.109  21.527  1.00 92.76           P
ATOM   2019  P     U B   6     -28.684 -41.458  21.223  1.00 87.42           P
ATOM   2062  P     G B   8     -18.396 -45.415  21.903  1.00 80.35           P
ATOM   2085  P     A B   9     -13.852 -43.272  24.156  1.00 77.76           P
ATOM   2107  P     G B  10      -8.285 -44.242  26.815  1.00 79.86           P
END
""")
  seq3 = iotbx.bioinformatics.sequence("AGCUUUGGAG")
  v = validation(
    pdb_hierarchy=pdb_in2.construct_hierarchy(),
    sequences=[seq2,seq3],
    log=null_out(),
    nproc=1,
    extract_coordinates=True)
  out = StringIO()
  v.show(out=out)
  cif_block = v.as_cif_block()
  assert approx_equal(cif_block['_struct_ref.pdbx_seq_one_letter_code'],
                      ['MTTPSHLSDRYELGEILGFGGMSEVHLA', 'AGCUUUGGAG'])
  assert approx_equal(cif_block['_struct_ref_seq.pdbx_auth_seq_align_beg'],
                      ['10', '14', '16', '2', '6', '8'])
  assert approx_equal(cif_block['_struct_ref_seq.pdbx_auth_seq_align_end'],
                      ['11', '14', '17', '4', '6', '10'])
  assert (len(v.chains[0].get_outliers_table()) == 3)
  assert (len(v.get_table_data()) == 4)
  assert approx_equal(
    v.chains[0].get_mean_coordinate_for_alignment_range(11,11),
    (-0.693, 34.802, 4.693))
  assert approx_equal(
    v.chains[0].get_mean_coordinate_for_alignment_range(11,14),
    (2.93675, 31.43475, 3.53175))
  assert (v.chains[0].get_highlighted_residues() == [11,12,14])
  assert contains_lines(out.getvalue(), """\
  3 mismatches to sequence
    residue IDs:  12 13 15""")
  assert contains_lines(out.getvalue(), """\
  sequence identity: 87.50%
  2 residue(s) missing from PDB chain (1 at start, 0 at end)
  1 gap(s) in chain
  1 mismatches to sequence
    residue IDs:  5""")
  s = easy_pickle.dumps(v)
  seq4 = iotbx.bioinformatics.sequence("")
  try :
    v = validation(
      pdb_hierarchy=pdb_in2.construct_hierarchy(),
      sequences=[seq4],
      log=null_out(),
      nproc=1,
      extract_coordinates=True)
  except AssertionError :
    pass
  else :
    raise Exception_expected
  # check that nucleic acid chain doesn't get aligned against protein sequence
  pdb_in = iotbx.pdb.input(source_info=None, lines="""\
ATOM  18932  P  B DG D   1     -12.183  60.531  25.090  0.50364.79           P
ATOM  18963  P  B DG D   2      -9.738  55.258  20.689  0.50278.77           P
ATOM  18994  P  B DA D   3     -10.119  47.855  19.481  0.50355.17           P
ATOM  19025  P  B DT D   4     -13.664  42.707  21.119  0.50237.06           P
ATOM  19056  P  B DG D   5     -19.510  39.821  21.770  0.50255.45           P
ATOM  19088  P  B DA D   6     -26.096  40.001  21.038  0.50437.49           P
ATOM  19120  P  B DC D   7     -31.790  41.189  18.413  0.50210.00           P
ATOM  19149  P  B DG D   8     -34.639  41.306  12.582  0.50313.99           P
ATOM  19179  P  B DA D   9     -34.987  38.244   6.813  0.50158.92           P
ATOM  19210  P  B DT D  10     -32.560  35.160   1.082  0.50181.38           P
HETATM19241  P  BTSP D  11     -27.614  30.137   0.455  0.50508.17           P
""")
  sequences, _ = iotbx.bioinformatics.fasta_sequence_parse.parse(
    """>4GFH:A|PDBID|CHAIN|SEQUENCE
MSTEPVSASDKYQKISQLEHILKRPDTYIGSVETQEQLQWIYDEETDCMIEKNVTIVPGLFKIFDEILVNAADNKVRDPS
MKRIDVNIHAEEHTIEVKNDGKGIPIEIHNKENIYIPEMIFGHLLTSSNYDDDEKKVTGGRNGYGAKLCNIFSTEFILET
ADLNVGQKYVQKWENNMSICHPPKITSYKKGPSYTKVTFKPDLTRFGMKELDNDILGVMRRRVYDINGSVRDINVYLNGK
SLKIRNFKNYVELYLKSLEKKRQLDNGEDGAAKSDIPTILYERINNRWEVAFAVSDISFQQISFVNSIATTMGGTHVNYI
TDQIVKKISEILKKKKKKSVKSFQIKNNMFIFINCLIENPAFTSQTKEQLTTRVKDFGSRCEIPLEYINKIMKTDLATRM
FEIADANEENALKKSDGTRKSRITNYPKLEDANKAGTKEGYKCTLVLTEGDSALSLAVAGLAVVGRDYYGCYPLRGKMLN
VREASADQILKNAEIQAIKKIMGLQHRKKYEDTKSLRYGHLMIMTDQDHDGSHIKGLIINFLESSFPGLLDIQGFLLEFI
TPIIKVSITKPTKNTIAFYNMPDYEKWREEESHKFTWKQKYYKGLGTSLAQEVREYFSNLDRHLKIFHSLQGNDKDYIDL
AFSKKKADDRKEWLRQYEPGTVLDPTLKEIPISDFINKELILFSLADNIRSIPNVLDGFKPGQRKVLYGCFKKNLKSELK
VAQLAPYVSECTAYHHGEQSLAQTIIGLAQNFVGSNNIYLLLPNGAFGTRATGGKDAAAARYIYTELNKLTRKIFHPADD
PLYKYIQEDEKTVEPEWYLPILPMILVNGAEGIGTGWSTYIPPFNPLEIIKNIRHLMNDEELEQMHPWFRGWTGTIEEIE
PLRYRMYGRIEQIGDNVLEITELPARTWTSTIKEYLLLGLSGNDKIKPWIKDMEEQHDDNIKFIITLSPEEMAKTRKIGF
YERFKLISPISLMNMVAFDPHGKIKKYNSVNEILSEFYYVRLEYYQKRKDHMSERLQWEVEKYSFQVKFIKMIIEKELTV
TNKPRNAIIQELENLGFPRFNKEGKPYYGSPNDEIAEQINDVKGATSDEEDEESSHEDTENVINGPEELYGTYEYLLGMR
IWSLTKERYQKLLKQKQEKETELENLLKLSAKDIWNTDLKAFEVGYQEFLQRDAEAR
>4GFH:D|PDBID|CHAIN|SEQUENCE
GGATGACGATX
""")
  v = validation(
    pdb_hierarchy=pdb_in.construct_hierarchy(),
    sequences=sequences,
    log=null_out(),
    nproc=1,)
  out = StringIO()
  v.show(out=out)
  assert v.chains[0].n_missing == 0
  assert v.chains[0].n_missing_end == 0
  assert v.chains[0].n_missing_start == 0
  assert len(v.chains[0].alignment.matches()) == 11
  #
  pdb_in = iotbx.pdb.input(source_info=None, lines="""\
ATOM      2  CA  GLY A   1       1.367   0.551   0.300  1.00  7.71           C
ATOM      6  CA  CYS A   2       2.782   3.785   1.683  1.00  5.18           C
ATOM     12  CA  CYS A   3      -0.375   5.128   3.282  1.00  5.21           C
ATOM     18  CA  SER A   4      -0.870   2.048   5.492  1.00  7.19           C
ATOM     25  CA  LEU A   5       2.786   2.056   6.642  1.00  6.78           C
ATOM     33  CA  PRO A   6       3.212   4.746   9.312  1.00  7.03           C
ATOM     40  CA  PRO A   7       6.870   5.690   8.552  1.00  7.97           C
ATOM     47  CA  CYS A   8       6.021   6.070   4.855  1.00  6.48           C
ATOM     53  CA  ALA A   9       2.812   8.041   5.452  1.00  7.15           C
ATOM     58  CA  LEU A  10       4.739  10.382   7.748  1.00  8.36           C
ATOM     66  CA  SER A  11       7.292  11.200   5.016  1.00  7.00           C
ATOM     73  CA  ASN A  12       4.649  11.435   2.264  1.00  5.40           C
ATOM     81  CA  PRO A  13       1.879  13.433   3.968  1.00  5.97           C
ATOM     88  CA  ASP A  14       0.485  15.371   0.986  1.00  7.70           C
ATOM     96  CA  TYR A  15       0.565  12.245  -1.180  1.00  6.55           C
ATOM    108  CA  CYS A  16      -1.466  10.260   1.363  1.00  7.32           C
ATOM    113  N   NH2 A  17      -2.612  12.308   2.058  1.00  8.11           N
""")
  seq = iotbx.bioinformatics.sequence("GCCSLPPCALSNPDYCX")
  v = validation(
    pdb_hierarchy=pdb_in.construct_hierarchy(),
    sequences=[seq],
    log=null_out(),
    nproc=1,)
  out = StringIO()
  v.show(out=out)
  assert v.chains[0].n_missing == 0
  assert v.chains[0].n_missing_end == 0
  assert v.chains[0].n_missing_start == 0
  assert len(v.chains[0].alignment.matches()) == 17
  #
  pdb_in = iotbx.pdb.input(source_info=None, lines="""\
ATOM   2518  CA  PRO C   3      23.450  -5.848  45.723  1.00 85.24           C
ATOM   2525  CA  GLY C   4      20.066  -4.416  44.815  1.00 79.25           C
ATOM   2529  CA  PHE C   5      19.408  -0.913  46.032  1.00 77.13           C
ATOM   2540  CA  GLY C   6      17.384  -1.466  49.208  1.00 83.44           C
ATOM   2544  CA  GLN C   7      17.316  -5.259  49.606  1.00 89.25           C
ATOM   2553  CA  GLY C   8      19.061  -6.829  52.657  1.00 90.67           C
""")
  sequences, _ = iotbx.bioinformatics.fasta_sequence_parse.parse(
    """>1JN5:A|PDBID|CHAIN|SEQUENCE
MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLVWNGNAVSGQESLSEFFEMLPSSEFQISVVDCQPV
HDEATPSQTTVLVVICGSVKFEGNKQRDFNQNFILTAQASPSNTVWKIASDCFRFQDWAS
>1JN5:B|PDBID|CHAIN|SEQUENCE
APPCKGSYFGTENLKSLVLHFLQQYYAIYDSGDRQGLLDAYHDGACCSLSIPFIPQNPARSSLAEYFKDSRNVKKLKDPT
LRFRLLKHTRLNVVAFLNELPKTQHDVNSFVVDISAQTSTLLCFSVNGVFKEVDGKSRDSLRAFTRTFIAVPASNSGLCI
VNDELFVRNASSEEIQRAFAMPAPTPSSSPVPTLSPEQQEMLQAFSTQSGMNLEWSQKCLQDNNWDYTRSAQAFTHLKAK
GEIPEVAFMK
>1JN5:C|PDBID|CHAIN|SEQUENCE
GQSPGFGQGGSV
""")
  v = validation(
    pdb_hierarchy=pdb_in.construct_hierarchy(),
    sequences=sequences,
    log=null_out(),
    nproc=1,)
  out = StringIO()
  v.show(out=out)
  assert v.chains[0].n_missing_start == 3
  assert v.chains[0].n_missing_end == 3
  assert v.chains[0].identity == 1.0
  assert v.chains[0].alignment.match_codes == 'iiimmmmmmiii'
  #
  pdb_in = iotbx.pdb.input(source_info=None, lines="""\
ATOM      2  CA  ALA A   2      -8.453  57.214 -12.754  1.00 52.95           C
ATOM      7  CA  LEU A   3      -8.574  59.274  -9.471  1.00 24.33           C
ATOM     15  CA  ARG A   4     -12.178  60.092  -8.575  1.00 28.40           C
ATOM     26  CA  GLY A   5     -14.170  61.485  -5.667  1.00 26.54           C
ATOM     30  CA  THR A   6     -17.784  60.743  -4.783  1.00 31.78           C
ATOM     37  CA  VAL A   7     -19.080  64.405  -4.464  1.00 21.31           C
""")
  seq = iotbx.bioinformatics.sequence("XALRGTV")
  v = validation(
    pdb_hierarchy=pdb_in.construct_hierarchy(),
    sequences=[seq],
    log=null_out(),
    nproc=1,)
  out = StringIO()
  v.show(out=out)
  assert v.chains[0].n_missing_start == 1
  assert v.chains[0].n_missing_end == 0
  assert v.chains[0].identity == 1.0
  assert v.chains[0].alignment.match_codes == 'immmmmm'
  #
  pdb_in = iotbx.pdb.input(source_info=None, lines="""\
ATOM   2171  CA  ASP I 355       5.591 -11.903   1.133  1.00 41.60           C
ATOM   2175  CA  PHE I 356       7.082  -8.454   0.828  1.00 39.82           C
ATOM   2186  CA  GLU I 357       5.814  -6.112  -1.877  1.00 41.12           C
ATOM   2195  CA  GLU I 358       8.623  -5.111  -4.219  1.00 42.70           C
ATOM   2199  CA  ILE I 359      10.346  -1.867  -3.363  1.00 43.32           C
ATOM   2207  CA  PRO I 360      11.658   0.659  -5.880  1.00 44.86           C
ATOM   2214  CA  GLU I 361      14.921  -0.125  -7.592  1.00 44.32           C
ATOM   2219  CA  GLU I 362      15.848   3.489  -6.866  1.00 44.27           C
HETATM 2224  CA  TYS I 363      16.482   2.005  -3.448  1.00 44.52           C
""")
  seq = iotbx.bioinformatics.sequence("NGDFEEIPEEYL")
  v = validation(
    pdb_hierarchy=pdb_in.construct_hierarchy(),
    sequences=[seq],
    log=null_out(),
    nproc=1,)
  out = StringIO()
  v.show(out=out)
  assert v.chains[0].n_missing_start == 2
  assert v.chains[0].n_missing_end == 1
  assert v.chains[0].identity == 1.0
  pdb_in = iotbx.pdb.input(source_info=None, lines="""\
ATOM    450  CA  ASN A   1      37.242  41.665  44.160  1.00 35.89           C
ATOM    458  CA  GLY A   2      37.796  38.269  42.523  1.00 30.13           C
HETATM  463  CA AMSE A   3      35.878  39.005  39.326  0.54 22.83           C
HETATM  464  CA BMSE A   3      35.892  39.018  39.323  0.46 22.96           C
ATOM    478  CA  ILE A   4      37.580  38.048  36.061  1.00 22.00           C
ATOM    486  CA  SER A   5      37.593  40.843  33.476  1.00 18.73           C
ATOM    819  CA  ALA A   8      25.982  34.781  27.220  1.00 18.43           C
ATOM    824  CA  ALA A   9      23.292  32.475  28.614  1.00 19.60           C
HETATM  830  CA BMSE A  10      22.793  30.814  25.223  0.41 22.60           C
HETATM  831  CA CMSE A  10      22.801  30.850  25.208  0.59 22.54           C
ATOM    845  CA  GLU A  11      26.504  30.054  24.966  1.00 25.19           C
ATOM    854  CA  GLY A  12      25.907  28.394  28.320  1.00 38.88           C
""")
  seq = iotbx.bioinformatics.sequence("NGMISAAAAMEG")
  v = validation(
    pdb_hierarchy=pdb_in.construct_hierarchy(),
    sequences=[seq],
    log=null_out(),
    nproc=1,)
  out = StringIO()
  v.show(out=out)
  assert v.chains[0].alignment.a == 'NGMISXXAAMEG'
  assert v.chains[0].alignment.b == 'NGMISAAAAMEG'
  pdb_in = iotbx.pdb.input(source_info=None, lines="""\
ATOM   4615  CA  ALA C   1       1.000   1.000   1.000  1.00 10.00
ATOM   4622  CA  ALA C   2       1.000   1.000   1.000  1.00 10.00
ATOM   4627  CA  ALA C   3       1.000   1.000   1.000  1.00 10.00
ATOM   4634  CA  ALA C   4       1.000   1.000   1.000  1.00 10.00
ATOM   4646  CA  ALA C   5       1.000   1.000   1.000  1.00 10.00
ATOM   4658  CA  ALA C   6       1.000   1.000   1.000  1.00 10.00
ATOM   4664  CA  ALA C   7       1.000   1.000   1.000  1.00 10.00
ATOM   4669  CA  ALA C   8       1.000   1.000   1.000  1.00 10.00
ATOM   4680  CA  ARG C   9       1.000   1.000   1.000  1.00 10.00
ATOM   4690  CA  GLY C  10       1.000   1.000   1.000  1.00 10.00
ATOM   4698  CA  PRO C  11       1.000   1.000   1.000  1.00 10.00
ATOM   4705  CA  LYS C  12       1.000   1.000   1.000  1.00 10.00
ATOM   4712  CA  TRP C  13       1.000   1.000   1.000  1.00 10.00
ATOM   4726  CA  GLU C  14       1.000   1.000   1.000  1.00 10.00
ATOM   4738  CA  SER C  15       1.000   1.000   1.000  1.00 10.00
ATOM   4744  CA  THR C  16       1.000   1.000   1.000  1.00 10.00
ATOM   4751  CA  GLY C  17       1.000   1.000   1.000  1.00 10.00
ATOM   4755  CA  TYR C  18       1.000   1.000   1.000  1.00 10.00
ATOM   4767  CA  PHE C  19       1.000   1.000   1.000  1.00 10.00
ATOM   4778  CA  ALA C  20       1.000   1.000   1.000  1.00 10.00
ATOM   4786  CA  ALA C  21       1.000   1.000   1.000  1.00 10.00
ATOM   4798  CA  TRP C  22       1.000   1.000   1.000  1.00 10.00
ATOM   4812  CA  GLY C  23       1.000   1.000   1.000  1.00 10.00
ATOM   4816  CA  GLN C  24       1.000   1.000   1.000  1.00 10.00
ATOM   4822  CA  GLY C  25       1.000   1.000   1.000  1.00 10.00
ATOM   4826  CA  THR C  26       1.000   1.000   1.000  1.00 10.00
ATOM   4833  CA  LEU C  27       1.000   1.000   1.000  1.00 10.00
ATOM   4841  CA  VAL C  28       1.000   1.000   1.000  1.00 10.00
ATOM   4848  CA  THR C  29       1.000   1.000   1.000  1.00 10.00
ATOM   4855  CA  VAL C  30       1.000   1.000   1.000  1.00 10.00
ATOM   4862  CA  SER C  31       1.000   1.000   1.000  1.00 10.00
ATOM   4868  CA  SER C  32       1.000   1.000   1.000  1.00 10.00
END
""")
  seq = iotbx.bioinformatics.sequence(
    "AAAAAAAARGKWESPAALLKKAAWCSGTLVTVSSASAPKWKSTSGCYFAAPWNKRALRVTVLQSS")
  v = validation(
    pdb_hierarchy=pdb_in.construct_hierarchy(),
    sequences=[seq],
    log=null_out(),
    nproc=1,)
  out = StringIO()
  v.show(out=out)
  # all tests below here have additional dependencies: the ksdssp module and
  # the phenix_regression/pdb/1ywf.pdb test file
  if (not libtbx.env.has_module("ksdssp")) :
    print "Skipping advanced tests (require ksdssp module)"
    return
  pdb_file = libtbx.env.find_in_repositories(
    relative_path="phenix_regression/pdb/1ywf.pdb",
    test=os.path.isfile)
  if (pdb_file is not None) :
    seq = iotbx.bioinformatics.sequence("MGSSHHHHHHSSGLVPRGSHMAVRELPGAWNFRDVADTATALRPGRLFRSSELSRLDDAGRATLRRLGITDVADLRSSREVARRGPGRVPDGIDVHLLPFPDLADDDADDSAPHETAFKRLLTNDGSNGESGESSQSINDAATRYMTDEYRQFPTRNGAQRALHRVVTLLAAGRPVLTHCFAGKDRTGFVVALVLEAVGLDRDVIVADYLRSNDSVPQLRARISEMIQQRFDTELAPEVVTFTKARLSDGVLGVRAEYLAAARQTIDETYGSLGGYLRDAGISQATVNRMRGVLLG")
    pdb_in = file_reader.any_file(pdb_file, force_type="pdb")
    hierarchy = pdb_in.file_object.hierarchy
    v = validation(
      pdb_hierarchy=hierarchy,
      sequences=[seq],
      log=null_out(),
      nproc=1,
      include_secondary_structure=True,
      extract_coordinates=True)
    out = StringIO()
    v.show(out=out)
    aln1, aln2, ss = v.chains[0].get_alignment(include_sec_str=True)
    assert ("HHH" in ss) and ("LLL" in ss) and ("---" in ss)
    cif_block = v.as_cif_block()
    assert cif_block['_struct_ref.pdbx_seq_one_letter_code'] == seq.sequence
    assert list(
      cif_block['_struct_ref_seq.pdbx_auth_seq_align_beg']) == ['4', '117']
    assert list(
      cif_block['_struct_ref_seq.pdbx_auth_seq_align_end']) == ['85', '275']
    assert list(cif_block['_struct_ref_seq.seq_align_beg']) == ['1', '114']
    assert list(cif_block['_struct_ref_seq.seq_align_end']) == ['82', '272']
    # determine relative counts of sequences and chains
    n_seq = get_sequence_n_copies(
      pdb_hierarchy=hierarchy,
      sequences=[seq] * 4,
      copies_from_xtriage=4,
      out=null_out())
    assert (n_seq == 1)
    hierarchy = hierarchy.deep_copy()
    chain2 = hierarchy.only_model().chains()[0].detached_copy()
    hierarchy.only_model().append_chain(chain2)
    n_seq = get_sequence_n_copies(
      pdb_hierarchy=hierarchy,
      sequences=[seq] * 4,
      copies_from_xtriage=2,
      out=null_out())
    assert (n_seq == 1)
    n_seq = get_sequence_n_copies(
      pdb_hierarchy=hierarchy,
      sequences=[seq],
      copies_from_xtriage=2,
      out=null_out())
    assert (n_seq == 4)
    try :
      n_seq = get_sequence_n_copies(
        pdb_hierarchy=hierarchy,
        sequences=[seq] * 3,
        copies_from_xtriage=2,
        out=null_out())
    except Sorry, s :
      assert ("round number" in str(s))
    else :
      raise Exception_expected
    n_seq = get_sequence_n_copies(
      pdb_hierarchy=hierarchy,
      sequences=[seq] * 3,
      copies_from_xtriage=2,
      force_accept_composition=True,
      out=null_out())
    assert (n_seq == 1)
    try :
      n_seq = get_sequence_n_copies(
        pdb_hierarchy=hierarchy,
        sequences=[seq] * 4,
        copies_from_xtriage=1,
        out=null_out())
    except Sorry, s :
      assert ("less than" in str(s))