Esempio n. 1
0
class cif(DictMixin):
    """Ordered container of CIF data blocks with case-insensitive lookup.

    The blocks are held in ``self.blocks`` (an ``OrderedDict`` keyed by the
    block name as it was given).  ``self.keys_lower`` maps every lower-cased
    block name back to that original spelling.
    """

    def __init__(self, blocks=None):
        """Create the container, optionally seeded from ``blocks``.

        ``blocks`` is handed to ``OrderedDict`` unchanged when it is not None.
        """
        self.blocks = OrderedDict() if blocks is None else OrderedDict(blocks)
        self.keys_lower = dict(
            (name.lower(), name) for name in self.blocks.keys())

    def __setitem__(self, key, value):
        """Store the ``block`` instance ``value`` under the name ``key``.

        Raises ``Sorry`` when ``'_' + key`` does not match ``tag_re``.
        """
        assert isinstance(value, block)
        candidate_tag = '_' + key
        if re.match(tag_re, candidate_tag) is None:
            raise Sorry("%s is not a valid data block name" % key)
        self.blocks[key] = value
        self.keys_lower[key.lower()] = key

    def get(self, key, default=None):
        """Return the block named ``key`` (any case), else ``default``."""
        original_name = self.keys_lower.get(key.lower())
        if original_name is not None:
            return self.blocks.get(original_name, default)
        return default

    def __getitem__(self, key):
        """Return the block named ``key`` (any case) or raise ``KeyError``."""
        found = self.get(key)
        if found is not None:
            return found
        raise KeyError('Unknown CIF data block name: "%s"' % key)

    def __delitem__(self, key):
        """Remove the block named ``key`` (any case) and its lookup entry."""
        lowered = key.lower()
        del self.blocks[self.keys_lower[lowered]]
        del self.keys_lower[lowered]

    def keys(self):
        """Return the block names as stored in ``self.blocks``."""
        block_names = self.blocks.keys()
        return block_names

    def __repr__(self):
        ordered_view = OrderedDict(self.iteritems())
        return repr(ordered_view)

    def __copy__(self):
        """Return a new ``cif`` built from a shallow copy of the blocks."""
        shallow_blocks = self.blocks.copy()
        return cif(shallow_blocks)

    copy = __copy__

    def __deepcopy__(self, memo):
        """Return a new ``cif`` built from a deep copy of the blocks."""
        deep_blocks = copy.deepcopy(self.blocks, memo)
        return cif(deep_blocks)

    def deepcopy(self):
        """Convenience wrapper around ``copy.deepcopy(self)``."""
        return copy.deepcopy(self)

    def show(self,
             out=None,
             indent="  ",
             indent_row=None,
             data_name_field_width=34,
             loop_format_strings=None,
             align_columns=True):
        """Write every block to ``out`` (``sys.stdout`` when None).

        Each block is preceded by a ``data_<name>`` line; all formatting
        options are forwarded unchanged to the block's own ``show``.
        """
        if out is None:
            out = sys.stdout
        block_options = dict(
            out=out,
            indent=indent,
            indent_row=indent_row,
            data_name_field_width=data_name_field_width,
            loop_format_strings=loop_format_strings,
            align_columns=align_columns)
        for name, data_block in self.items():
            print >> out, "data_%s" % name
            data_block.show(**block_options)

    def __str__(self):
        """Return what ``show`` would write, as a string."""
        buf = StringIO()
        self.show(out=buf)
        return buf.getvalue()

    def validate(self,
                 dictionary,
                 show_warnings=True,
                 error_handler=None,
                 out=None):
        """Validate every block against ``dictionary``.

        A fresh handler (same class as ``error_handler``, or
        ``validation.ErrorHandler`` when None) is installed on ``dictionary``
        for each block.  Handlers that counted errors or warnings are shown
        on ``out`` (``sys.stdout`` when None).  The handler of the last block
        processed is returned.
        """
        if out is None:
            out = sys.stdout
        from iotbx.cif import validation
        # per-block handlers are gathered here; the dict itself is not returned
        errors = {}
        if error_handler is None:
            error_handler = validation.ErrorHandler()
        handler_class = error_handler.__class__
        for block_name, data_block in self.blocks.iteritems():
            error_handler = handler_class()
            dictionary.set_error_handler(error_handler)
            data_block.validate(dictionary)
            errors.setdefault(block_name, error_handler)
            if error_handler.error_count or error_handler.warning_count:
                error_handler.show(show_warnings=show_warnings, out=out)
        return error_handler

    def sort(self, recursive=False, key=None, reverse=False):
        """Re-order the blocks with ``sorted`` over the (name, block) items.

        With ``recursive`` true each block's own ``sort`` is called as well
        (``reverse`` is passed on, ``key`` is not).
        """
        ordered_items = sorted(self.blocks.items(), key=key, reverse=reverse)
        self.blocks = OrderedDict(ordered_items)
        if not recursive:
            return
        for child in self.blocks.values():
            child.sort(recursive=recursive, reverse=reverse)
Esempio n. 2
0
class loop(DictMixin):
    """A CIF ``loop_``: an ordered set of named columns.

    Columns are stored in ``self._columns`` (an ``OrderedDict`` keyed by the
    data name as given).  ``self.keys_lower`` maps each lower-cased data name
    back to that spelling, which makes item access case-insensitive.
    """

    def __init__(self, header=None, data=None):
        """Build a loop from a header (plus optional flat data) or a dict.

        With ``header`` given, one empty ``flex.std_string`` column is created
        per entry and ``data``, if given, is added row by row; its length must
        be an exact multiple of ``len(header)``.  With only ``data`` given it
        must be a dict mapping data names to columns.
        """
        self._columns = OrderedDict()
        self.keys_lower = {}
        if header is not None:
            for key in header:
                self.setdefault(key, flex.std_string())
            if data is not None:
                # the number of data items must be an exact multiple of the
                # number of headers
                assert len(data) % len(
                    header) == 0, "Wrong number of data items for loop"
                n_columns = len(header)
                n_rows = len(data) // n_columns
                for i in range(n_rows):
                    self.add_row(
                        [data[i * n_columns + j] for j in range(n_columns)])
        elif data is not None:
            assert isinstance(data, (dict, OrderedDict))
            # every column goes through __setitem__, which keeps keys_lower
            # up to date, so no separate rebuild is needed afterwards
            self.add_columns(data)

    def __setitem__(self, key, value):
        """Store ``value`` as the column named ``key``.

        Raises ``Sorry`` when ``key`` does not match ``tag_re``.  When the
        loop already has columns, ``value`` must have ``self.size()`` items.
        Values that are not ``flex.std_string`` are converted to one.
        """
        if not re.match(tag_re, key):
            raise Sorry("%s is not a valid data name" % key)
        if len(self) > 0:
            assert len(value) == self.size()
        if not isinstance(value, flex.std_string):
            # flex numeric arrays are rendered with as_string(); anything else
            # is tried as flex.int, then flex.double (a TypeError moves on to
            # the next type)
            for flex_numeric_type in (flex.int, flex.double):
                if isinstance(value, flex_numeric_type):
                    value = value.as_string()
                else:
                    try:
                        value = flex_numeric_type(value).as_string()
                    except TypeError:
                        continue
                    else:
                        break
            if not isinstance(value, flex.std_string):
                value = flex.std_string(value)
        # value must be a mutable type
        assert hasattr(value, '__setitem__')
        self._columns[key] = value
        self.keys_lower[key.lower()] = key

    def __getitem__(self, key):
        """Return the column named ``key`` (any case)."""
        return self._columns[self.keys_lower[key.lower()]]

    def __delitem__(self, key):
        """Remove the column named ``key`` (any case) and its lookup entry."""
        lowered = key.lower()
        del self._columns[self.keys_lower[lowered]]
        del self.keys_lower[lowered]

    def keys(self):
        """Return the data names in column order."""
        return self._columns.keys()

    def __repr__(self):
        return repr(OrderedDict(self.iteritems()))

    def name(self):
        """Return the common substring of the keys without trailing '_'/'.'."""
        return common_substring(self.keys()).rstrip('_').rstrip('.')

    def size(self):
        """Return the length of the longest column (0 without columns)."""
        size = 0
        for column in self.values():
            size = max(size, len(column))
        return size

    def n_rows(self):
        """Alias of ``size()``."""
        return self.size()

    def n_columns(self):
        """Return the number of columns."""
        return len(self.keys())

    def add_row(self, row, default_value="?"):
        """Append one row; every item is stored as ``str(item)``.

        ``row`` is either a dict keyed by data name (columns missing from it
        receive ``default_value``) or a sequence with one item per column, in
        column order.
        """
        if isinstance(row, dict):
            for key in self:
                if key in row:
                    self[key].append(str(row[key]))
                else:
                    self[key].append(default_value)
        else:
            assert len(row) == len(self)
            for i, key in enumerate(self):
                self[key].append(str(row[i]))

    def add_column(self, key, values):
        """Add column ``key``; its length must match a non-empty loop."""
        if self.size() != 0:
            assert len(values) == self.size()
        # __setitem__ also records the key in keys_lower
        self[key] = values

    def add_columns(self, columns):
        """Add every (key, values) pair of the dict ``columns``."""
        assert isinstance(columns, (dict, OrderedDict))
        for key, value in columns.iteritems():
            self.add_column(key, value)

    def update_column(self, key, values):
        """Replace column ``key``; its length must match a non-empty loop."""
        assert isinstance(key, str), "first argument is column key string"
        if self.size() != 0:
            assert len(
                values) == self.size(), "len(values) %d != self.size() %d" % (
                    len(values),
                    self.size(),
                )
        # __setitem__ also records the key in keys_lower
        self[key] = values

    def delete_row(self, index):
        """Delete item ``index`` from every column."""
        assert index < self.n_rows()
        for column in self._columns.values():
            del column[index]

    def __copy__(self):
        """Return a new loop sharing the column objects of this one."""
        new = loop()
        new._columns = self._columns.copy()
        new.keys_lower = self.keys_lower.copy()
        return new

    copy = __copy__

    def __deepcopy__(self, memo):
        """Return a new loop holding deep copies of the columns."""
        new = loop()
        new._columns = copy.deepcopy(self._columns, memo)
        new.keys_lower = copy.deepcopy(self.keys_lower, memo)
        return new

    def deepcopy(self):
        """Convenience wrapper around ``copy.deepcopy(self)``."""
        return copy.deepcopy(self)

    def show(self,
             out=None,
             indent="  ",
             indent_row=None,
             fmt_str=None,
             align_columns=True):
        """Write the loop to ``out`` (``sys.stdout`` when None).

        The loop must have at least one row and one column.  ``indent``
        precedes each data name and ``indent_row`` (default: ``indent``) each
        data row; both must be whitespace only.  Rows are rendered in one of
        three ways:

        * ``fmt_str`` given: each row is ``fmt_str % tuple(row)``, working on
          a deep copy of the columns converted to numeric flex arrays where
          that succeeds;
        * ``align_columns``: every stored value is replaced in place by
          ``format_value(value)`` and columns are padded to a common width;
        * otherwise: the formatted values of a row are joined with spaces.
        """
        assert self.n_rows() > 0 and self.n_columns() > 0, "keys: %s %d %d" % (
            self.keys(),
            self.n_rows(),
            self.n_columns(),
        )
        if out is None:
            out = sys.stdout
        if indent_row is None:
            indent_row = indent
        assert indent.strip() == ""
        assert indent_row.strip() == ""
        print >> out, "loop_"
        for k in self.keys():
            print >> out, indent + k
        values = self._columns.values()
        range_len_values = range(len(values))
        if fmt_str is not None:
            # Pretty printing:
            #   The user is responsible for providing a valid format string.
            #   Values are not quoted - it is the user's responsibility to place
            #   appropriate quotes in the format string if a particular value may
            #   contain spaces.
            values = copy.deepcopy(values)
            for i, v in enumerate(values):
                for flex_numeric_type in (flex.int, flex.double):
                    if not isinstance(v, flex_numeric_type):
                        try:
                            values[i] = flex_numeric_type(v)
                        except ValueError:
                            continue
                        else:
                            break
            for i in range(self.size()):
                print >> out, fmt_str % tuple(
                    [values[j][i] for j in range_len_values])
        elif align_columns:
            fmt_str = []
            for _key, v in self.iteritems():
                # NOTE: this rewrites the stored column values in place
                for i_v in range(v.size()):
                    v[i_v] = format_value(v[i_v])
                # exclude semicolon text fields from column width calculation
                v_ = flex.std_string(item for item in v if "\n" not in item)
                width = v_.max_element_length()
                # See if column contains only number, '.' or '?'
                # right-align numerical columns, left-align everything else
                v = v.select(~((v == ".") | (v == "?")))
                try:
                    flex.double(v)
                except ValueError:
                    width *= -1
                fmt_str.append("%%%is" % width)
            fmt_str = indent_row + "  ".join(fmt_str)
            for i in range(self.size()):
                print >> out, (fmt_str %
                               tuple([values[j][i]
                                      for j in range_len_values])).rstrip()
        else:
            for i in range(self.size()):
                values_to_print = [
                    format_value(values[j][i]) for j in range_len_values
                ]
                print >> out, ' '.join([indent] + values_to_print)

    def __str__(self):
        """Return what ``show`` would write, as a string."""
        s = StringIO()
        self.show(out=s)
        return s.getvalue()

    def iterrows(self):
        """ Warning! Still super-slow!

        Yield each row as an ``OrderedDict`` mapping data name to value.
        """
        keys = self.keys()
        s_values = self.values()
        range_len_self = range(len(self))
        # range is 1% faster than xrange in this particular place.
        # tuple (s_values...) is slightly faster than list
        for j in range(self.size()):
            yield OrderedDict(
                zip(keys, (s_values[i][j] for i in range_len_self)))

    def find_row(self, kv_dict):
        """Return the rows whose values equal those given in ``kv_dict``.

        Every key of ``kv_dict`` must be a key of the loop.  The result is a
        list of ``OrderedDict`` rows (data name -> value).
        """
        self_keys = self.keys()
        for k in kv_dict:
            assert k in self_keys
        result = []
        s_values = self.values()
        range_len_self = range(len(self))
        for i in range(self.size()):
            goodrow = True
            for k, v in kv_dict.iteritems():
                if self[k][i] != v:
                    goodrow = False
                    break
            if goodrow:
                result.append(
                    OrderedDict(
                        zip(self_keys,
                            [s_values[j][i] for j in range_len_self])))
        return result

    def sort(self, key=None, reverse=False):
        """Re-order the columns with ``sorted`` over the (key, column) items."""
        self._columns = OrderedDict(
            sorted(self._columns.items(), key=key, reverse=reverse))

    def order(self, order):
        """Re-order the columns to follow the sequence of keys ``order``.

        Only the keys listed in ``order`` are carried over into the new
        column mapping; a key that is not a current column raises
        ``KeyError`` and leaves the loop unchanged.
        """
        reordered = OrderedDict()
        for o in order:
            reordered[o] = self._columns[o]
        self._columns = reordered

    def __eq__(self, other):
        """Two loops are equal when keys, shape and all values agree."""
        if (len(self) != len(other) or self.size() != other.size()
                or self.keys() != other.keys()):
            return False
        for value, other_value in zip(self.values(), other.values()):
            if (value == other_value).count(True) != len(value):
                return False
        return True
Esempio n. 3
0
class miller_array_builder(crystal_symmetry_builder):
  """Build miller arrays from the reflection loops of a CIF block.

  The arrays are collected in ``self._arrays``, an ``OrderedDict`` keyed by
  data name; ``_<id>`` suffixes are appended to the key for each wavelength
  id, crystal id or scale group the array was selected for.
  """

  # observation type objects keyed by data name
  observation_types = {
    '_refln_F_squared': xray.intensity(),
    '_refln_intensity': xray.intensity(),
    '_refln_F': xray.amplitude(),
    '_refln_A': None,
  }

  def __init__(self, cif_block, base_array_info=None, wavelengths=None):
    """Populate ``self._arrays`` from ``cif_block``.

    cif_block: the block to read; passed on to ``crystal_symmetry_builder``.
    base_array_info: optional array info.  Its ``crystal_symmetry_from_file``
      is joined into ``self.crystal_symmetry``, its labels are prepended to
      the labels of every array, and it is the template for each array's
      info.  Defaults to ``miller.array_info(source_type="cif")``.
    wavelengths: optional mapping looked up with the wavelength id to obtain
      the wavelength recorded in the array info.

    Raises ``CifBuilderError`` when no array could be built.
    """
    def rstrip_substrings(string, substrings):
      # remove each non-empty substring, in the given order, from the end
      for substr in substrings:
        if substr == '': continue
        if string.endswith(substr):
          string = string[:-len(substr)]
      return string

    crystal_symmetry_builder.__init__(self, cif_block)
    if base_array_info is not None:
      self.crystal_symmetry = self.crystal_symmetry.join_symmetry(
        other_symmetry=base_array_info.crystal_symmetry_from_file,
      force=True)
    self._arrays = OrderedDict()
    if (wavelengths is None) :
      wavelengths = {}
    if base_array_info is None:
      base_array_info = miller.array_info(source_type="cif")
    refln_containing_loops = self.get_miller_indices_containing_loops()
    for self.indices, refln_loop in refln_containing_loops:
      self.wavelength_id_array = None
      self.crystal_id_array = None
      self.scale_group_array = None
      wavelength_ids = [None]
      crystal_ids = [None]
      scale_groups = [None]
      for key, value in refln_loop.iteritems():
        # need to get these arrays first
        if (key.endswith('wavelength_id') or
            key.endswith('crystal_id') or
            key.endswith('scale_group_code')):
          data = as_int_or_none_if_all_question_marks(value, column_name=key)
          if data is None:
            continue
          counts = data.counts()
          if key.endswith('wavelength_id'):
            wavelength_ids = counts.keys()
          # a single distinct id needs no selection array
          if len(counts) == 1: continue
          array = miller.array(
            miller.set(self.crystal_symmetry, self.indices).auto_anomalous(), data)
          if key.endswith('wavelength_id'):
            self.wavelength_id_array = array
            wavelength_ids = counts.keys()
          elif key.endswith('crystal_id'):
            self.crystal_id_array = array
            crystal_ids = counts.keys()
          elif key.endswith('scale_group_code'):
            self.scale_group_array = array
            scale_groups = counts.keys()
      for label, value in sorted(refln_loop.items()):
        for w_id in wavelength_ids:
          for crys_id in crystal_ids:
            for scale_group in scale_groups:
              if 'index_' in label: continue
              key = label
              labels = [label]
              wavelength = None
              # the id columns themselves are never split by id
              if (key.endswith('wavelength_id') or
                    key.endswith('crystal_id') or
                    key.endswith('scale_group_code')):
                w_id = None
                crys_id = None
                scale_group = None
              key_suffix = ''
              if w_id is not None:
                key_suffix += '_%i' %w_id
                labels.insert(0, "wavelength_id=%i" %w_id)
                wavelength = wavelengths.get(w_id, None)
              if crys_id is not None:
                key_suffix += '_%i' %crys_id
                labels.insert(0, "crystal_id=%i" %crys_id)
              if scale_group is not None:
                key_suffix += '_%i' %scale_group
                labels.insert(0, "scale_group_code=%i" %scale_group)
              key += key_suffix
              sigmas = None
              if key in self._arrays: continue
              array = self.flex_std_string_as_miller_array(
                value, wavelength_id=w_id, crystal_id=crys_id,
                scale_group_code=scale_group)
              if array is None: continue
              if '_sigma' in key:
                # attach sigmas to the matching data array if it exists already
                sigmas_label = label
                key = None
                for suffix in ('', '_meas', '_calc'):
                  if sigmas_label.replace('_sigma', suffix) in refln_loop:
                    key = sigmas_label.replace('_sigma', suffix) + key_suffix
                    break
                if key is None:
                  key = sigmas_label + key_suffix
                elif key in self._arrays and self._arrays[key].sigmas() is None:
                  sigmas = array
                  array = self._arrays[key]
                  check_array_sizes(array, sigmas, key, sigmas_label)
                  sigmas = as_flex_double(sigmas, sigmas_label)
                  array.set_sigmas(sigmas.data())
                  info = array.info()
                  array.set_info(
                    info.customized_copy(labels=info.labels+[sigmas_label],
                      wavelength=wavelength))
                  continue
              elif 'PHWT' in key:
                # combine PHWT phases with the FWT array stored earlier
                phwt_label = label
                fwt_label = label.replace('PHWT', 'FWT')
                if fwt_label not in refln_loop: continue
                phwt_array = array
                if fwt_label in self._arrays:
                  array = self._arrays[fwt_label]
                  check_array_sizes(array, phwt_array, fwt_label, phwt_label)
                  phases = as_flex_double(phwt_array, phwt_label)
                  info = array.info()
                  array = array.phase_transfer(phases, deg=True)
                  array.set_info(
                    info.customized_copy(labels=info.labels+[phwt_label]))
                  self._arrays[fwt_label] = array
                  continue
              elif 'HL_' in key:
                # the four HL_A..HL_D columns are stored once, under the HL_A key
                hl_letter = key[key.find('HL_')+3]
                hl_key = 'HL_' + hl_letter
                key = key.replace(hl_key, 'HL_A')
                if key in self._arrays:
                  continue # this array is already dealt with
                hl_labels = [label.replace(hl_key, 'HL_'+letter) for letter in 'ABCD']
                hl_values = [cif_block.get(hl_key) for hl_key in hl_labels]
                if hl_values.count(None) == 0:
                  selection = self.get_selection(
                    hl_values[0], wavelength_id=w_id,
                    crystal_id=crys_id, scale_group_code=scale_group)
                  hl_values = [as_double_or_none_if_all_question_marks(
                    hl.select(selection), column_name=lab)
                               for hl, lab in zip(hl_values, hl_labels)]
                  array = miller.array(miller.set(
                    self.crystal_symmetry, self.indices.select(selection)
                    ).auto_anomalous(), flex.hendrickson_lattman(*hl_values))
                  labels = labels[:-1]+hl_labels
              elif '.B_' in key or '_B_' in key:
                # B part: merged with the A part stored earlier into complex data
                if '.B_' in key:
                  key, key_b = key.replace('.B_', '.A_'), key
                  label, label_b = label.replace('.B_', '.A_'), label
                elif '_B_' in key:
                  key, key_b = key.replace('_B', '_A'), key
                  label, label_b = label.replace('_B', '_A'), label
                if key in refln_loop and key_b in refln_loop:
                  b_part = array.data()
                  if key in self._arrays:
                    info = self._arrays[key].info()
                    a_part = self._arrays[key].data()
                    self._arrays[key] = self._arrays[key].array(
                      data=flex.complex_double(a_part, b_part))
                    self._arrays[key].set_info(
                      info.customized_copy(labels=info.labels+[key_b]))
                    continue
              elif ('phase_' in key and "_meas" not in key and
                    self.crystal_symmetry.space_group() is not None):
                # phases: transferred onto the matching F (or F..._au) array
                alt_key1 = label.replace('phase_', 'F_')
                alt_key2 = alt_key1 + '_au'
                if alt_key1 in refln_loop:
                  phase_key = label
                  key = alt_key1+key_suffix
                elif alt_key2 in refln_loop:
                  phase_key = label
                  key = alt_key2+key_suffix
                else: phase_key = None
                if phase_key is not None:
                  phases = array.data()
                  if key in self._arrays:
                    array = self._arrays[key]
                    array = as_flex_double(array, key)
                    check_array_sizes(array, phases, key, phase_key)
                    info = self._arrays[key].info()
                    self._arrays[key] = array.phase_transfer(phases, deg=True)
                    self._arrays[key].set_info(
                      info.customized_copy(labels=info.labels+[phase_key]))
                  else:
                    array = self.flex_std_string_as_miller_array(
                      refln_loop[label], wavelength_id=w_id, crystal_id=crys_id,
                      scale_group_code=scale_group)
                    check_array_sizes(array, phases, key, phase_key)
                    # NOTE(review): the result of phase_transfer() is discarded
                    # here, unlike in the branches above - confirm intent
                    array.phase_transfer(phases, deg=True)
                    labels = labels+[label, phase_key]
              if base_array_info.labels is not None:
                labels = base_array_info.labels + labels
              # determine observation type
              stripped_key = rstrip_substrings(
                key, [key_suffix, '_au', '_meas', '_calc', '_plus', '_minus'])
              if (stripped_key.endswith('F_squared') or
                  stripped_key.endswith('intensity') or
                  stripped_key.endswith('.I') or
                  stripped_key.endswith('_I')) and (
                    array.is_real_array() or array.is_integer_array()):
                array.set_observation_type_xray_intensity()
              elif (stripped_key.endswith('F') and (
                array.is_real_array() or array.is_integer_array())):
                array.set_observation_type_xray_amplitude()
              if (array.is_xray_intensity_array() or
                  array.is_xray_amplitude_array()):
                # e.g. merge_equivalents treats integer arrays differently, so must
                # convert integer observation arrays here to be safe
                if isinstance(array.data(), flex.int):
                  array = array.customized_copy(data=array.data().as_double())
              array.set_info(base_array_info.customized_copy(labels=labels))
              if (array.is_xray_intensity_array() or
                  array.is_xray_amplitude_array()):
                # observation arrays additionally record the wavelength
                info = array.info()
                array.set_info(info.customized_copy(wavelength=wavelength))
              self._arrays.setdefault(key, array)
    # merge matching *_plus / *_minus arrays into a single anomalous array,
    # iterating over a copy because entries are popped from self._arrays
    for key, array in self._arrays.copy().iteritems():
      if (   key.endswith('_minus') or '_minus_' in key
          or key.endswith('_plus') or '_plus_' in key):
        if '_minus' in key:
          minus_key = key
          plus_key = key.replace('_minus', '_plus')
        elif '_plus' in key:
          plus_key = key
          minus_key = key.replace('_plus', '_minus')
        if plus_key in self._arrays and minus_key in self._arrays:
          plus_array = self._arrays.pop(plus_key)
          minus_array = self._arrays.pop(minus_key)
          # the minus array enters with negated indices
          minus_array = minus_array.customized_copy(
            indices=-minus_array.indices()).set_info(minus_array.info())
          array = plus_array.concatenate(
            minus_array, assert_is_similar_symmetry=False)
          array = array.customized_copy(anomalous_flag=True)
          array.set_info(minus_array.info().customized_copy(
            labels=list(
              OrderedSet(plus_array.info().labels+minus_array.info().labels))))
          array.set_observation_type(plus_array.observation_type())
          self._arrays.setdefault(key, array)

    if len(self._arrays) == 0:
      raise CifBuilderError("No reflection data present in cif block")

  def get_miller_indices_containing_loops(self):
    """Return ``(indices, loop)`` pairs for loops holding Miller indices.

    A loop qualifies when one of its keys contains ``index_h``; the
    corresponding ``index_k`` and ``index_l`` columns must then be present
    and all three must convert to ``flex.int``, otherwise
    ``CifBuilderError`` is raised.
    """
    loops = []
    for loop in self.cif_block.loops.values():
      for key in loop.keys():
        if 'index_h' not in key: continue
        hkl_str = [loop.get(key.replace('index_h', 'index_%s' %i)) for i in 'hkl']
        if hkl_str.count(None) > 0:
          raise CifBuilderError(
            "Miller indices missing from current CIF block (%s)"
            %key.replace('index_h', 'index_%s' %'hkl'[hkl_str.index(None)]))
        hkl_int = []
        for i,h_str in enumerate(hkl_str):
          try:
            h_int = flex.int(h_str)
          except ValueError as e:
            raise CifBuilderError(
              "Invalid item for Miller index %s: %s" % ("HKL"[i], str(e)))
          hkl_int.append(h_int)
        indices = flex.miller_index(*hkl_int)
        loops.append((indices, loop))
        # only the first index_h key of a loop is considered
        break
    return loops
Esempio n. 4
0
class loop(DictMixin):
  """A CIF ``loop_``: an ordered set of named columns.

  Columns are stored in ``self._columns`` (an ``OrderedDict`` keyed by the
  data name as given).  ``self.keys_lower`` maps each lower-cased data name
  back to that spelling, which makes item access case-insensitive.
  """

  def __init__(self, header=None, data=None):
    """Build a loop from a header (plus optional flat data) or a dict.

    With ``header`` given, one empty ``flex.std_string`` column is created per
    entry and ``data``, if given, is added row by row; its length must be an
    exact multiple of ``len(header)``.  With only ``data`` given it must be a
    dict mapping data names to columns.
    """
    self._columns = OrderedDict()
    self.keys_lower = {}
    if header is not None:
      for key in header:
        self.setdefault(key, flex.std_string())
      if data is not None:
        # the number of data items must be an exact multiple of the number of headers
        assert len(data) % len(header) == 0, "Wrong number of data items for loop"
        n_columns = len(header)
        n_rows = len(data)//n_columns
        for i in range(n_rows):
          self.add_row([data[i*n_columns+j] for j in range(n_columns)])
    elif data is not None:
      assert isinstance(data, (dict, OrderedDict))
      # every column goes through __setitem__, which keeps keys_lower up to
      # date, so no separate rebuild is needed afterwards
      self.add_columns(data)

  def __setitem__(self, key, value):
    """Store ``value`` as the column named ``key``.

    Raises ``Sorry`` when ``key`` does not match ``tag_re``.  When the loop
    already has columns, ``value`` must have ``self.size()`` items.  Values
    that are not ``flex.std_string`` are converted to one.
    """
    if not re.match(tag_re, key):
      raise Sorry("%s is not a valid data name" %key)
    if len(self) > 0:
      assert len(value) == self.size()
    if not isinstance(value, flex.std_string):
      # flex numeric arrays are rendered with as_string(); anything else is
      # tried as flex.int, then flex.double (a TypeError moves on to the next
      # type)
      for flex_numeric_type in (flex.int, flex.double):
        if isinstance(value, flex_numeric_type):
          value = value.as_string()
        else:
          try:
            value = flex_numeric_type(value).as_string()
          except TypeError:
            continue
          else:
            break
      if not isinstance(value, flex.std_string):
        value = flex.std_string(value)
    # value must be a mutable type
    assert hasattr(value, '__setitem__')
    self._columns[key] = value
    self.keys_lower[key.lower()] = key

  def __getitem__(self, key):
    """Return the column named ``key`` (any case)."""
    return self._columns[self.keys_lower[key.lower()]]

  def __delitem__(self, key):
    """Remove the column named ``key`` (any case) and its lookup entry."""
    lowered = key.lower()
    del self._columns[self.keys_lower[lowered]]
    del self.keys_lower[lowered]

  def keys(self):
    """Return the data names in column order."""
    return self._columns.keys()

  def __repr__(self):
    return repr(OrderedDict(self.iteritems()))

  def name(self):
    """Return the common substring of the keys without trailing '_'/'.'."""
    return common_substring(self.keys()).rstrip('_').rstrip('.')

  def size(self):
    """Return the length of the longest column (0 without columns)."""
    size = 0
    for column in self.values():
      size = max(size, len(column))
    return size

  def n_rows(self):
    """Alias of ``size()``."""
    return self.size()

  def n_columns(self):
    """Return the number of columns."""
    return len(self.keys())

  def add_row(self, row, default_value="?"):
    """Append one row; every item is stored as ``str(item)``.

    ``row`` is either a dict keyed by data name (columns missing from it
    receive ``default_value``) or a sequence with one item per column, in
    column order.
    """
    if isinstance(row, dict):
      for key in self:
        if key in row:
          self[key].append(str(row[key]))
        else:
          self[key].append(default_value)
    else:
      assert len(row) == len(self)
      for i, key in enumerate(self):
        self[key].append(str(row[i]))

  def add_column(self, key, values):
    """Add column ``key``; its length must match a non-empty loop."""
    if self.size() != 0:
      assert len(values) == self.size()
    # __setitem__ also records the key in keys_lower
    self[key] = values

  def add_columns(self, columns):
    """Add every (key, values) pair of the dict ``columns``."""
    assert isinstance(columns, (dict, OrderedDict))
    for key, value in columns.iteritems():
      self.add_column(key, value)

  def update_column(self, key, values):
    """Replace column ``key``; its length must match a non-empty loop."""
    assert isinstance(key, str), "first argument is column key string"
    if self.size() != 0:
      assert len(values) == self.size(), "len(values) %d != self.size() %d" % (
        len(values),
        self.size(),
        )
    # __setitem__ also records the key in keys_lower
    self[key] = values

  def delete_row(self, index):
    """Delete item ``index`` from every column."""
    assert index < self.n_rows()
    for column in self._columns.values():
      del column[index]

  def __copy__(self):
    """Return a new loop sharing the column objects of this one."""
    new = loop()
    new._columns = self._columns.copy()
    new.keys_lower = self.keys_lower.copy()
    return new

  copy = __copy__

  def __deepcopy__(self, memo):
    """Return a new loop holding deep copies of the columns."""
    new = loop()
    new._columns = copy.deepcopy(self._columns, memo)
    new.keys_lower = copy.deepcopy(self.keys_lower, memo)
    return new

  def deepcopy(self):
    """Convenience wrapper around ``copy.deepcopy(self)``."""
    return copy.deepcopy(self)

  def show(self, out=None, indent="  ", indent_row=None, fmt_str=None, align_columns=True):
    """Write the loop to ``out`` (``sys.stdout`` when None).

    The loop must have at least one row and one column.  ``indent`` precedes
    each data name and ``indent_row`` (default: ``indent``) each data row;
    both must be whitespace only.  Rows are rendered in one of three ways:

    * ``fmt_str`` given: each row is ``fmt_str % tuple(row)``, working on a
      deep copy of the columns converted to numeric flex arrays where that
      succeeds;
    * ``align_columns``: every stored value is replaced in place by
      ``format_value(value)`` and columns are padded to a common width;
    * otherwise: the formatted values of a row are joined with spaces.
    """
    assert self.n_rows() > 0 and self.n_columns() > 0, "keys: %s %d %d" % (
      self.keys(),
      self.n_rows(),
      self.n_columns(),
      )
    if out is None:
      out = sys.stdout
    if indent_row is None:
      indent_row = indent
    assert indent.strip() == ""
    assert indent_row.strip() == ""
    print >> out, "loop_"
    for k in self.keys():
      print >> out, indent + k
    values = self._columns.values()
    column_indices = range(len(values))
    if fmt_str is not None:
      # Pretty printing:
      #   The user is responsible for providing a valid format string.
      #   Values are not quoted - it is the user's responsibility to place
      #   appropriate quotes in the format string if a particular value may
      #   contain spaces.
      values = copy.deepcopy(values)
      for i, v in enumerate(values):
        for flex_numeric_type in (flex.int, flex.double):
          if not isinstance(v, flex_numeric_type):
            try:
              values[i] = flex_numeric_type(v)
            except ValueError:
              continue
            else:
              break
      for i in range(self.size()):
        print >> out, fmt_str % tuple([values[j][i] for j in column_indices])
    elif align_columns:
      fmt_str = []
      for _key, v in self.iteritems():
        # NOTE: this rewrites the stored column values in place
        for i_v in range(v.size()):
          v[i_v] = format_value(v[i_v])
        # exclude semicolon text fields from column width calculation
        v_ = flex.std_string(item for item in v if "\n" not in item)
        width = v_.max_element_length()
        # See if column contains only number, '.' or '?'
        # right-align numerical columns, left-align everything else
        v = v.select(~( (v == ".") | (v == "?") ))
        try:
          flex.double(v)
        except ValueError:
          width *= -1
        fmt_str.append("%%%is" %width)
      fmt_str = indent_row + "  ".join(fmt_str)
      for i in range(self.size()):
        print >> out, (fmt_str %
                       tuple([values[j][i]
                              for j in column_indices])).rstrip()
    else:
      for i in range(self.size()):
        values_to_print = [format_value(values[j][i]) for j in column_indices]
        print >> out, ' '.join([indent] + values_to_print)

  def __str__(self):
    """Return what ``show`` would write, as a string."""
    s = StringIO()
    self.show(out=s)
    return s.getvalue()

  def iterrows(self):
    """Yield each row as an ``OrderedDict`` mapping data name to value."""
    keys = self.keys()
    # fetch the columns once instead of once per cell
    columns = self.values()
    column_indices = range(len(self))
    for j in range(self.size()):
      yield OrderedDict(zip(keys, [columns[i][j] for i in column_indices]))

  def sort(self, key=None, reverse=False):
    """Re-order the columns with ``sorted`` over the (key, column) items."""
    self._columns = OrderedDict(
      sorted(self._columns.items(), key=key, reverse=reverse))

  def order(self, order):
    """Re-order the columns to follow the sequence of keys ``order``.

    Only the keys listed in ``order`` are carried over into the new column
    mapping; a key that is not a current column raises ``KeyError`` and
    leaves the loop unchanged.
    """
    reordered = OrderedDict()
    for o in order:
      reordered[o] = self._columns[o]
    self._columns = reordered

  def __eq__(self, other):
    """Two loops are equal when keys, shape and all values agree."""
    if (len(self) != len(other) or
        self.size() != other.size() or
        self.keys() != other.keys()):
      return False
    for value, other_value in zip(self.values(), other.values()):
      if (value == other_value).count(True) != len(value):
        return False
    return True
Esempio n. 5
0
class cif(DictMixin):
  """Ordered, dictionary-like container of CIF data blocks.

  Blocks are kept in self.blocks under the name they were stored with.
  Lookup, item access and deletion match names case-insensitively through
  self.keys_lower, which maps each lower-cased name to the stored name.
  """

  def __init__(self, blocks=None):
    """Create the container, optionally pre-filled from blocks.

    blocks: anything accepted by the OrderedDict constructor, or None for
    an empty container.
    """
    if blocks is not None:
      self.blocks = OrderedDict(blocks)
    else:
      self.blocks = OrderedDict()
    self.keys_lower = dict([(key.lower(), key) for key in self.blocks.keys()])

  def __setitem__(self, key, value):
    """Store the data block value under the name key.

    Raises Sorry when '_' + key does not match tag_re.
    """
    assert isinstance(value, block)
    if not re.match(tag_re, '_'+key):
      raise Sorry("%s is not a valid data block name" %key)
    self.blocks[key] = value
    self.keys_lower[key.lower()] = key

  def get(self, key, default=None):
    """Return the block stored under key (case-insensitive), else default."""
    key_lower = self.keys_lower.get(key.lower())
    if (key_lower is None):
      return default
    return self.blocks.get(key_lower, default)

  def __getitem__(self, key):
    """Return the block stored under key (case-insensitive).

    Raises KeyError when get() yields None for key.
    """
    result = self.get(key)
    if (result is None):
      raise KeyError('Unknown CIF data block name: "%s"' % key)
    return result

  def __delitem__(self, key):
    """Remove the block stored under key (case-insensitive).

    Raises KeyError when no block is registered for key.
    """
    del self.blocks[self.keys_lower[key.lower()]]
    del self.keys_lower[key.lower()]

  def keys(self):
    """Return the block names, in storage order and with stored spelling."""
    return self.blocks.keys()

  def __repr__(self):
    return repr(OrderedDict(self.iteritems()))

  def __copy__(self):
    """Return a new cif built from a shallow copy of self.blocks."""
    return cif(self.blocks.copy())

  copy = __copy__

  def __deepcopy__(self, memo):
    """Return a new cif built from a deep copy of self.blocks."""
    return cif(copy.deepcopy(self.blocks, memo))

  def deepcopy(self):
    """Convenience wrapper around copy.deepcopy(self)."""
    return copy.deepcopy(self)

  def show(self, out=None, indent="  ", indent_row=None,
           data_name_field_width=34,
           loop_format_strings=None):
    """Write every block to out as a "data_<name>" line plus the block body.

    out defaults to sys.stdout; the remaining arguments are passed on
    unchanged to the show() method of each block.
    """
    if out is None:
      out = sys.stdout
    # data_block rather than block: avoids shadowing the block class
    for name, data_block in self.items():
      print >> out, "data_%s" %name
      data_block.show(
        out=out, indent=indent, indent_row=indent_row,
        data_name_field_width=data_name_field_width,
        loop_format_strings=loop_format_strings)

  def __str__(self):
    """Return the text written by show(), captured in a StringIO."""
    s = StringIO()
    self.show(out=s)
    return s.getvalue()

  def validate(self, dictionary, show_warnings=True, error_handler=None, out=None):
    """Validate every data block against dictionary.

    For each block a fresh handler of the same class as error_handler
    (validation.ErrorHandler when none is given) is created and installed on
    dictionary with set_error_handler() before block.validate(dictionary)
    is called. A handler that recorded errors or warnings is shown on out
    (sys.stdout by default).

    Returns the handler used for the last block, or the initial handler
    when there are no blocks.
    """
    if out is None: out = sys.stdout
    from iotbx.cif import validation
    if error_handler is None:
      error_handler = validation.ErrorHandler()
    # A per-block "errors" dict used to be filled here but was never read
    # or returned, so it has been dropped.
    for data_block in self.blocks.values():
      error_handler = error_handler.__class__()
      dictionary.set_error_handler(error_handler)
      data_block.validate(dictionary)
      if error_handler.error_count or error_handler.warning_count:
        error_handler.show(show_warnings=show_warnings, out=out)
    return error_handler

  def sort(self, recursive=False, key=None, reverse=False):
    """Re-order the blocks in place by sorting their (name, block) pairs.

    key, when given, receives a (name, block) pair. With recursive=True the
    sort() method of each block is called as well, forwarding recursive and
    reverse (key is not forwarded).
    """
    self.blocks = OrderedDict(sorted(self.blocks.items(), key=key, reverse=reverse))
    if recursive:
      for b in self.blocks.values():
        b.sort(recursive=recursive, reverse=reverse)
Esempio n. 6
0
class miller_array_builder(crystal_symmetry_builder):
    """Build miller arrays from the reflection loops of a CIF block.

    Every loop holding a column whose key contains 'index_h' is processed.
    The resulting arrays are collected in self._arrays, an OrderedDict keyed
    by the column label plus an optional '_<id>' suffix for the wavelength
    id, crystal id and scale group code. Related columns are merged into a
    single array: sigma columns are attached to their data column, PHWT
    phases are transferred onto the FWT array, HL_A..HL_D columns become one
    Hendrickson-Lattman array, A/B parts become one complex array, phase_
    columns are transferred onto the matching F_ array and _plus/_minus
    pairs are concatenated into one anomalous array.
    """

    observation_types = {
        '_refln_F_squared': xray.intensity(),
        '_refln_intensity': xray.intensity(),
        '_refln_F': xray.amplitude(),
        '_refln_A': None,
    }

    def __init__(self, cif_block, base_array_info=None):
        """Extract all miller arrays from cif_block into self._arrays.

        cif_block: the CIF block to read the reflection loops from.
        base_array_info: optional array info. When given, its
          crystal_symmetry_from_file is joined into the crystal symmetry
          (force=True) and its labels, if not None, are prepended to the
          labels of every array. Defaults to
          miller.array_info(source_type="cif").

        Raises CifBuilderError when no array could be built.
        """
        crystal_symmetry_builder.__init__(self, cif_block)
        if base_array_info is not None:
            self.crystal_symmetry = self.crystal_symmetry.join_symmetry(
                other_symmetry=base_array_info.crystal_symmetry_from_file,
                force=True)
        self._arrays = OrderedDict()
        if base_array_info is None:
            base_array_info = miller.array_info(source_type="cif")
        refln_containing_loops = self.get_miller_indices_containing_loops()
        for self.indices, refln_loop in refln_containing_loops:
            self.wavelength_id_array = None
            self.crystal_id_array = None
            self.scale_group_array = None
            wavelength_ids = [None]
            crystal_ids = [None]
            scale_groups = [None]
            for key, value in refln_loop.iteritems():
                # need to get these arrays first
                if (key.endswith('wavelength_id') or key.endswith('crystal_id')
                        or key.endswith('scale_group_code')):
                    data = as_int_or_none_if_all_question_marks(
                        value, column_name=key)
                    if data is None: continue
                    counts = data.counts()
                    # a single distinct id does not partition the data
                    if len(counts) == 1: continue
                    array = miller.array(
                        miller.set(self.crystal_symmetry,
                                   self.indices).auto_anomalous(), data)
                    if key.endswith('wavelength_id'):
                        self.wavelength_id_array = array
                        wavelength_ids = counts.keys()
                    elif key.endswith('crystal_id'):
                        self.crystal_id_array = array
                        crystal_ids = counts.keys()
                    elif key.endswith('scale_group_code'):
                        self.scale_group_array = array
                        scale_groups = counts.keys()
            for label, value in sorted(refln_loop.items()):
                for w_id in wavelength_ids:
                    for crys_id in crystal_ids:
                        for scale_group in scale_groups:
                            if 'index_' in label: continue
                            key = label
                            labels = [label]
                            # the id columns themselves are never split by id
                            if (key.endswith('wavelength_id')
                                    or key.endswith('crystal_id')
                                    or key.endswith('scale_group_code')):
                                w_id = None
                                crys_id = None
                                scale_group = None
                            key_suffix = ''
                            if w_id is not None:
                                key_suffix += '_%i' % w_id
                                labels.insert(0, "wavelength_id=%i" % w_id)
                            if crys_id is not None:
                                key_suffix += '_%i' % crys_id
                                labels.insert(0, "crystal_id=%i" % crys_id)
                            if scale_group is not None:
                                key_suffix += '_%i' % scale_group
                                labels.insert(
                                    0, "scale_group_code=%i" % scale_group)
                            key += key_suffix
                            sigmas = None
                            if key in self._arrays: continue
                            array = self.flex_std_string_as_miller_array(
                                value,
                                wavelength_id=w_id,
                                crystal_id=crys_id,
                                scale_group_code=scale_group)
                            if array is None: continue
                            if '_sigma' in key:
                                # attach the sigmas to the matching data array
                                sigmas_label = label
                                key = None
                                for suffix in ('', '_meas', '_calc'):
                                    if sigmas_label.replace(
                                            '_sigma', suffix) in refln_loop:
                                        key = sigmas_label.replace(
                                            '_sigma', suffix) + key_suffix
                                        break
                                if key is None:
                                    key = sigmas_label + key_suffix
                                elif key in self._arrays and self._arrays[
                                        key].sigmas() is None:
                                    sigmas = array
                                    array = self._arrays[key]
                                    check_array_sizes(array, sigmas, key,
                                                      sigmas_label)
                                    sigmas = as_flex_double(
                                        sigmas, sigmas_label)
                                    array.set_sigmas(sigmas.data())
                                    info = array.info()
                                    array.set_info(
                                        info.customized_copy(
                                            labels=info.labels +
                                            [sigmas_label]))
                                    continue
                            elif 'PHWT' in key:
                                # transfer the PHWT phases onto the FWT array
                                phwt_label = label
                                fwt_label = label.replace('PHWT', 'FWT')
                                if fwt_label not in refln_loop: continue
                                phwt_array = array
                                if fwt_label in self._arrays:
                                    array = self._arrays[fwt_label]
                                    check_array_sizes(array, phwt_array,
                                                      fwt_label, phwt_label)
                                    phases = as_flex_double(
                                        phwt_array, phwt_label)
                                    info = array.info()
                                    array = array.phase_transfer(phases,
                                                                 deg=True)
                                    array.set_info(
                                        info.customized_copy(
                                            labels=info.labels + [phwt_label]))
                                    self._arrays[fwt_label] = array
                                    continue
                            elif 'HL_' in key:
                                # gather HL_A..HL_D into one array, stored
                                # under the HL_A key
                                hl_letter = key[key.find('HL_') + 3]
                                hl_key = 'HL_' + hl_letter
                                key = key.replace(hl_key, 'HL_A')
                                if key in self._arrays:
                                    continue  # this array is already dealt with
                                hl_labels = [
                                    label.replace(hl_key, 'HL_' + letter)
                                    for letter in 'ABCD'
                                ]
                                hl_keys = [
                                    key.replace(hl_key, 'HL_' + letter)
                                    for letter in 'ABCD'
                                ]
                                hl_values = [
                                    cif_block.get(hl_key)
                                    for hl_key in hl_labels
                                ]
                                if hl_values.count(None) == 0:
                                    selection = self.get_selection(
                                        hl_values[0],
                                        wavelength_id=w_id,
                                        crystal_id=crys_id,
                                        scale_group_code=scale_group)
                                    hl_values = [
                                        as_double_or_none_if_all_question_marks(
                                            hl.select(selection),
                                            column_name=lab)
                                        for hl, lab in zip(
                                            hl_values, hl_labels)
                                    ]
                                    array = miller.array(
                                        miller.set(
                                            self.crystal_symmetry,
                                            self.indices.select(
                                                selection)).auto_anomalous(),
                                        flex.hendrickson_lattman(*hl_values))
                                    labels = labels[:-1] + hl_labels
                            elif '.B_' in key or '_B_' in key:
                                # combine the B part with an already stored
                                # A part into one complex array
                                if '.B_' in key:
                                    key, key_b = key.replace('.B_', '.A_'), key
                                    label, label_b = label.replace(
                                        '.B_', '.A_'), label
                                elif '_B_' in key:
                                    key, key_b = key.replace('_B', '_A'), key
                                    label, label_b = label.replace('_B',
                                                                   '_A'), label
                                if key in refln_loop and key_b in refln_loop:
                                    b_part = array.data()
                                    if key in self._arrays:
                                        info = self._arrays[key].info()
                                        a_part = self._arrays[key].data()
                                        self._arrays[key] = self._arrays[
                                            key].array(
                                                data=flex.complex_double(
                                                    a_part, b_part))
                                        self._arrays[key].set_info(
                                            info.customized_copy(
                                                labels=info.labels + [key_b]))
                                        continue
                            elif ('phase_' in key and not key.endswith('_meas')
                                  and self.crystal_symmetry.space_group()
                                  is not None):
                                # transfer the phases onto the matching F_
                                # (or F_..._au) array
                                alt_key1 = label.replace('phase_', 'F_')
                                alt_key2 = alt_key1 + '_au'
                                if alt_key1 in refln_loop:
                                    phase_key = label
                                    key = alt_key1 + key_suffix
                                elif alt_key2 in refln_loop:
                                    phase_key = label
                                    key = alt_key2 + key_suffix
                                else:
                                    phase_key = None
                                if phase_key is not None:
                                    phases = array.data()
                                    if key in self._arrays:
                                        array = self._arrays[key]
                                        array = as_flex_double(array, key)
                                        check_array_sizes(
                                            array, phases, key, phase_key)
                                        info = self._arrays[key].info()
                                        self._arrays[
                                            key] = array.phase_transfer(
                                                phases, deg=True)
                                        self._arrays[key].set_info(
                                            info.customized_copy(
                                                labels=info.labels +
                                                [phase_key]))
                                    else:
                                        array = self.flex_std_string_as_miller_array(
                                            refln_loop[label],
                                            wavelength_id=w_id,
                                            crystal_id=crys_id,
                                            scale_group_code=scale_group)
                                        check_array_sizes(
                                            array, phases, key, phase_key)
                                        # NOTE(review): the return value is
                                        # discarded here, unlike the other
                                        # phase_transfer calls in this method;
                                        # confirm whether it should be
                                        # assigned to array.
                                        array.phase_transfer(phases, deg=True)
                                        labels = labels + [label, phase_key]
                            if base_array_info.labels is not None:
                                labels = base_array_info.labels + labels

                            def rstrip_substrings(string, substrings):
                                for substr in substrings:
                                    if substr == '': continue
                                    if string.endswith(substr):
                                        string = string[:-len(substr)]
                                return string

                            # determine observation type
                            stripped_key = rstrip_substrings(
                                key, [
                                    key_suffix, '_au', '_meas', '_calc',
                                    '_plus', '_minus'
                                ])
                            if (stripped_key.endswith('F_squared')
                                    or stripped_key.endswith('intensity')
                                    or stripped_key.endswith('.I')
                                    or stripped_key.endswith('_I')) and (
                                        array.is_real_array()
                                        or array.is_integer_array()):
                                array.set_observation_type_xray_intensity()
                            elif (stripped_key.endswith('F')
                                  and (array.is_real_array()
                                       or array.is_integer_array())):
                                array.set_observation_type_xray_amplitude()
                            # Both kinds of observations are checked: the
                            # condition used to test for amplitudes twice,
                            # which left integer intensity arrays unconverted.
                            if (array.is_xray_intensity_array()
                                    or array.is_xray_amplitude_array()):
                                # e.g. merge_equivalents treats integer arrays differently, so must
                                # convert integer observation arrays here to be safe
                                if isinstance(array.data(), flex.int):
                                    array = array.customized_copy(
                                        data=array.data().as_double())
                            array.set_info(
                                base_array_info.customized_copy(labels=labels))
                            self._arrays.setdefault(key, array)
        # merge matching _plus/_minus arrays into one anomalous array; iterate
        # over a copy because entries are popped from self._arrays on the way
        for key, array in self._arrays.copy().iteritems():
            if (key.endswith('_minus') or '_minus_' in key
                    or key.endswith('_plus') or '_plus_' in key):
                if '_minus' in key:
                    minus_key = key
                    plus_key = key.replace('_minus', '_plus')
                elif '_plus' in key:
                    plus_key = key
                    minus_key = key.replace('_plus', '_minus')
                if plus_key in self._arrays and minus_key in self._arrays:
                    plus_array = self._arrays.pop(plus_key)
                    minus_array = self._arrays.pop(minus_key)
                    # the minus observations are stored with negated indices
                    minus_array = minus_array.customized_copy(
                        indices=-minus_array.indices()).set_info(
                            minus_array.info())
                    array = plus_array.concatenate(
                        minus_array, assert_is_similar_symmetry=False)
                    array = array.customized_copy(anomalous_flag=True)
                    array.set_info(
                        minus_array.info().customized_copy(labels=list(
                            OrderedSet(plus_array.info().labels +
                                       minus_array.info().labels))))
                    array.set_observation_type(plus_array.observation_type())
                    self._arrays.setdefault(key, array)

        if len(self._arrays) == 0:
            raise CifBuilderError("No reflection data present in cif block")

    def get_miller_indices_containing_loops(self):
        """Return a list of (indices, loop) pairs for the reflection loops.

        A loop of self.cif_block contributes one pair when one of its keys
        contains 'index_h'; the k and l columns are looked up under the same
        key with 'index_h' replaced, and the three columns are combined with
        flex.miller_index.

        Raises CifBuilderError when the k or l column is missing, or when an
        index column cannot be converted by flex.int.
        """
        loops = []
        for loop in self.cif_block.loops.values():
            for key in loop.keys():
                if 'index_h' not in key: continue
                hkl_str = [
                    loop.get(key.replace('index_h', 'index_%s' % i))
                    for i in 'hkl'
                ]
                if hkl_str.count(None) > 0:
                    raise CifBuilderError(
                        "Miller indices missing from current CIF block (%s)" %
                        key.replace('index_h',
                                    'index_%s' % 'hkl'[hkl_str.index(None)]))
                hkl_int = []
                for i, h_str in enumerate(hkl_str):
                    try:
                        h_int = flex.int(h_str)
                    # "as" form: valid on Python 2.6+ as well as Python 3
                    except ValueError as e:
                        raise CifBuilderError(
                            "Invalid item for Miller index %s: %s" %
                            ("HKL"[i], str(e)))
                    hkl_int.append(h_int)
                indices = flex.miller_index(*hkl_int)
                loops.append((indices, loop))
                # only the first index_h column of a loop is used
                break
        return loops
Esempio n. 7
0
class miller_array_builder(crystal_symmetry_builder):
    # Changes to this class should pass regression tests:
    # cctbx_project\mmtbx\regression\tst_cif_as_mtz_wavelengths.py
    # cctbx_project\iotbx\cif\tests\tst_lex_parse_build.py
    # phenix_regression\cif_as_mtz\tst_cif_as_mtz.py

    observation_types = {
        # known types of column data to be tagged as either amplitudes or intensities as per
        # https://www.iucr.org/__data/iucr/cifdic_html/2/cif_mm.dic/index.html
        '_refln.F_squared': xray.intensity(),
        '_refln_F_squared': xray.intensity(),
        '_refln.intensity': xray.intensity(),
        '_refln.I(+)': xray.intensity(),
        '_refln.I(-)': xray.intensity(),
        '_refln.F_calc': xray.amplitude(),
        '_refln.F_meas': xray.amplitude(),
        '_refln.FP': xray.amplitude(),
        '_refln.F-obs': xray.amplitude(),
        '_refln.Fobs': xray.amplitude(),
        '_refln.F-calc': xray.amplitude(),
        '_refln.Fcalc': xray.amplitude(),
        '_refln.pdbx_F_': xray.amplitude(),
        '_refln.pdbx_I_': xray.intensity(),
        '_refln.pdbx_anom_difference': xray.amplitude(),
    }

    def guess_observationtype(self, labl):
        """Return the observation type registered for a prefix of labl.

        The entries of self.observation_types are scanned in the iteration
        order of the dictionary; the value of the first key that labl starts
        with is returned, or None when no key matches.
        """
        for prefix, obstype in self.observation_types.items():
            if labl.startswith(prefix):
                return obstype
        return None

    def __init__(self, cif_block, base_array_info=None, wavelengths=None):
        crystal_symmetry_builder.__init__(self, cif_block)
        self._arrays = OrderedDict()
        self._origarrays = OrderedDict(
        )  # used for presenting raw data tables in HKLviewer
        basearraylabels = []
        if base_array_info is not None:
            self.crystal_symmetry = self.crystal_symmetry.join_symmetry(
                other_symmetry=base_array_info.crystal_symmetry_from_file,
                force=True)
            if base_array_info.labels:
                basearraylabels = base_array_info.labels
        if (wavelengths is None):
            wavelengths = {}
        if base_array_info is None:
            base_array_info = miller.array_info(source_type="cif")
        refln_containing_loops = self.get_miller_indices_containing_loops()
        for self.indices, refln_loop in refln_containing_loops:
            self.wavelength_id_array = None
            self.crystal_id_array = None
            self.scale_group_array = None
            wavelength_ids = [None]
            crystal_ids = [None]
            scale_groups = [None]
            for key, value in six.iteritems(refln_loop):
                # Get wavelength_ids, crystal_id, scale_group_code columns for selecting data of other
                # columns in self.get_selection() used by self.flex_std_string_as_miller_array()
                if (key.endswith('wavelength_id') or key.endswith('crystal_id')
                        or key.endswith('scale_group_code')):
                    data = as_int_or_none_if_all_question_marks(
                        value, column_name=key)
                    if data is None:
                        continue
                    counts = data.counts()
                    if key.endswith('wavelength_id'):
                        wavelength_ids = list(counts.keys())
                    if len(counts) == 1: continue
                    array = miller.array(
                        miller.set(self.crystal_symmetry,
                                   self.indices).auto_anomalous(), data)
                    if key.endswith('wavelength_id'):
                        self.wavelength_id_array = array
                        wavelength_ids = list(counts.keys())
                    elif key.endswith('crystal_id'):
                        self.crystal_id_array = array
                        crystal_ids = list(counts.keys())
                    elif key.endswith('scale_group_code'):
                        self.scale_group_array = array
                        scale_groups = list(counts.keys())
            labelsuffix = []
            wavelbl = []
            cryslbl = []
            scalegrplbl = []
            self._origarrays["HKLs"] = self.indices
            alllabels = list(sorted(refln_loop.keys()))
            remaininglabls = alllabels[:]  # deep copy the list
            # Parse labels matching cif column conventions
            # https://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/refln.html
            # and extract groups of labels or just single columns.
            # Groups corresponds to the map coefficients, phase and amplitudes,
            # amplitudes or intensities with sigmas and hendrickson-lattman columns.
            phaseamplabls, remaininglabls = self.get_phase_amplitude_labels(
                remaininglabls)
            mapcoefflabls, remaininglabls = self.get_mapcoefficient_labels(
                remaininglabls)
            HLcoefflabls, remaininglabls = self.get_HL_labels(remaininglabls)
            data_sig_obstype_labls, remaininglabls = self.get_FSigF_ISigI_labels(
                remaininglabls)
            for w_id in wavelength_ids:
                for crys_id in crystal_ids:
                    for scale_group in scale_groups:
                        # If reflection data files contain more than one crystal, wavelength or scalegroup
                        # then add their id(s) as a suffix to data labels computed below. Needed for avoiding
                        # ambiguity, but omitted when not needed to keep labels human readable!
                        if (len(wavelength_ids) > 1
                                or len(wavelengths) > 1) and w_id is not None:
                            wavelbl = ["wavelength_id=%i" % w_id]
                        if len(crystal_ids) > 1 and crys_id is not None:
                            cryslbl = ["crystal_id=%i" % crys_id]
                        if len(scale_groups) > 1 and scale_group is not None:
                            scalegrplbl = ["scale_group_code=%i" % scale_group]
                        labelsuffix = scalegrplbl + cryslbl + wavelbl
                        jlablsufx = ""
                        if len(labelsuffix):
                            jlablsufx = "," + ",".join(labelsuffix)
                        for mapcoefflabl in mapcoefflabls:
                            A_array = refln_loop[mapcoefflabl[0]]
                            B_array = refln_loop[mapcoefflabl[1]]
                            # deselect any ? marks in the two arrays, assuming both A and B have the same ? marks
                            selection = self.get_selection(
                                A_array,
                                wavelength_id=w_id,
                                crystal_id=crys_id,
                                scale_group_code=scale_group)
                            A_array = A_array.select(selection)
                            B_array = B_array.select(selection)
                            # form the miller array with map coefficients
                            data = flex.complex_double(flex.double(A_array),
                                                       flex.double(B_array))
                            millarr = miller.array(
                                miller.set(self.crystal_symmetry,
                                           self.indices.select(
                                               selection)).auto_anomalous(),
                                data)
                            # millarr will be None for column data not matching w_id,crys_id,scale_group values
                            if millarr is None: continue
                            labl = basearraylabels + mapcoefflabl + labelsuffix
                            millarr.set_info(
                                base_array_info.customized_copy(
                                    labels=labl,
                                    wavelength=wavelengths.get(w_id, None)))
                            self._arrays[mapcoefflabl[0] + jlablsufx] = millarr
                        for phaseamplabl in phaseamplabls:
                            amplitudestrarray = refln_loop[phaseamplabl[0]]
                            phasestrarray = refln_loop[phaseamplabl[1]]
                            millarr = self.flex_std_string_as_miller_array(
                                amplitudestrarray,
                                wavelength_id=w_id,
                                crystal_id=crys_id,
                                scale_group_code=scale_group)
                            phasesmillarr = self.flex_std_string_as_miller_array(
                                phasestrarray,
                                wavelength_id=w_id,
                                crystal_id=crys_id,
                                scale_group_code=scale_group)
                            # millarr will be None for column data not matching w_id,crys_id,scale_group values
                            if millarr is None or phasesmillarr is None:
                                continue
                            phases = as_flex_double(phasesmillarr,
                                                    phaseamplabl[1])
                            millarr = millarr.phase_transfer(phases, deg=True)
                            labl = basearraylabels + phaseamplabl + labelsuffix
                            millarr.set_info(
                                base_array_info.customized_copy(
                                    labels=labl,
                                    wavelength=wavelengths.get(w_id, None)))
                            self._arrays[phaseamplabl[0] + jlablsufx] = millarr
                        for datlabl, siglabl, otype in data_sig_obstype_labls:
                            datastrarray = refln_loop[datlabl]
                            millarr = self.flex_std_string_as_miller_array(
                                datastrarray,
                                wavelength_id=w_id,
                                crystal_id=crys_id,
                                scale_group_code=scale_group)
                            # millarr will be None for column data not matching w_id,crys_id,scale_group values
                            if millarr is None: continue
                            millarr = as_flex_double(millarr, datlabl)
                            datsiglabl = [datlabl]
                            if siglabl:
                                sigmasstrarray = refln_loop[siglabl]
                                sigmas = self.flex_std_string_as_miller_array(
                                    sigmasstrarray,
                                    wavelength_id=w_id,
                                    crystal_id=crys_id,
                                    scale_group_code=scale_group)
                                sigmas = as_flex_double(sigmas, siglabl)
                                millarr.set_sigmas(sigmas.data())
                                datsiglabl = [datlabl, siglabl]
                            datsiglabl = basearraylabels + datsiglabl + labelsuffix
                            millarr.set_info(
                                base_array_info.customized_copy(
                                    labels=datsiglabl,
                                    wavelength=wavelengths.get(w_id, None)))
                            if otype is not None:
                                millarr.set_observation_type(otype)
                            self._arrays[datlabl + jlablsufx] = millarr
                        for hl_labels in HLcoefflabls:
                            hl_values = [
                                cif_block.get(hl_key) for hl_key in hl_labels
                            ]
                            if hl_values.count(None) == 0:
                                selection = self.get_selection(
                                    hl_values[0],
                                    wavelength_id=w_id,
                                    crystal_id=crys_id,
                                    scale_group_code=scale_group)
                                hl_values = [
                                    as_double_or_none_if_all_question_marks(
                                        hl.select(selection), column_name=lab)
                                    for hl, lab in zip(hl_values, hl_labels)
                                ]
                                # hl_values will be None for column data not matching w_id,crys_id,scale_group values
                                if hl_values == [None, None, None, None]:
                                    continue
                                millarr = miller.array(
                                    miller.set(
                                        self.crystal_symmetry,
                                        self.indices.select(
                                            selection)).auto_anomalous(),
                                    flex.hendrickson_lattman(*hl_values))
                                hlabels = basearraylabels + hl_labels + labelsuffix
                                millarr.set_info(
                                    base_array_info.customized_copy(
                                        labels=hlabels,
                                        wavelength=wavelengths.get(w_id,
                                                                   None)))
                                self._arrays[hl_labels[0] +
                                             jlablsufx] = millarr
                        # pick up remaining columns if any that weren't identified above
                        for label in alllabels:
                            if "index_" in label:
                                continue
                            datastrarray = refln_loop[label]
                            if label in remaininglabls:
                                labels = basearraylabels + [label
                                                            ] + labelsuffix
                                lablsufx = jlablsufx
                                millarr = self.flex_std_string_as_miller_array(
                                    datastrarray,
                                    wavelength_id=w_id,
                                    crystal_id=crys_id,
                                    scale_group_code=scale_group)
                                # millarr will be None for column data not matching w_id,crys_id,scale_group values
                                if (label.endswith(
                                        'wavelength_id'
                                ) or label.endswith(
                                        'crystal_id'
                                ) or  # get full array if any of these labels, not just subsets
                                        label.endswith('scale_group_code')):
                                    millarr = self.flex_std_string_as_miller_array(
                                        datastrarray,
                                        wavelength_id=None,
                                        crystal_id=None,
                                        scale_group_code=None)
                                    lablsufx = ""
                                    labels = basearraylabels + [label]
                                if millarr is None: continue
                                otype = self.guess_observationtype(label)
                                if otype is not None:
                                    millarr.set_observation_type(otype)
                                millarr.set_info(
                                    base_array_info.customized_copy(
                                        labels=labels,
                                        wavelength=wavelengths.get(w_id,
                                                                   None)))
                                self._arrays[label + lablsufx] = millarr
                            origarr = self.flex_std_string_as_miller_array(
                                datastrarray,
                                wavelength_id=w_id,
                                crystal_id=crys_id,
                                scale_group_code=scale_group)
                            newlabel = label.replace("_refln.", "")
                            newlabel2 = newlabel.replace("_refln_", "")
                            if origarr:  # want only genuine miller arrays
                                self._origarrays[newlabel2 +
                                                 jlablsufx] = origarr.data()
        # Convert any groups of I+,I-,SigI+,SigI- (or amplitudes) arrays into anomalous arrays
        # i.e. both friedel mates in the same array
        for key, array in six.iteritems(self._arrays.copy()):
            plus_key = ""
            if '_minus' in key:
                minus_key = key
                plus_key = key.replace('_minus', '_plus')
            elif '-' in key:
                minus_key = key
                plus_key = key.replace('-', '+')
            elif '_plus' in key:
                plus_key = key
                minus_key = key.replace('_plus', '_minus')
            elif '+' in key:
                plus_key = key
                minus_key = key.replace('+', '-')
            if plus_key in self._arrays and minus_key in self._arrays:
                plus_array = self._arrays.pop(plus_key)
                minus_array = self._arrays.pop(minus_key)
                minus_array = minus_array.customized_copy(
                    indices=-minus_array.indices()).set_info(
                        minus_array.info())
                array = plus_array.concatenate(
                    minus_array, assert_is_similar_symmetry=False)
                array = array.customized_copy(anomalous_flag=True)
                array.set_info(minus_array.info().customized_copy(labels=list(
                    OrderedSet(plus_array.info().labels +
                               minus_array.info().labels))))
                array.set_observation_type(plus_array.observation_type())
                self._arrays.setdefault(key, array)
        if len(self._arrays) == 0:
            raise CifBuilderError("No reflection data present in cif block")
        # Sort the ordered dictionary to resemble the order of columns in the cif file
        # This is to avoid any F_meas arrays accidentally being put adjacent to
        # pdbx_anom_difference arrays in the self._arrays OrderedDict. Otherwise these
        # arrays may unintentionally be combined into a reconstructed anomalous amplitude
        # array when saving as an mtz file due to a problem in the iotbx/mtz module.
        # See http://phenix-online.org/pipermail/cctbxbb/2021-March/002289.html
        arrlstord = []
        arrlst = list(self._arrays)
        for arr in arrlst:
            for i, k in enumerate(refln_loop.keys()):
                if arr.split(",")[0] == k:
                    arrlstord.append((arr, i))
        # arrlstord must have the same keys as in the self._arrays dictionary
        assert sorted(arrlst) == sorted([e[0] for e in arrlstord])
        sortarrlst = sorted(arrlstord, key=lambda arrord: arrord[1])
        self._ordarrays = OrderedDict()
        for sortkey, i in sortarrlst:
            self._ordarrays.setdefault(sortkey, self._arrays[sortkey])
        self._arrays = self._ordarrays

    def get_HL_labels(self, keys):
        """Group Hendrickson-Lattman coefficient labels into quadruples.

        Hendrickson-Lattman labels could look like 'HLAM', 'HLBM', 'HLCM',
        'HLDM' or like 'HLanomA', 'HLanomB', 'HLanomC', 'HLanomD'. A regular
        expression splits every label around its A/B/C/D letter; labels that
        share the same text before and after that letter belong together.

        :param keys: iterable of column labels (strings)
        :returns: tuple (HLquads, remainingkeys). HLquads is a list of
          4-element label lists, one per complete group, ordered by the first
          appearance of the group in keys. remainingkeys holds every label of
          keys that is not part of a complete group, in input order.
        """
        lstkeys = list(keys)  # cast into list if not a list
        allmatches = re.findall(r"(\S*(HL(\S*)[abcdABCD](\S*)))",
                                " ".join(lstkeys))
        # Collect labels per (infix, suffix) tag. The tags are remembered in
        # first-seen order so the result does not depend on set/hash ordering.
        tag_order = []
        labels_by_tag = {}
        for full_label, _, infix, suffix in allmatches:
            tag = (infix, suffix)
            if tag not in labels_by_tag:
                labels_by_tag[tag] = []
                tag_order.append(tag)
            labels_by_tag[tag].append(full_label)
        # Only complete A, B, C, D sets qualify as HL coefficients
        HLquads = [
            labels_by_tag[tag] for tag in tag_order
            if len(labels_by_tag[tag]) == 4
        ]
        usedkeys = set(label for quad in HLquads for label in quad)
        remainingkeys = [key for key in lstkeys if key not in usedkeys]
        return HLquads, remainingkeys

    def get_mapcoefficient_labels(self, keys):
        """Extract map coefficient label pairs from a list of cif column labels.

        An '_refln.A_<tag>' (or '_refln_A_<tag>') label is paired with the
        '_refln.B_<tag>' label carrying the same tag, e.g.
        ( _refln.A_calc_au _refln.B_calc_au ) , ( _refln.A_calc _refln.B_calc )

        :param keys: iterable of column labels (strings)
        :returns: tuple (mapcoefflabels, remainingkeys). mapcoefflabels is a
          list of [A_label, B_label] pairs ordered by the position of the B
          label in keys; remainingkeys holds the unpaired labels in input order.
        """
        lstkeys = list(keys)  # cast into list if not a list
        remainingkeys = lstkeys[:]  # shallow copy; paired labels get removed
        # Map each expected B label to the A label it belongs to. Every key is
        # matched on its own: searching the space-joined label string let the
        # match swallow the separating blank, so A labels that were not the
        # very first key never found their partner.
        A_label_of = {}
        for key in lstkeys:
            m = re.match(r"(_refln[\._])A_(\S*)$", key)
            if m is not None:
                A_label_of.setdefault(m.group(1) + "B_" + m.group(2), key)
        mapcoefflabels = []
        for label in lstkeys:
            Alabel = A_label_of.get(label)
            if Alabel is None:
                continue
            # guard against duplicated labels consuming a column twice
            if Alabel in remainingkeys and label in remainingkeys:
                mapcoefflabels.append([Alabel, label])
                remainingkeys.remove(Alabel)
                remainingkeys.remove(label)
        return mapcoefflabels, remainingkeys

    def get_phase_amplitude_labels(self, keys):
        """Extract amplitude/phase label pairs from a list of cif column labels.

        e.g. ( _refln.F_calc _refln.phase_calc ) , ( _refln.FC_ALL _refln.PHIC_ALL ),
        ( _refln.FWT _refln.PHWT )

        Four passes are made, each looking only at phase labels that are still
        unassigned: 'PH<tag>' labels, 'PHI<tag>' labels, 'PH...WT' labels paired
        with '...FWT' amplitudes, and finally '<prefix>phase<tag>' labels.

        :param keys: iterable of column labels (strings)
        :returns: tuple (phase_amplitudelabels, remainingkeys).
          phase_amplitudelabels is a list of [amplitude_label, phase_label]
          pairs; remainingkeys holds the unpaired labels in input order.
        """
        lstkeys = list(keys)  # cast into list if not a list
        remainingkeys = lstkeys[:]  # shallow copy; paired labels get removed
        phase_amplitudelabels = []

        def pair_if_unassigned(amplitudelabel, phaselabel):
            # Record a pair only while both columns are still unassigned. The
            # loops below walk over all original labels, so without this check a
            # column matched by two rules would be removed twice (ValueError).
            if amplitudelabel == phaselabel:
                return
            if amplitudelabel not in remainingkeys:
                return
            if phaselabel not in remainingkeys:
                return
            phase_amplitudelabels.append([amplitudelabel, phaselabel])
            remainingkeys.remove(amplitudelabel)
            remainingkeys.remove(phaselabel)

        # 'PH<tag>' phases; the tag must not start with 'I'
        # [('_refln.PHWT', '_refln.PH', 'WT'), ('_refln.PHDELWT', '_refln.PH', 'DELWT')]
        PHmatches = re.findall(r"((\S*PH)([^I]\S*))", " ".join(lstkeys))
        for label in lstkeys:
            for m in PHmatches:
                PFlabel = m[1].replace("PH", "F") + m[2]
                Flabel = m[1].replace("PH", "") + m[2]
                if Flabel == label or PFlabel == label:
                    pair_if_unassigned(label, m[0])
        # 'PHI<tag>' phases among the labels still left
        # [('_refln.PHIC', '_refln.PHI', 'C'), ('_refln.PHIC_ALL', '_refln.PHI', 'C_ALL')]
        PHImatches = re.findall(r"((\S*PHI)(\S*))", " ".join(remainingkeys))
        for label in lstkeys:
            for m in PHImatches:
                PFlabel = m[1].replace("PHI", "F") + m[2]
                Flabel = m[1].replace("PHI", "") + m[2]
                if Flabel == label or PFlabel == label:
                    pair_if_unassigned(label, m[0])
        # 'PH...WT' phases whose amplitude reads '...FWT'
        # [('_refln.PHDELWT', '_refln.PH', '_refln.', 'DELWT', 'WT')]
        PHDELmatches = re.findall(r"(((\S*)PH)([^I]\S*(WT)))",
                                  " ".join(remainingkeys))
        for label in lstkeys:
            for m in PHDELmatches:
                Flabel = m[2] + m[3].replace("WT", "FWT")
                if Flabel == label:
                    pair_if_unassigned(label, m[0])
        # '<prefix>phase<tag>' phases
        # [('_refln.phase_calc', '_refln.', '_calc')]
        phase_matches = re.findall(r"((\S*[\._])phase(\S*))",
                                   " ".join(remainingkeys))
        for label in lstkeys:
            for m in phase_matches:
                Flabl = m[1] + m[2]
                Flabel = m[1] + "F" + m[2]
                # Substring tests: Flabel also matches its '_au' variant, as in
                # the case of _refln.F_calc_au and _refln.phase_calc
                if Flabl in label or Flabel in label:
                    if (Flabel + "_sigma_au") in remainingkeys or (
                            Flabel + "_sigma") in remainingkeys:
                        continue  # give priority to F_meas, F_meas_sigma or  F_meas_au, F_meas_sigma_au
                    pair_if_unassigned(label, m[0])
        return phase_amplitudelabels, remainingkeys

    def get_FSigF_ISigI_labels(self, keys):
        """Extract data/sigma label pairs from a list of cif column labels.

        Covers amplitudes with sigmas and intensities with sigmas, e.g.
        ( _refln.F_meas_sigma_au _refln.F_meas_au ),
        ( _refln.intensity_sigma _refln.intensity ),
        ( _refln.pdbx_I_plus_sigma _refln.pdbx_I_plus )

        :param keys: iterable of column labels (strings)
        :returns: tuple (labelpairs, remainingkeys). Each entry of labelpairs
          is [data_label, sigma_label_or_None, observation_type] where the
          observation type is whatever self.guess_observationtype() returns
          for the data label; remainingkeys holds the unpaired labels.
        """
        lstkeys = list(keys)  # cast into list if not a list
        remainingkeys = lstkeys[:]  # shallow copy; paired labels get removed
        labelpairs = []
        # Two passes with the same pairing rule: dropping the sigma token from
        # the sigma label must give exactly the data label.
        #  1. 'SIG' prefixed names, catches label pairs like F(+),SIGF(+)
        #  2. '_sigma' infixed names, e.g.
        #     ('_refln.F_meas_sigma_au', '_refln.F_meas', '_au'),
        #     ('_refln.intensity_sigma', '_refln.intensity', ''),
        #     ('_refln.pdbx_I_plus_sigma', '_refln.pdbx_I_plus', '')
        sigma_patterns = (
            r"((\S*[\._])SIG(\S*))",
            r"((\S*)_sigma(_*\S*))",
        )
        for sigma_pattern in sigma_patterns:
            # each pass only sees sigma labels that are still unassigned
            sigma_hits = re.findall(sigma_pattern, " ".join(remainingkeys))
            for label in lstkeys:
                for sigma_label, prefix, suffix in sigma_hits:
                    if prefix + suffix != label:
                        continue
                    labelpairs.append([
                        label, sigma_label,
                        self.guess_observationtype(label)
                    ])
                    remainingkeys.remove(label)
                    remainingkeys.remove(sigma_label)
        # catch generic meas and sigma labels
        leftover = " ".join(remainingkeys)
        data_hits = re.findall(r"((\S*)_meas(\S*))", leftover)
        data_hits.extend(re.findall(r"((\S*)_calc(\S*))", leftover))
        generic_sigma_hits = re.findall(r"((\S*)_sigma(\S*))", leftover)
        for data_label, data_prefix, data_suffix in data_hits:
            for sigma_label, sigma_prefix, sigma_suffix in generic_sigma_hits:
                if data_prefix != sigma_prefix or data_suffix != sigma_suffix:
                    continue
                remainingkeys.remove(data_label)
                # The sigma column may already be taken, in case of say
                # F_squared_calc, F_squared_meas, F_squared_sigma all being
                # present; the data column is then reported without sigmas.
                partner = None
                if sigma_label in remainingkeys:
                    remainingkeys.remove(sigma_label)
                    partner = sigma_label
                labelpairs.append([
                    data_label, partner,
                    self.guess_observationtype(data_label)
                ])
        return labelpairs, remainingkeys

    def get_miller_indices_containing_loops(self):
        """Collect the loops of self.cif_block that carry Miller indices.

        A loop qualifies when one of its keys contains 'index_h'. The keys for
        k and l are derived from that key by substituting 'index_k' and
        'index_l'.

        :returns: list of (flex.miller_index, loop) tuples, one per
          qualifying loop
        :raises CifBuilderError: if the k or l column is missing from a
          qualifying loop, or if an index column cannot be converted to
          integers
        """
        found = []
        for loop in self.cif_block.loops.values():
            # only the first key mentioning 'index_h' is used per loop
            h_key = next((k for k in loop.keys() if 'index_h' in k), None)
            if h_key is None:
                continue
            axis_keys = [
                h_key.replace('index_h', 'index_%s' % axis) for axis in 'hkl'
            ]
            columns = [loop.get(axis_key) for axis_key in axis_keys]
            if columns.count(None) > 0:
                # report the first index column that is absent
                raise CifBuilderError(
                    "Miller indices missing from current CIF block (%s)" %
                    axis_keys[columns.index(None)])
            as_ints = []
            for axis_name, column in zip("HKL", columns):
                try:
                    as_ints.append(flex.int(column))
                except ValueError as e:
                    raise CifBuilderError(
                        "Invalid item for Miller index %s: %s" %
                        (axis_name, str(e)))
            found.append((flex.miller_index(*as_ints), loop))
        return found

    def get_selection(self,
                      value,
                      wavelength_id=None,
                      crystal_id=None,
                      scale_group_code=None):
        """Build a boolean selection of the usable entries of a data column.

        Entries equal to '.' or '?' are deselected. For each of
        wavelength_id, crystal_id and scale_group_code that is given, and
        whose id array on self is not None, the selection is further
        restricted to rows where that id array equals the requested value.

        :param value: column data supporting element-wise comparison with a
          string (presumably a flex.std_string - confirm against callers)
        :returns: the boolean selection array
        """
        is_placeholder = (value == '.') | (value == '?')
        selection = ~is_placeholder
        id_filters = (
            (self.wavelength_id_array, wavelength_id),
            (self.crystal_id_array, crystal_id),
            (self.scale_group_array, scale_group_code),
        )
        for id_array, wanted in id_filters:
            # a filter applies only when both the id column and the id exist
            if id_array is None or wanted is None:
                continue
            selection &= (id_array.data() == wanted)
        return selection

    def flex_std_string_as_miller_array(self,
                                        value,
                                        wavelength_id=None,
                                        crystal_id=None,
                                        scale_group_code=None):
        """Turn a column of strings into a miller array.

        Only the data and indices matching the wavelength_id, crystal_id and
        scale_group_code submitted are used, or the full column if these are
        None. The selected data is converted with flex.int, and with
        flex.double if that raises ValueError. If neither conversion works the
        unconverted, unselected column (including '.' and '?' entries) is
        used together with all indices.

        :returns: a miller.array, or None if the resulting data is empty
        """
        selection = self.get_selection(value,
                                       wavelength_id=wavelength_id,
                                       crystal_id=crystal_id,
                                       scale_group_code=scale_group_code)
        data = value.select(selection)
        for to_numeric in (flex.int, flex.double):
            try:
                data = to_numeric(data)
                indices = self.indices.select(selection)
            except ValueError:
                continue
            break
        else:
            # if flex.std_string return all values including '.' and '?'
            data = value
            indices = self.indices
        if data.size() == 0:
            return None
        miller_set = miller.set(self.crystal_symmetry,
                                indices).auto_anomalous()
        return miller.array(miller_set, data)

    def arrays(self):
        """Return the dictionary of miller arrays held in self._arrays.

        The dictionary itself is returned, not a copy.
        """
        return self._arrays

    def origarrays(self):
        """Return the dictionary of raw column data held in self._origarrays.

        The values are the data of the columns found in the cif file, cast
        into numeric flex arrays where that conversion succeeded, or left as
        string arrays as a fallback. The dictionary itself is returned, not a
        copy.
        """
        return self._origarrays