def fps_to_arena(fps_reader, metadata=None, reorder=True, alignment=None):
    """Load every (id, fingerprint) record from `fps_reader` into a new FingerprintArena.

    Each fingerprint is written into a fixed-size slot of `storage_size`
    bytes: the fingerprint byte length rounded up to a multiple of
    `alignment`, with NUL bytes filling the remainder of the slot.

    :param fps_reader: iterable of (id, fingerprint) pairs. Its `metadata`
        attribute is used when `metadata` is None.
    :param metadata: object with `num_bits` and `num_bytes` attributes. At
        least one of the two must be set; the missing one is derived from
        the other (`num_bytes * 8`, or `num_bits` rounded up to whole bytes).
    :param reorder: True to build the arena through
        `_chemfp.make_sorted_aligned_arena` (with a popcount table), False
        to keep the fingerprints in input order. Input order is also kept
        when `metadata.num_bits` is not set.
    :param alignment: byte alignment of the arena. Defaults to
        `get_optimal_alignment(num_bits)`.
    :returns: a new FingerprintArena holding all of the records
    :raises ValueError: if the metadata has neither `num_bits` nor
        `num_bytes`, or if a fingerprint does not have `num_bytes` bytes
    """
    if metadata is None:
        metadata = fps_reader.metadata

    num_bits = metadata.num_bits
    num_bytes = metadata.num_bytes
    if not num_bits:
        if num_bytes is None:
            raise ValueError("metadata must contain at least one of num_bits or num_bytes")
        num_bits = num_bytes * 8
    if num_bytes is None:
        # Only num_bits was supplied. Without this the storage size below
        # would be None and the modulo would fail with a TypeError.
        num_bytes = (num_bits + 7) // 8

    if alignment is None:
        alignment = get_optimal_alignment(num_bits)

    # Round each fingerprint slot up to a multiple of the alignment.
    storage_size = num_bytes
    remainder = storage_size % alignment
    if remainder:
        record_padding = "\0" * (alignment - remainder)
        storage_size += len(record_padding)
    else:
        record_padding = None

    ids = []
    unsorted_fps = StringIO()
    try:
        for (fp_id, fp) in fps_reader:
            if len(fp) != num_bytes:
                raise ValueError("Fingerprint for id %r has %d bytes while the metadata says it should have %d"
                                 % (fp_id, len(fp), num_bytes))
            unsorted_fps.write(fp)
            if record_padding:
                unsorted_fps.write(record_padding)
            ids.append(fp_id)
        unsorted_arena = unsorted_fps.getvalue()
    finally:
        # Release the buffer even if a record was rejected part-way through.
        unsorted_fps.close()
    unsorted_fps = None

    # The popcount table below is sized from metadata.num_bits, so an arena
    # whose metadata lacks num_bits is left in input order.
    if not reorder or not metadata.num_bits:
        start_padding, end_padding, unsorted_arena = _chemfp.make_unsorted_aligned_arena(
            unsorted_arena, alignment)
        return FingerprintArena(metadata, alignment, start_padding, end_padding, storage_size,
                                unsorted_arena, "", ids)

    # Reorder. `ordering` and `popcounts` are passed in as output buffers;
    # afterwards ordering[k].index is the input position of the k-th record.
    ordering = (ChemFPOrderedPopcount*len(ids))()
    popcounts = array.array("i", (0,)*(metadata.num_bits+2))

    start_padding, end_padding, sorted_arena = _chemfp.make_sorted_aligned_arena(
        num_bits, storage_size, unsorted_arena, len(ids),
        ordering, popcounts, alignment)

    new_ids = [ids[item.index] for item in ordering]
    return FingerprintArena(metadata, alignment,
                            start_padding, end_padding, storage_size,
                            sorted_arena, popcounts.tostring(), new_ids)
def copy(self, indices=None, reorder=None):
    """Build a new arena from all, or a selection, of this arena's fingerprints

    Without `indices` the whole arena is copied. When this arena starts
    at record 0, spans its entire data block, and already has the
    requested ordering, the new arena shares the data block, the
    popcount indices and the id list with this one (a shallow copy).
    Otherwise (for example when this arena is a slice of a larger
    arena) the fingerprints are copied into a newly aligned block.

    With `indices`, the listed records are copied into a new block.
    Duplicate indices are allowed, though discouraged.

    Ordering rules:

      * `indices` given, `reorder` None or True: the new arena is
        reordered by popcount, which improves search performance.
      * `indices` given, `reorder=False`: the records stay in the order
        given by `indices`.
      * no `indices`, `reorder=None`: keep the kind of ordering this
        arena has (reorder only if it has popcount indices).
      * no `indices`, `reorder=True`: always reorder.
      * no `indices`, `reorder=False`: leave the records in their
        current order.

    :param indices: indices of the records to copy into the new arena
    :type indices: iterable containing integers, or None
    :param reorder: describes how to order the fingerprints
    :type reorder: True to reorder, False to leave in input order, None for default action
    :raises IndexError: if an index is rejected by the arena's range check
    """
    if reorder is None:
        # With explicit indices the default is the fast (popcount sorted)
        # layout; pass reorder=False to preserve the index order. A plain
        # copy only reorders if this arena has popcount indices.
        reorder = True if indices is not None else (self.popcount_indices != "")

    if indices is None:
        # Trivial case: this arena spans the entire data block and already
        # has an acceptable ordering, so everything can be shared.
        spans_whole_block = (
            self.start == 0
            and len(self.arena) == (self.start_padding
                                    + self.end*self.storage_size
                                    + self.end_padding))
        if spans_whole_block and (self.popcount_indices or not reorder):
            return FingerprintArena(self.metadata, self.alignment,
                                    self.start_padding, self.end_padding,
                                    self.storage_size, self.arena,
                                    self.popcount_indices, self.arena_ids,
                                    start = 0, end = self.end,
                                    id_lookup = self._id_lookup)

        # Otherwise pull out just this arena's records (it may be a
        # sub-arena of a larger block).
        first_byte = self.start_padding + self.start*self.storage_size
        last_byte = self.start_padding + self.end*self.storage_size
        fp_block = self.arena[first_byte:last_byte]
        current_ids = self.ids

        if not (reorder or self.popcount_indices):
            # Unordered now and no ordering requested: only realign.
            start_padding, end_padding, aligned_block = (
                _chemfp.make_unsorted_aligned_arena(fp_block, self.alignment))
            return FingerprintArena(self.metadata, self.alignment,
                                    start_padding, end_padding,
                                    self.storage_size, aligned_block, "",
                                    current_ids, id_lookup = self._id_lookup)

        # Either the records are already sorted or they should become
        # sorted. make_sorted_aligned_arena is expected to detect the
        # already-sorted case and keep the block, sorting otherwise.
        num_bits = self.metadata.num_bits
        ordering = (ChemFPOrderedPopcount*len(current_ids))()
        popcounts = array.array("i", (0,)*(num_bits+2))
        start_padding, end_padding, aligned_block = _chemfp.make_sorted_aligned_arena(
            num_bits, self.storage_size, fp_block, len(current_ids),
            ordering, popcounts, self.alignment)
        return FingerprintArena(self.metadata, self.alignment,
                                start_padding, end_padding, self.storage_size,
                                aligned_block, popcounts.tostring(),
                                [current_ids[entry.index] for entry in ordering])

    # From here on: build a new arena from the selected records only.
    source_block = self.arena
    record_size = self.storage_size
    leading_pad = self.start_padding
    source_ids = self.arena_ids

    # Validate every index; the range check also turns negative indices
    # into positive ones.
    # NOTE(review): self.start is never added to the checked indices on this
    # path, so they are used directly as positions in the full data block
    # and in arena_ids. That is only right if _range_check already yields
    # absolute positions -- confirm for sub-arenas.
    range_check = self._range_check
    positions = []
    try:
        for i in indices:
            positions.append(range_check[i])
    except IndexError:
        raise IndexError("arena fingerprint index %d is out of range" % (i,))

    if reorder and self.popcount_indices:
        # Visiting an already-sorted arena in index order should hand
        # make_sorted_aligned_arena records that are already in sorted
        # order, so it may not need to resort them.
        positions.sort()

    # Gather the selected fingerprints and their ids.
    chunks = [source_block[leading_pad + pos*record_size:
                           leading_pad + pos*record_size + record_size]
              for pos in positions]
    new_ids = [source_ids[pos] for pos in positions]
    packed = "".join(chunks)
    del chunks  # regain some memory

    if not reorder:
        # Caller wants the index order preserved: only align.
        start_padding, end_padding, aligned_block = _chemfp.make_unsorted_aligned_arena(
            packed, self.alignment)
        return FingerprintArena(self.metadata, self.alignment,
                                start_padding, end_padding, record_size,
                                aligned_block, "", new_ids)

    # Otherwise reorder and align, collecting the popcount information.
    num_bits = self.metadata.num_bits
    ordering = (ChemFPOrderedPopcount*len(new_ids))()
    popcounts = array.array("i", (0,)*(num_bits+2))
    start_padding, end_padding, aligned_block = _chemfp.make_sorted_aligned_arena(
        num_bits, record_size, packed, len(new_ids),
        ordering, popcounts, self.alignment)
    return FingerprintArena(self.metadata, self.alignment,
                            start_padding, end_padding, record_size,
                            aligned_block, popcounts.tostring(),
                            [new_ids[entry.index] for entry in ordering])