def _add_feff_to_file(self, file, crystal_structure):
    """Append the FEFF input sections (DEBYE, POTENTIALS, ATOMS) to *file*.

    The scattering atom is taken from the optical-spectrum options
    (1-based in the options, converted to 0-based here); all atoms within
    the configured sphere radius are written with a potential index that
    groups them by atomic number Z.
    """
    opts = self.optical_spectrum_options
    file.write('DEBYE {0} {1} \n\n'.format(opts['temperature'], opts['debye temperature']))

    center = int(opts['atom']) - 1
    radius = float(opts['sphere radius'])
    abs_coords = crystal_structure.calc_absolute_coordinates()
    center_z = int(abs_coords[center, 3])

    neighborhood = self._find_atoms_within_sphere(crystal_structure, radius, center)
    # Distinct atomic numbers in ascending order; index+1 becomes the
    # FEFF potential number (0 is reserved for the scattering atom).
    species = SortedSet(neighborhood[:, 3].astype('int'))

    file.write('POTENTIALS\n')
    file.write(' 0 {}\n'.format(center_z))
    for rank, z_value in enumerate(species, start=1):
        file.write(' {0} {1}\n'.format(rank, z_value))

    file.write('\nATOMS\n')
    center_pos = abs_coords[center, :3]
    for entry in neighborhood:
        position = entry[:3]
        z_value = int(entry[3])
        # The scattering atom itself (coincident position) gets potential 0.
        is_center = np.linalg.norm(position - center_pos) < 1e-6
        pot = 0 if is_center else species.index(z_value) + 1
        row = [position[0] * sst.bohr, position[1] * sst.bohr, position[2] * sst.bohr, pot]
        file.write(' {0:1.10f} {1:1.10f} {2:1.10f} {3}\n'.format(*row))
class SortedSetKey:
    """A sorted set ordered by externally supplied ranks.

    Values are kept in ``self.sorted_set``; their sort keys live in
    ``self.dict`` (value -> rank) and are looked up via ``get_key``.
    """

    def __init__(self):
        self.dict = dict()
        self.sorted_set = SortedSet(key=self.get_key)

    def __getitem__(self, item):
        return self.sorted_set[item]

    def __len__(self):
        return len(self.sorted_set)

    def __str__(self):
        return str(self.sorted_set)

    def get_key(self, value):
        """Sort key for *value*: its stored rank."""
        return self.dict[value]

    def get_reversed_list(self, index, count):
        """Return up to *count* values walking backwards from the *index*-th largest."""
        return self[-1 - index:-1 - index - count:-1]

    def values(self):
        """Yield all values in rank order."""
        yield from self.sorted_set

    def clear(self):
        self.sorted_set.clear()
        self.dict.clear()

    def destroy(self):
        self.sorted_set = None

    def index(self, value):
        return self.sorted_set.index(value)

    def pop(self, index=-1):
        return self.sorted_set.pop(index)

    def add(self, value, rank):
        """Insert *value* with *rank*, replacing any previous rank."""
        # Remove first: the set is positioned by the OLD rank, so the
        # rank must not change while the value is still a member.
        if value in self.sorted_set:
            self.sorted_set.remove(value)
        self.dict[value] = rank
        self.sorted_set.add(value)

    def remove(self, value):
        self.sorted_set.remove(value)
        del self.dict[value]

    def update(self, value_list, rank_list):
        """Bulk add/re-rank: drop stale entries, set new ranks, reinsert."""
        self.sorted_set.difference_update(value_list)
        for value, rank in zip(value_list, rank_list):
            self.dict[value] = rank
        self.sorted_set.update(value_list)
def test_index():
    """index() maps every stored value 0..99 back to its own position."""
    s = SortedSet(range(100), load=7)
    for expected in range(100):
        assert s.index(expected) == expected
class ShotSelector(pg.LayoutWidget):
    """Widget for scrubbing through a filterable set of shot indices.

    A small line edit holds the current shot index (Python-style negative
    indices allowed), a slider scrubs over the currently *selected*
    indices, and a second line edit accepts a comma-separated list of
    numpy-style slice expressions (e.g. ``":"`` or ``"0:10,20:30"``)
    restricting which indices are selectable.
    """

    valueChanged = pyqtSignal()      # current index changed
    selectionChanged = pyqtSignal()  # set of selected indices changed

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.nshots = 1
        self.setFixedHeight(100)
        # Current-index editor.
        self.current_idx_le = QLineEdit(self)
        self.current_idx_le.setMaximumWidth(30)
        self.current_idx_le.setValidator(QtGui.QIntValidator())
        self.current_idx_le.setText(str(-1))
        self.addWidget(self.current_idx_le)
        # Slider over positions within the selection (not raw indices).
        self.slider = QSlider(self)
        self.slider.setPageStep(1)
        self.slider.setOrientation(Qt.Horizontal)
        self.addWidget(self.slider, colspan=2)
        self.nextRow()
        self.addWidget(QLabel('index selector'))
        # Slice-expression editor.
        self.idx_select_le = QLineEdit(self)
        self.idx_select_le.setText(':')
        self.addWidget(self.idx_select_le)
        # Status label: green "all good" or red warning text.
        self.warning = QLabel()
        self.warning.setMargin(5)
        self.update_warning()
        self.addWidget(self.warning)
        self.update_nshots(self.nshots)
        self.idx_select_le.editingFinished.connect(self.update_selection)
        self.current_idx_le.editingFinished.connect(self.setSliderValue)
        self.slider.valueChanged.connect(self.setLabelValue)

    def update_nshots(self, nshots):
        """Set the total number of shots and refresh selection and slider."""
        self.nshots = nshots
        self.idx = np.arange(self.nshots)
        self.slider.setRange(0, self.nshots - 1)
        self.update_selection()
        self.setSliderValue()

    def update_warning(self, warning=''):
        """Show *warning* on red; with no argument show 'all good' on green."""
        if warning == '':
            self.warning.setStyleSheet("background-color: lightgreen")
            warning = 'all good'
        else:
            self.warning.setStyleSheet("background-color: red")
        self.warning.setText(warning)

    def update_selection(self):
        """Re-parse the slice expressions and rebuild the selected-index set."""
        self.update_warning()
        slice_text = self.idx_select_le.text()
        slices = slice_text.split(',')
        self.idx_selected = SortedSet([])
        for s in slices:
            try:
                scope = locals()
                # NOTE(review): eval of user-typed widget text; fine for a
                # local GUI, but never expose this to untrusted input.
                select = eval('self.idx[' + s + ']', scope)
                if isinstance(select, np.ndarray):
                    for idx in select:
                        self.idx_selected.add(idx)
                else:
                    self.idx_selected.add(select)
            except Exception:  # was a bare except; narrowed so Ctrl-C etc. still propagate
                self.update_warning('problem in selected indeces')
                return 0
        self.slider.setRange(0, len(self.idx_selected) - 1)
        if int(self.current_idx_le.text()) % self.nshots not in self.idx_selected:
            # BUGFIX: QLineEdit.setText requires a str; the SortedSet element
            # is an int, so it must be converted (as done everywhere else).
            self.current_idx_le.setText(str(self.idx_selected[-1]))
            self.update_warning(
                'last index not in selection <br> -> setting last selected')
        self.selectionChanged.emit()

    def setLabelValue(self, value):
        """Slider moved: write the corresponding selected index into the line edit."""
        newval = self.idx_selected[value]
        if newval != self.get_current_index():
            self.current_idx_le.setText(str(newval))
            self.valueChanged.emit()

    def setSliderValue(self):
        """Line edit changed: move the slider to the matching selection position."""
        self.update_warning()
        value = int(self.current_idx_le.text())
        try:
            value_sl = self.idx_selected.index(value % len(self.idx))
            self.slider.setValue(value_sl)
        except ValueError:
            self.update_warning('set index not in selection <br> ignore')

    def get_current_index(self):
        """Return the current shot index normalized to [0, nshots)."""
        return int(self.current_idx_le.text()) % self.nshots

    def get_selected_indices(self):
        """Return the selected indices as a 1-tuple holding an ndarray."""
        return (np.array(self.idx_selected), )
def test_index():
    """index() maps every stored value 0..99 back to its own position."""
    s = SortedSet(range(100))
    s._reset(7)
    for expected in range(100):
        assert s.index(expected) == expected
class Selection(IMutableGSlice):
    """A mutable selection of "revealed" intervals over a universe [start, stop).

    The selection is stored in ``self._intervals``, a SortedSet of integer
    interval endpoints (0 is never stored). With an even number of
    endpoints, consecutive pairs form the revealed slices; with an odd
    number, an implicit leading 0 pairs with the first endpoint (see
    ``sortedset2slices``). ``self._revealed_count`` caches the total number
    of revealed elements.
    """

    def __init__(
            self,
            universe: slice,
            revealed: list = None,
            intervals: Iterator = None,
            _length: Optional[int] = None  # For performance
    ):
        #assert isinstance(universe, slice)  # Should universe even be visible/exist?
        #assert universe.start == 0
        #assert isinstance(universe.stop, int)
        #assert universe.stop >= 1  # TODO Do we need this?
        self.universe = universe
        if intervals is None and revealed is None:
            # Default: everything in the universe is revealed.
            self._intervals = self.revealed2sortedset([slice(0, universe.stop)])
        elif intervals is not None:
            self._intervals = SortedSet(intervals)
        else:
            self._intervals = self.revealed2sortedset(revealed)
        # Trust a caller-supplied length; otherwise derive it from the endpoints.
        self._revealed_count = _length if isinstance(_length, int) else Selection._compute_len(self._intervals)

    @staticmethod
    def revealed2sortedset(revealed: List[Union[tuple, slice]]) -> SortedSet:
        """
        Converts a list of included pairs to a sorted set of integers in O(n),
        n = size of @revealed. Every number from every pair/slice is added to
        the sorted set, except 0 (0 is implicit in the odd-length encoding).
        """
        # 10, [] -> 10, []
        # 10, [(0, 10)] -> 10, [10]
        # 10, [(0, 7)] -> 10, [7]
        # 10, [(7, 10)] -> 10, [7, 10]
        # 10, [(3, 7)] -> 10, [3, 7]
        # 10, [(0, 3), (7, 10)] -> 10, [3, 7, 10]
        # 10, [(0, 1), (2, 3), (4, 5), (6, 7), (8, 9)] -> 10, [1, 2, 3, 4, 5, 6, 7, 8, 9]
        try:
            #intervals = SortedSet(a for a, _ in revealed).union(b for _, b in revealed)
            intervals = SortedSet()
            for a, b in revealed:
                intervals.add(a)
                intervals.add(b)
        except TypeError:  # elements are slices, not 2-tuples
            intervals = SortedSet(sl.start for sl in revealed).union(sl.stop for sl in revealed)
        if 0 in intervals:
            intervals.remove(0)
        return intervals

    @staticmethod
    def sortedset2slices(sortedset: SortedSet) -> List[slice]:
        """
        Converts a sorted set of integers to a list of included slices in O(n),
        n = size of @sortedset. If there is an even number of elements in
        @sortedset, the first slice is formed by the first and second numbers,
        the second slice is formed by the third and fourth numbers, and so on.
        If there is an odd number of elements in @sortedset, the pair
        consisting of the number 0 and the first element in @sortedset becomes
        the first slice in the output list. The remaining slices, if any, are
        formed by the second and third numbers, the fourth and fifth numbers,
        and so on.
        """
        slices = []
        if len(sortedset) % 2 == 0:
            for i in range(0, len(sortedset), 2):
                slices.append(slice(sortedset[i], sortedset[i + 1]))
        else:
            # Odd count: implicit leading 0.
            slices.append(slice(0, sortedset[0]))
            for i in range(1, len(sortedset), 2):
                slices.append(slice(sortedset[i], sortedset[i + 1]))
        return slices

    def slices(self) -> List[slice]:
        """Return the revealed intervals as a list of slices."""
        return self.sortedset2slices(self._intervals)

    def pairs(self) -> Iterator[Tuple[int, int]]:
        """Like slices(), but lazily as (start, stop) tuples."""
        if len(self._intervals) % 2 == 0:
            return zip(self._intervals[::2], self._intervals[1::2])
        return itertools.chain([(0, self._intervals[0])], zip(self._intervals[1::2], self._intervals[2::2]))

    def gap_pairs(self) -> Iterator[Tuple[int, int]]:
        """(start, stop) tuples of the covered (non-revealed) intervals."""
        return self.complement().pairs()

    def intervals(self):
        # NOTE: exposes the internal SortedSet directly (not a copy).
        return self._intervals

    def exclude(self, from_index: Optional[int], to_index: Optional[int]):
        """Cover (hide) [from_index, to_index); None means the universe bound.

        Negative indices are taken modulo universe.stop. Returns the number
        of elements that were newly covered.
        """
        original_length = self._revealed_count
        if isinstance(from_index, int) and -self.universe.stop <= from_index < 0:
            from_index = from_index % self.universe.stop
        if isinstance(to_index, int):
            if to_index > self.universe.stop:
                return self.exclude(from_index, None)
            if -self.universe.stop <= to_index < 0:
                to_index = to_index % self.universe.stop
        assert from_index is None or self.universe.start <= from_index <= self.universe.stop
        assert to_index is None or self.universe.start <= to_index <= self.universe.stop
        if from_index is None:
            from_index = self.universe.start
        if to_index is None:
            to_index = self.universe.stop
        if len(self._intervals) == 0:
            return 0
        if from_index >= to_index:
            return 0
        # m/n: insertion positions of the range bounds among the endpoints.
        m = self._intervals.bisect_right(from_index)
        n = self._intervals.bisect_right(to_index)
        try:
            from_index_index = self._intervals.index(from_index)
        except ValueError:
            from_index_index = None
        try:
            to_index_index = self._intervals.index(to_index)
        except ValueError:
            to_index_index = None
        # Parity of the bisect position (relative to the parity of the
        # endpoint count) tells whether an index falls in a revealed interval.
        from_index_is_included = (
                len(self._intervals) % 2 == 0 and m % 2 == 1
                or len(self._intervals) % 2 == 1 and m % 2 == 0)
        to_index_is_included = (
                len(self._intervals) % 2 == 0 and n % 2 == 1
                or len(self._intervals) % 2 == 1 and n % 2 == 0)
        from_index_is_leftmost_included = from_index == 0 and from_index_is_included or from_index_index is not None and (
                len(self._intervals) % 2 == 0 and from_index_index % 2 == 0
                or len(self._intervals) % 2 == 1 and (from_index == 0 or from_index_index % 2 == 1))
        to_index_right_of_excluded = to_index_index is not None and (
                len(self._intervals) % 2 == 0 and to_index_index % 2 == 1
                or len(self._intervals) % 2 == 1 and (to_index == 0 or to_index_index % 2 == 0))
        if from_index_is_included:
            if from_index_is_leftmost_included:
                if to_index_is_included:
                    if m == 0:
                        to_remove = self._intervals[m:n]
                        endpoint = 0 if n == 0 else self._intervals[n - 1]
                        addendum = 0 if n == 0 else self._intervals[0]
                        self._revealed_count -= (to_index - endpoint) + addendum + sum(
                            b - a for a, b in zip(to_remove[1:-1:2], to_remove[2:-1:2]))
                        del self._intervals[m:n]
                        self._intervals.add(to_index)
                    else:
                        intermediates = self._intervals[m + 1:n - 1]
                        from_start, from_end = self._intervals[m - 1], self._intervals[m]
                        # NOTE(review): to_end is assigned but never used below.
                        to_start, to_end = self._intervals[n - 1], self._intervals[n]
                        if m == n:
                            self._revealed_count -= to_index - from_start
                            self._intervals.remove(from_start)
                            self._intervals.add(to_index)
                        else:
                            self._revealed_count -= (from_end - from_start) + (to_index - self._intervals[n - 1]) + (
                                    from_index - from_start) + sum(
                                b - a for a, b in zip(intermediates[::2], intermediates[1::2]))
                            del self._intervals[m + 1:n - 1]  # intermediates
                            self._intervals.remove(from_start)
                            self._intervals.remove(from_end)
                            self._intervals.remove(to_start)
                            self._intervals.add(to_index)
                else:
                    from_start = 0 if m == 0 else self._intervals[m - 1]
                    from_end = self._intervals[m]
                    self._revealed_count -= from_end - from_start
                    if from_start > 0:
                        self._intervals.remove(from_start)
                    self._intervals.remove(from_end)
            else:
                if to_index_is_included:
                    from_end = self._intervals[m]
                    to_start = self._intervals[n - 1]
                    if m == n:
                        # Excluded range splits a single revealed interval in two.
                        self._revealed_count -= to_index - from_index
                        if from_index > 0:
                            self._intervals.add(from_index)
                        self._intervals.add(to_index)
                    else:
                        intermediates = self._intervals[m + 1:n - 1]
                        self._revealed_count -= (from_end - from_index) + (to_index - to_start) + sum(
                            b - a for a, b in zip(intermediates[::2], intermediates[1::2]))
                        del self._intervals[m + 1:n - 1]  # intermediates
                        if from_index > 0:
                            self._intervals.add(from_index)
                        self._intervals.add(to_index)
                        self._intervals.remove(from_end)
                        self._intervals.remove(to_start)
                else:
                    to_remove = self._intervals[m:n]
                    self._revealed_count -= self._intervals[m] - from_index + sum(b - a for a, b in zip(to_remove[1::2], to_remove[::2]))
                    del self._intervals[m:n]
                    if from_index != 0:
                        self._intervals.add(from_index)
        else:
            if to_index_is_included:
                if to_index_right_of_excluded:
                    to_remove = self._intervals[m:n - 1]
                    del self._intervals[m:n - 1]
                    self._revealed_count -= sum(b - a for a, b in zip(to_remove[::2], to_remove[1::2]))
                else:
                    to_remove = self._intervals[m:n]
                    del self._intervals[m:n]
                    self._intervals.add(to_index)
                    self._revealed_count -= (to_index - to_remove[0]) + sum(b - a for a, b in zip(to_remove[1::2], to_remove[::2]))
            else:
                # Both bounds in gaps: drop every interval strictly inside the range.
                to_remove = self._intervals[m:n]
                del self._intervals[m:n]
                self._revealed_count -= sum(b - a for a, b in zip(to_remove[::2], to_remove[1::2]))
        return original_length - self._revealed_count

    def exclude_virtual(self, from_index: Optional[int], to_index: Optional[int]):
        """Like exclude(), but the bounds are virtual (selection-relative) indices."""
        if from_index is None or from_index < -len(self) or from_index >= len(self):
            p_from_index = None
        else:
            p_from_index = self.virtual2physical(from_index)
        if to_index is None or to_index < -len(self) or to_index >= len(self):
            p_to_index = None
        else:
            p_to_index = self.virtual2physical(to_index)
        return self.exclude(p_from_index, p_to_index)

    def include(self, from_index: Optional[int], to_index: Optional[int]):
        """Reveal [from_index, to_index); None means the universe bound.

        Negative indices are taken modulo universe.stop. Returns the number
        of elements that were newly revealed.
        """
        original_length = len(self)
        if isinstance(from_index, int) and -self.universe.stop <= from_index < 0:
            from_index = from_index % self.universe.stop
        if isinstance(to_index, int):
            if to_index > self.universe.stop:
                return self.include(from_index, None)
            if -self.universe.stop <= to_index < 0:
                to_index = to_index % self.universe.stop
        assert from_index is None or self.universe.start <= from_index <= self.universe.stop
        assert to_index is None or self.universe.start <= to_index <= self.universe.stop
        if from_index is None:
            from_index = self.universe.start
        if to_index is None:
            to_index = self.universe.stop
        if not self._intervals:
            # Nothing revealed yet: the range becomes the only interval.
            if from_index > 0:
                self._intervals.add(from_index)
            self._intervals.add(to_index)
            self._revealed_count += to_index - from_index
            return to_index - from_index
        if from_index == to_index:
            return 0
        m = self._intervals.bisect_right(from_index)
        n = self._intervals.bisect_right(to_index)
        try:
            from_index_index = self._intervals.index(from_index)
        except ValueError:
            from_index_index = None
        # Same parity logic as in exclude(); see comments there.
        from_index_is_included = (
                len(self._intervals) % 2 == 0 and m % 2 == 1
                or len(self._intervals) % 2 == 1 and m % 2 == 0)
        to_index_is_included = (
                len(self._intervals) % 2 == 0 and n % 2 == 1
                or len(self._intervals) % 2 == 1 and n % 2 == 0)
        from_index_right_of_included = from_index_index is not None and (
                len(self._intervals) % 2 == 0 and from_index_index % 2 == 1
                or len(self._intervals) % 2 == 1 and from_index_index % 2 == 0)
        if from_index_is_included:
            if to_index_is_included:
                # Merge everything between the two revealed intervals.
                to_remove = self._intervals[m:n]
                del self._intervals[m:n]
                self._revealed_count += sum(b - a for a, b in zip(to_remove[::2], to_remove[1::2]))
            else:
                to_remove = self._intervals[m:n]
                del self._intervals[m:n]
                self._intervals.add(to_index)
                self._revealed_count += (to_index - to_remove[-1]) + sum(b - a for a, b in zip(to_remove[1::2], to_remove[::2]))
        else:
            if to_index_is_included:
                if from_index_right_of_included:
                    to_remove = self._intervals[m - 1:n]
                    del self._intervals[m - 1:n]
                    self._revealed_count += sum(b - a for a, b in zip(to_remove[::2], to_remove[1::2]))
                else:
                    to_remove = self._intervals[m:n]
                    del self._intervals[m:n]
                    self._intervals.add(from_index)
                    self._revealed_count += (to_remove[0] - from_index) + sum(b - a for a, b in zip(to_remove[1::2], to_remove[::2]))
            else:
                if from_index_right_of_included:
                    intermediates = self._intervals[m:n]
                    del self._intervals[m:n]  # intermediates
                    self._intervals.remove(from_index)
                    self._intervals.add(to_index)
                    self._revealed_count += (to_index - from_index) - sum(b - a for a, b in zip(intermediates[::2], intermediates[1::2]))
                else:
                    to_remove = self._intervals[m:n]
                    del self._intervals[m:n]
                    if from_index > 0:
                        self._intervals.add(from_index)
                    self._intervals.add(to_index)
                    self._revealed_count += (to_index - from_index) - sum(b - a for a, b in zip(to_remove[::2], to_remove[1::2]))
        return len(self) - original_length

    def include_partially(self, from_index: Optional[int], to_index: Optional[int], count: Union[int, tuple]):
        """Reveal up to *count* covered elements at each end of the range.

        *count* may be an int (same amount from both ends) or a
        (head_count, tail_count) pair. Returns the number revealed.
        """
        if isinstance(count, int):
            return self.include_partially(from_index, to_index, (count, count))
        head_count, tail_count = count
        head_revealed_count = self._include_partially_from_left(from_index, to_index, head_count)
        tail_revealed_count = self._include_partially_from_right(from_index, to_index, tail_count)
        return head_revealed_count + tail_revealed_count

    def _include_partially_from_left(self, from_index: int, to_index: int, count: int):
        # Walk the covered sub-intervals left-to-right, revealing until
        # *count* elements have been revealed (the last one may be partial).
        if count == 0:
            return 0
        from_index, to_index = self._normalized_range(from_index, to_index)
        subsel = self._spanning_subslice(from_index, to_index).complement().subslice(from_index, to_index)
        revealed_count = 0
        for covered_start, covered_stop in subsel.pairs():
            coverage = covered_stop - covered_start
            if revealed_count + coverage < count:
                self.include(covered_start, covered_stop)
                revealed_count += coverage
            else:
                self.include(covered_start, covered_start + count - revealed_count)
                revealed_count = count
                break
        return revealed_count

    def _include_partially_from_right(self, from_index: int, to_index: int, count: int):
        # Mirror of _include_partially_from_left, walking right-to-left.
        if count == 0:
            return 0
        from_index, to_index = self._normalized_range(from_index, to_index)
        subsel = self._spanning_subslice(from_index, to_index).complement().subslice(from_index, to_index)
        revealed_count = 0
        for covered_start, covered_stop in reversed(list(subsel.pairs())):
            coverage = covered_stop - covered_start
            if revealed_count + coverage < count:
                self.include(covered_start, covered_stop)
                revealed_count += coverage
            else:
                self.include(covered_stop - (count - revealed_count), covered_stop)
                revealed_count = count
                break
        return revealed_count

    def include_expand(self, from_index: Optional[int], to_index: Optional[int], count: Union[int, Tuple[int, int]]):
        """Expand every revealed interval in the range into its adjacent gaps.

        For each gap, reveals up to head_count elements at its right edge
        and tail_count at its left edge (skipping universe boundaries).
        Returns the number of elements revealed.
        """
        if isinstance(count, int):
            return self.include_expand(from_index, to_index, (count, count))
        if count == (0, 0):
            return 0
        head_count, tail_count = count
        revealed_counter = 0
        gaps = self.complement().subslice(from_index, to_index)
        for a, b in gaps.pairs():
            if b < self.universe.stop:
                revealed_counter += self._include_partially_from_right(a, b, head_count)
            if a > self.universe.start:
                revealed_counter += self._include_partially_from_left(a, b, tail_count)
        return revealed_counter

    def _previous_slice(self, sl: slice):
        """
        :return The revealed or covered slice immediately to the left of @sl.
        :raise ValueError if there is none.
        """
        # NOTE(review): this iterates `self._intervals + ...` and reads
        # `s.stop` on the elements, but _intervals holds ints and SortedSet
        # does not support `+` — this method looks stale/broken; confirm
        # whether it has any live callers before relying on it.
        if sl.start == self.universe.start:
            raise ValueError("There is no slice to the left of {}.".format(sl))
        # TODO O(n) -> O(1)
        zero_or_one = [s for s in self._intervals + self.complement()._intervals if s.stop == sl.start]
        if len(zero_or_one) == 1:
            return zero_or_one[0]
        else:
            raise ValueError("Slice not found: {}.".format(sl))

    def _next_slice(self, sl: slice):
        """
        :return The revealed or covered slice immediately to the right of @sl.
        :raise ValueError if there is none.
        """
        # NOTE(review): same concern as _previous_slice (int elements treated
        # as slices, `+` on SortedSet) — likely stale; verify callers.
        if sl.stop == self.universe.stop:
            raise ValueError("There is no slice to the right of {}.".format(sl))
        # TODO O(n)
        zero_or_one = [s for s in self._intervals + self.complement()._intervals if s.start == sl.stop]
        if len(zero_or_one) == 1:
            return zero_or_one[0]
        else:
            raise ValueError("Slice not found: {}.".format(sl))

    def include_virtual(self, from_index, to_index):
        """Like include(), but the bounds are virtual (selection-relative) indices."""
        if from_index is None or from_index < -len(self) or from_index >= len(self):
            p_from_index = None
        else:
            p_from_index = self.virtual2physical(from_index)
        if to_index is None or to_index < -len(self) or to_index >= len(self):
            p_to_index = None
        else:
            p_to_index = self.virtual2physical(to_index)
        return self.include(p_from_index, p_to_index)

    def include_partially_virtual(self, from_index: Optional[int], to_index: Optional[int], count: Union[int, tuple]):
        """Like include_partially(), but with virtual (selection-relative) bounds."""
        if from_index is None or from_index < -len(self) or from_index >= len(self):
            p_from_index = None
        else:
            p_from_index = self.virtual2physical(from_index)
        if to_index is None or to_index < -len(self) or to_index >= len(self):
            p_to_index = None
        else:
            p_to_index = self.virtual2physical(to_index)
        return self.include_partially(p_from_index, p_to_index, count)

    # FIXME Inconsistent with reversed(selection). Should probably make this use the default implementation and instead
    # rewrite this one to iter_slices or something.
    def __iter__(self):
        for a, b in self.pairs():
            yield a, b  # FIXME should probably generate slices instead, or every index

    def complement(self):
        """Return a Selection revealing exactly what this one covers."""
        # Toggling membership of universe.stop in the endpoint set flips the
        # parity, which inverts the revealed/covered interpretation.
        if len(self._intervals) >= 1 and self._intervals[-1] == self.universe.stop:
            return Selection(universe=self.universe, intervals=self._intervals[:-1], _length=self.universe.stop - len(self))
        return Selection(universe=self.universe, intervals=self._intervals.union([self.universe.stop]), _length=self.universe.stop - len(self))

    def _normalized_range(self, from_index: Optional[int], to_index: Optional[int]) -> Tuple[int, int]:
        """
        For any range [@from_index, @to_index) where the indices are either
        None or any integer, returns the equivalent range [x, y) such that
        either 0 <= x < y <= upper_bound or x = y = 0. The ranges are
        equivalent in the sense that when using them to slice this selection,
        they produce the same sub-selection.
        """
        if from_index is None or from_index <= -self.universe.stop:
            from_index = self.universe.start
        elif from_index > self.universe.stop:
            from_index = self.universe.stop
        elif -self.universe.stop <= from_index < 0:
            # NOTE(review): for negative from_index this computes
            # stop - from_index = stop + |from_index| > stop; the usual
            # wrap-around would be stop + from_index. Looks like a bug —
            # confirm intended semantics before changing.
            from_index = self.universe.stop - from_index
        if to_index is None or to_index >= self.universe.stop:
            to_index = self.universe.stop
        elif -self.universe.stop <= to_index < 0:
            # NOTE(review): same concern as above (stop - to_index).
            to_index = self.universe.stop - to_index
        elif to_index < -self.universe.stop:
            to_index = self.universe.start
        if from_index >= to_index:
            from_index, to_index = (0, 0)
        return from_index, to_index

    def subslice(self, from_index: Optional[int], to_index: Optional[int]):
        """Return a new Selection restricted to [from_index, to_index)."""
        from_index, to_index = self._normalized_range(from_index, to_index)
        sel = self._spanning_subslice(from_index, to_index)
        # Trim the first/last interval where the range cuts into it.
        if len(sel._intervals) % 2 == 0:
            if len(sel) > 0:
                if sel._intervals[0] < from_index < sel._intervals[1]:
                    sel._revealed_count -= from_index - sel._intervals[0]
                    del sel._intervals[0]
                    sel._intervals.add(from_index)
                if sel._intervals[-2] < to_index < sel._intervals[-1]:
                    sel._revealed_count -= sel._intervals[-1] - to_index
                    del sel._intervals[-1]
                    sel._intervals.add(to_index)
        else:
            if 0 < from_index < sel._intervals[0]:
                sel._revealed_count -= from_index
                sel._intervals.add(from_index)
            if (len(sel._intervals) == 1 and to_index < sel._intervals[-1]
                    or len(sel._intervals) >= 2 and sel._intervals[-2] < to_index < sel._intervals[-1]):
                sel._revealed_count -= sel._intervals[-1] - to_index
                del sel._intervals[-1]
                sel._intervals.add(to_index)
        return sel

    def _spanning_subslice(self, from_index: int, to_index: int):
        """
        :return A Selection whose set of revealed slices is a subset of that of
        this Selection such that every index in [from_index, to_index) is
        either on some slice in the subset, or on a gap.
        """
        if from_index >= to_index:
            return Selection(universe=deepcopy(self.universe), intervals=[])
        m = self._intervals.bisect_right(from_index)
        if len(self._intervals) % 2 == 0:
            n = self._intervals.bisect_left(to_index)
            # Widen to whole endpoint pairs.
            intervals = self._intervals[m - (m % 2):n + (n % 2)]
        else:
            n = self._intervals.bisect_right(to_index)
            a = max(0, m - ((m + 1) % 2))
            b = n + ((n + 1) % 2)
            intervals = self._intervals[a:b]
        sel = Selection(universe=deepcopy(self.universe), intervals=intervals)
        return sel

    def _slow_subslice(self, from_index: Optional[int], to_index: Optional[int]):
        # Reference implementation of subslice via two exclusions on a copy.
        sel = self.deepcopy()
        if isinstance(from_index, int):
            sel.exclude(None, from_index)
        if isinstance(to_index, int):
            sel.exclude(to_index, None)
        return sel

    def _interval_index(self, pindex):
        """
        :return n if the nth interval edge is the smallest number such that
        @pindex < n (zero-indexed).
        """
        # NOTE(review): this binary search reads .start/.stop on elements of
        # self._intervals, which hold ints, not slices — looks stale relative
        # to the current representation; verify before use.
        lower = 0
        upper = len(self._intervals) - 1
        while lower <= upper:
            middle = (lower + upper) // 2
            midsl = self._intervals[middle]
            if pindex < midsl.start:
                upper = middle - 1
            elif midsl.stop <= pindex:
                lower = middle + 1
            else:  # midsl.start <= pindex < midsl.stop:
                return middle
        raise IndexError("{} is not in any interval.".format(pindex))

    def select(self, listlike):
        # TODO only works for stringlike objects
        # Concatenate the revealed parts of @listlike using its own join.
        lst = []
        for interval in self.slices():
            lst.append(listlike[interval])
        selection = listlike[0:0].join(lst)
        return selection

    def physical2virtual(self, pindex: int):
        """Map a physical (universe) index to its virtual (selection) index."""
        vindex = 0
        for a, b in self.pairs():
            if a <= pindex < b:
                vindex += pindex - a
                return vindex
            vindex += b - a
        raise IndexError("Physical index {} out of bounds for selection {}".format(pindex, self))

    # TODO: O(n) -> O(log(n)) (using another sorted set for cumulative lengths?)
    def virtual2physical(self, vindex: int):  # TODO -> virtualint2physical
        """
        :return the integer n such that where the @vindex'th revealed element
        is the nth element. If @vindex < 0, @vindex is interpreted as
        (number of revealed elements) + @vindex.
        """
        if vindex < -len(self):
            raise IndexError(
                "Got index {}, expected it to be within range [{},{})".format(vindex, -len(self), len(self)))
        elif vindex < 0:
            return self.virtual2physical(len(self) + vindex)
        cumlength = 0
        for a, b in self.pairs():
            cumlength += b - a
            if vindex < cumlength:
                pindex = b - (cumlength - vindex)
                if a <= pindex < b:
                    return pindex
                else:
                    break
        raise IndexError("Virtual index {} out of bounds for selection {}".format(vindex, self))

    def virtual2physicalselection(self, vslice: slice) -> 'Selection':  # TODO -> virtualslice2physical
        """
        :return the sub-Selection that is the intersection of this selection
        and @vslice.
        """
        if not self._intervals or vslice.stop == 0:
            return Selection(self.universe, revealed=[])
        if vslice.start is None:
            a = self.virtual2physical(0)
        elif -len(self) <= vslice.start < len(self):
            a = self.virtual2physical(vslice.start)
        elif vslice.start >= len(self):
            a = self._intervals[-1]
        else:
            raise ValueError("Unexpected slice start: {}".format(vslice))
        if vslice.stop is None or vslice.stop >= len(self):
            b = self._intervals[-1] - 1
        elif -len(self) <= vslice.stop < len(self):
            b = self.virtual2physical(vslice.stop - 1)
        else:
            raise ValueError("Unexpected slice stop: {}".format(vslice))
        # INV: a is the physical index of the first element, b is the physical index of the last element
        if b < a:
            return Selection(universe=self.universe, revealed=[])
        m = self._intervals.bisect_right(a)
        n = self._intervals.bisect_right(b)
        intervals = SortedSet([a] + self._intervals[m:n] + [b + 1])
        return Selection(universe=self.universe, intervals=intervals)

    def virtualselection2physical(self, vselection: 'Selection'):  # TODO -> virtualslice2physical
        """
        :return the sub-Selection that is the intersection of this selection
        and @vselection.
        """
        intervals = []
        for start, stop in vselection:
            for a, b in self.virtual2physicalselection(slice(start, stop)):
                intervals.append(slice(a, b))
        return Selection(universe=self.universe, revealed=intervals)

    def stretched(self, from_index: Optional[int], to_index: Optional[int]):  # TODO remove?
        """
        :return A potentially shrinked deep copy of this selection, delimited
        by the universe [@from_index, @to_index).
        """
        m = self._intervals.bisect_right(from_index)
        n = self._intervals.bisect_right(to_index)
        intervals = self._intervals[m:n]
        return Selection(universe=slice(from_index, to_index), intervals=intervals)

    def __getitem__(self, item):
        # Indexing a Selection yields physical indices of revealed elements.
        return self.virtual2physical(item)

    @staticmethod
    def _compute_len(sortedset: SortedSet):
        """
        :return The sum of the lengths of every revealed slice encoded by
        @sortedset (with the implicit leading 0 when the count is odd).
        """
        if len(sortedset) == 0:
            return 0
        elif len(sortedset) % 2 == 0:
            return sum(sortedset[i + 1] - sortedset[i] for i in range(0, len(sortedset), 2))
        return sortedset[0] + sum(sortedset[i + 1] - sortedset[i] for i in range(1, len(sortedset), 2))

    def __len__(self):
        # Number of revealed elements (cached).
        return self._revealed_count

    def __eq__(self, other):
        # NOTE(review): equality via repr comparison — cheap but fragile.
        return repr(self) == repr(other)

    def __mul__(self, other: int):
        """Scale the universe and every endpoint by an integer factor."""
        if other == 0:
            return Selection(universe=slice(0, 0), revealed=[])
        scaled_universe = slice(self.universe.start * other, self.universe.stop * other)
        scaled_revealed = [other * x for x in self._intervals]
        return Selection(universe=scaled_universe, intervals=scaled_revealed)

    def __rmul__(self, other):
        return self.__mul__(other)

    def __repr__(self):
        return "{}(universe={}, intervals={})".format(self.__class__.__name__, self.universe, self._intervals)

    def __str__(self):
        return repr(self)

    def deepcopy(self):
        """
        :return A deep copy of this object.
        """
        return Selection(universe=deepcopy(self.universe), intervals=deepcopy(self._intervals))
def preprocess_articles(langs,
                        date_start=None,
                        date_end=None,
                        pca_dim=300,
                        disallow_repeats=False):
    """Build (or load from cache) the preprocessed corpus for *langs*.

    Pipeline:
      1. Parse/annotate raw articles with a pool of worker threads fed
         through a queue (result cached as a pickle).
      2. Build the noun/verb vocabulary, its (optionally PCA-reduced)
         embedding matrix, and per-article noun/verb token-id lists.
      3. Build the translation-grouped named-entity vocabulary and
         per-article named-entity id lists.

    Parameters
    ----------
    langs : languages forwarded to get_articles.
    date_start, date_end : datetime
        Article date range. Despite the None defaults both are required:
        they are formatted into every cache path.
    pca_dim : int
        Target dimensionality for the noun/verb embeddings.
    disallow_repeats : bool
        If True, deduplicate token ids within each article.

    Returns
    -------
    TrainingData
    """
    global DONE_PROCESSING

    # Every cache file shares the same "<start>--<end>" date-range prefix.
    date_range = '%s--%s' % (date_start.strftime("%m%d%Y"),
                             date_end.strftime("%m%d%Y"))

    extracted_path = os.path.join(REUTERS_DIRECTORY,
                                  'preprocessed/%s.pkl' % date_range)
    if os.path.exists(extracted_path):
        with open(extracted_path, 'rb') as f:
            articles = pickle.load(f)
    else:
        # Stanza emits these warnings once per article; silence them up front.
        warnings.filterwarnings(
            "ignore",
            message=
            "Due to multiword token expansion or an alignment issue, the original text has been replaced by space-separated expanded tokens."
        )
        warnings.filterwarnings(
            "ignore",
            message=
            "Can't set named entities because of multi-word token expansion or because the character offsets don't map to valid tokens produced by the Stanza tokenizer:"
        )
        articles = []
        q = queue.Queue()
        for thread_num in range(NUM_WORKERS):
            threading.Thread(target=preprocess_articles_worker,
                             args=(q, articles)).start()
        for article_num, article in enumerate(
                get_articles(langs, date_start, date_end)):
            q.put((article_num, article))
        q.join()  # block until every queued article has been processed
        DONE_PROCESSING = True
        with open(extracted_path, 'wb') as f:
            pickle.dump(articles, f)
        print("Processed %s articles!" % len(articles))

    ###
    # NOUNS AND VERBS
    ###
    noun_and_verb_vocabulary_path = os.path.join(
        REUTERS_DIRECTORY, 'preprocessed/%s-nouns-and-verbs.pkl' % date_range)
    noun_and_verb_embeddings_path = os.path.join(
        REUTERS_DIRECTORY,
        'preprocessed/%s-noun-and-verb-embeddings.npy' % date_range)
    article_nouns_and_verbs_path = os.path.join(
        REUTERS_DIRECTORY,
        'preprocessed/%s-article-nouns-and-verbs.pkl' % date_range)
    if not all(
            map(os.path.exists, [
                noun_and_verb_vocabulary_path, noun_and_verb_embeddings_path,
                article_nouns_and_verbs_path
            ])):
        lang_counts = Counter(map(lambda article: article.lang, articles))
        print("%s articles!" % (len(articles)))
        print(lang_counts)

        # Define the noun/verb vocabulary. Keep all nouns and verbs that:
        # 1) are in less than 30% of documents (i.e. carry informative signal)
        # 2) aren't in our (rather aggressive) list of stopwords per language
        # 3) are in the set of word embeddings published by MUSE
        noun_and_verb_counts = defaultdict(Counter)
        for article in articles:
            for noun_or_verb in article.nouns_and_verbs:
                if noun_or_verb != noun_or_verb.strip():
                    # Tokens are expected pre-stripped; surface offenders.
                    print(noun_or_verb)
                noun_and_verb_counts[article.lang][noun_or_verb] += 1
        print("Unfiltered noun/verb vocab size")
        print({
            lang: len(noun_and_verb_counts[lang])
            for lang in noun_and_verb_counts.keys()
        })

        noun_and_verbs_by_lang = defaultdict(set)
        noun_and_verb_vocabulary = SortedSet()
        for lang in noun_and_verb_counts:
            for noun_or_verb, count in noun_and_verb_counts[lang].items():
                if count > 0.3 * lang_counts[lang]:
                    continue
                if noun_or_verb in STOP_WORDS[lang]:
                    continue
                if noun_or_verb not in EMBEDDINGS[lang]:
                    continue
                noun_and_verb_vocabulary.add((lang, noun_or_verb))
                noun_and_verbs_by_lang[lang].add(noun_or_verb)
            print("Filtered noun/verb vocab size for %s=%s" %
                  (lang, len(noun_and_verbs_by_lang[lang])))

        noun_and_verb_embeddings = np.array([
            EMBEDDINGS[lang][noun_or_verb]
            for lang, noun_or_verb in noun_and_verb_vocabulary
        ])
        # optionally reduce dimensionality (don't need to hold onto PCA matrix)
        if pca_dim < noun_and_verb_embeddings.shape[1]:
            print("Reducing embedding dimensionality from %s to %s" %
                  (noun_and_verb_embeddings.shape[1], pca_dim))
            noun_and_verb_embeddings = PCA(pca_dim).fit_transform(
                noun_and_verb_embeddings)

        # Per-article lists of vocabulary indices, one entry per occurrence.
        article_nouns_and_verbs = []
        for article_id, article in enumerate(articles):
            article_nouns_and_verbs.append([])
            for noun_or_verb, count in article.nouns_and_verbs.items():
                if (article.lang, noun_or_verb) not in noun_and_verb_vocabulary:
                    continue
                noun_or_verb_id = noun_and_verb_vocabulary.index(
                    (article.lang, noun_or_verb))
                for _ in range(count):
                    article_nouns_and_verbs[-1].append(noun_or_verb_id)

        with open(noun_and_verb_vocabulary_path, 'wb') as f:
            pickle.dump(noun_and_verb_vocabulary, f)
        # np.save appends '.npy' itself, so save to the stem.  BUG FIX: was
        # path.strip(".npy"), which strips the *character set* {.,n,p,y} from
        # both ends, not the suffix.
        np.save(os.path.splitext(noun_and_verb_embeddings_path)[0],
                noun_and_verb_embeddings)
        with open(article_nouns_and_verbs_path, 'wb') as f:
            pickle.dump(article_nouns_and_verbs, f)
        # BUG FIX: this print referenced an undefined name
        # (noun_and_verb_data_path) and raised NameError on the cache-miss path.
        print("Wrote nouns and verbs to %s" % (article_nouns_and_verbs_path))
    else:
        with open(noun_and_verb_vocabulary_path, 'rb') as f:
            noun_and_verb_vocabulary = pickle.load(f)
        noun_and_verb_embeddings = np.load(noun_and_verb_embeddings_path)
        with open(article_nouns_and_verbs_path, 'rb') as f:
            article_nouns_and_verbs = pickle.load(f)

    ###
    # NAMED ENTITIES
    ###
    named_entity_vocabulary_path = os.path.join(
        REUTERS_DIRECTORY, 'preprocessed/%s-named-entities.pkl' % date_range)
    article_named_entities_path = os.path.join(
        REUTERS_DIRECTORY,
        'preprocessed/%s-article-named-entities.pkl' % date_range)
    if not all(
            map(os.path.exists,
                [named_entity_vocabulary_path, article_named_entities_path])):
        named_entities = set()
        for article in articles:
            for named_entity in article.named_entities:
                named_entities.add((article.lang, named_entity))
        named_entities = list(sorted(named_entities))
        print("Ungrouped named entities: %s" % len(named_entities))

        # Translate es/ru entities so the same real-world entity collapses to
        # a single vocabulary entry across languages.
        es_named_entities = [
            named_entity for (lang, named_entity) in named_entities
            if lang == 'es'
        ]
        es_named_entities_translated = translate_named_entities(
            'es', es_named_entities)
        ru_named_entities = [
            named_entity for (lang, named_entity) in named_entities
            if lang == 'ru'
        ]
        ru_named_entities_translated = translate_named_entities(
            'ru', ru_named_entities)

        grouped_named_entity_counts = Counter()
        for article in articles:
            for named_entity in article.named_entities:
                if article.lang == 'es':
                    named_entity = es_named_entities_translated[named_entity]
                elif article.lang == 'ru':
                    named_entity = ru_named_entities_translated[named_entity]
                grouped_named_entity_counts[named_entity] += 1

        # Keep entities that are neither too rare (<=5) nor near-ubiquitous
        # (>=80% of articles), and drop boilerplate 'reuters' mentions.
        named_entity_vocabulary = SortedSet()
        for named_entity, count in grouped_named_entity_counts.items():
            if count > 5 and count < 0.8 * len(
                    articles) and 'reuters' not in named_entity:
                named_entity_vocabulary.add(named_entity)
        print("Grouped named entities: %s" % len(named_entity_vocabulary))

        article_named_entities = []
        for article_id, article in enumerate(articles):
            article_named_entities.append([])
            for named_entity, count in article.named_entities.items():
                if article.lang == 'es':
                    named_entity = es_named_entities_translated[named_entity]
                elif article.lang == 'ru':
                    named_entity = ru_named_entities_translated[named_entity]
                if named_entity not in named_entity_vocabulary:
                    continue
                named_entity_id = named_entity_vocabulary.index(named_entity)
                for _ in range(count):
                    article_named_entities[-1].append(named_entity_id)

        with open(named_entity_vocabulary_path, 'wb') as f:
            pickle.dump(named_entity_vocabulary, f)
        with open(article_named_entities_path, 'wb') as f:
            pickle.dump(article_named_entities, f)
    else:
        with open(named_entity_vocabulary_path, 'rb') as f:
            named_entity_vocabulary = pickle.load(f)
        with open(article_named_entities_path, 'rb') as f:
            article_named_entities = pickle.load(f)

    if disallow_repeats:
        # Collapse repeated mentions to a single occurrence per article.
        for article_id in range(len(articles)):
            article_nouns_and_verbs[article_id] = list(
                set(article_nouns_and_verbs[article_id]))
            article_named_entities[article_id] = list(
                set(article_named_entities[article_id]))

    return TrainingData(articles, noun_and_verb_vocabulary,
                        noun_and_verb_embeddings, article_nouns_and_verbs,
                        named_entity_vocabulary, article_named_entities)
class SweepLineStatus:
    """Bentley–Ottmann-style sweep line for segment-intersection detection.

    The ordering of `self.lines` depends on the module-global sweep position
    X; whenever that stale ordering makes index() fail, the status set is
    rebuilt via update_keys().
    """

    def __init__(self, scenes=None):
        global X
        super().__init__()
        self.lines: SortedSet = None
        # Events are processed left-to-right: with key -p.x, pop() (which
        # removes the greatest key) returns the point with the smallest x.
        self.events: SortedSet = SortedSet(key=lambda p: -p.x)
        # BUG FIX: `scenes=[]` was a mutable default argument shared across
        # instances; a None sentinel is backward compatible.
        self.scenes = [] if scenes is None else scenes
        self.dataset = {}  # endpoint Point -> its Line, for event dispatch
        self.results = set([])  # intersection Points found so far

    def run(self, dataset: List[Line]):
        """Sweep over *dataset* and return the set of intersection points."""
        global X
        for l in dataset:
            self.dataset[l.get_left()] = l
            self.dataset[l.get_right()] = l
            self.events.add(l.get_left())
            self.events.add(l.get_right())
        self.lines = SortedSet()
        while len(self.events) > 0:
            event = self.events.pop()
            self.event_happened(event)
        return self.results

    def find_intersection(self, line1: Line, line2: Line):
        """Return the intersection Point of two lines, or None."""
        return line_intersection(line1, line2)

    def insert_line(self, line: Line):
        """Insert *line* into the status and update neighbour events."""
        global X
        self.lines.add(line)
        try:
            i = self.lines.index(line)
        except ValueError:
            # BUG FIX: was a bare `except:`. The ordering keys depend on X
            # and may be stale; rebuild the status and retry.
            self.update_keys(X)
            i = self.lines.index(line)
        # The new line separates its neighbours: their pending intersection
        # event is no longer adjacent, so drop it.
        if i - 1 >= 0 and i + 1 < len(self.lines):
            intersection = self.find_intersection(self.lines[i - 1],
                                                  self.lines[i + 1])
            if intersection is not None and intersection in self.events:
                self.events.remove(intersection)
        # Test the two new adjacencies.
        if i - 1 >= 0:
            intersection = self.find_intersection(self.lines[i - 1], line)
            if intersection is not None and intersection not in self.results and intersection not in self.events:
                self.events.add(intersection)
        if i + 1 < len(self.lines):
            intersection = self.find_intersection(line, self.lines[i + 1])
            if intersection is not None and intersection not in self.results and intersection not in self.events:
                self.events.add(intersection)

    def update_keys(self, x):
        """Re-sort the status structure after the sweep position changed."""
        global X
        temp_lines = SortedSet()
        temp_lines.update(self.lines)
        self.lines = temp_lines

    def remove_line(self, line: Line):
        """Remove *line*; its former neighbours become adjacent, so re-test."""
        global X
        try:
            i = self.lines.index(line)
        except ValueError:
            # Same stale-ordering recovery as insert_line.
            self.update_keys(X)
            i = self.lines.index(line)
        if i - 1 >= 0 and i + 1 < len(self.lines):
            intersection = self.find_intersection(self.lines[i - 1],
                                                  self.lines[i + 1])
            if intersection is not None and intersection not in self.results:
                self.events.add(intersection)
        self.lines.remove(line)

    def intersection_event(self, intersection: Point):
        """Handle a crossing: swap the two lines' order around it."""
        global X
        self.results.add(intersection)
        line1 = intersection.line1
        line2 = intersection.line2
        # Remove just left of the crossing and re-insert just right of it
        # (epsilon nudge of X), so the two lines swap status positions.
        X = intersection.x - 0.001
        self.remove_line(line1)
        self.remove_line(line2)
        X = intersection.x + 0.001
        self.insert_line(line1)
        self.insert_line(line2)

    def event_happened(self, event: Point):
        """Dispatch an event: segment start, segment end, or intersection."""
        global X
        X = event.x
        if event in self.dataset:
            line = self.dataset[event]
            if event == line.get_left():
                self.insert_line(line)
            else:
                self.remove_line(line)
        else:
            self.intersection_event(event)
class FirstLastList:
    """Container tracking elements both by sorted value and insertion order.

    The sorted view holds one ElementWrapper per distinct value with an
    occurrence count; the ordered view keeps one ElementOrderWrapper per
    individual insertion, in insertion order.
    """

    def __init__(self):
        self.sorted_elements = SortedSet()
        self.ordered_elements = OrderedDict()

    def count(self):
        """Total number of stored occurrences."""
        return len(self.ordered_elements)

    def clear(self):
        """Drop everything from both views."""
        self.sorted_elements = SortedSet()
        self.ordered_elements = OrderedDict()

    def add(self, element):
        """Record one more occurrence of *element* in both views."""
        wrapper = ElementWrapper(element)
        if wrapper in self.sorted_elements:
            # Seen before: bump the stored wrapper's occurrence count.
            stored = self.sorted_elements[self.sorted_elements.index(wrapper)]
            stored.count += 1
        else:
            self.sorted_elements.add(wrapper)
        occurrence = self.sorted_elements[self.sorted_elements.index(wrapper)].count
        # The insertion-ordered view is keyed by (element, occurrence).
        self.ordered_elements[ElementOrderWrapper(element, occurrence)] = True

    def min(self, count):
        """The *count* smallest values (with repetition), ascending."""
        smallest = []
        for wrapper in self.sorted_elements:
            for _ in range(wrapper.count):
                smallest.append(wrapper.value)
                if len(smallest) == count:
                    return smallest
        return smallest

    def max(self, count):
        """The *count* largest values (with repetition), descending."""
        largest = []
        for wrapper in reversed(self.sorted_elements):
            for _ in range(wrapper.count):
                largest.append(wrapper.value)
                if len(largest) == count:
                    return largest
        return largest

    def first(self, count):
        """The first *count* insertions, oldest first."""
        return list(self.ordered_elements.keys())[:count]

    def last(self, count):
        """The last *count* insertions, newest first."""
        keys = list(self.ordered_elements.keys())
        return keys[max(len(keys) - count, 0):][::-1]

    def remove_all(self, element):
        """Delete every occurrence of *element*; return how many were removed."""
        probe = ElementWrapper(element)
        if probe not in self.sorted_elements:
            return 0
        # Remove from the sorted view.
        stored = self.sorted_elements[self.sorted_elements.index(probe)]
        self.sorted_elements.remove(stored)
        # Remove each per-occurrence key from the ordered view.
        for occurrence in range(1, stored.count + 1):
            del self.ordered_elements[ElementOrderWrapper(element, occurrence)]
        return stored.count
class BunnyWars:
    # Registry coordinating rooms and bunnies, with several indexes kept in
    # sync by the mutating methods below:
    #   rooms_by_idx / rooms  - room ids in ascending order, and id -> Room
    #   bunnies_by_team       - team id -> SortedSet of Bunny objects
    #   bunnies_by_suffix     - datrie.Trie keyed on the reversed bunny name
    #   bunny_names           - bunny name -> Bunny object
    def __init__(self):
        self.rooms_by_idx = SortedSet()  # integer ID only
        self.rooms = SortedDict()  # key: id, value: room
        self.bunnies_by_team = {
        }  # key: team id, value: SortedSet(key=bunny.reversed_name) of Bunny objects
        # Trie alphabet is all ASCII letters plus the digits 0-9.
        self.bunnies_by_suffix = datrie.Trie(string.ascii_letters + ''.join(
            str(part) for part in range(0, 10)))
        self.bunny_names = {}

    def next_bunny(self, bunny_name):
        # Move the bunny to the room with the next-higher id.
        self._move_bunny(bunny_name)

    def prev_bunny(self, bunny_name):
        # Move the bunny to the room with the next-lower id.
        self._move_bunny(bunny_name, prev=True)

    def bunny_count(self):
        return len(self.bunny_names)

    def room_count(self):
        return len(self.rooms)

    def list_bunnies_by_team(self, team_id):
        """
        ListBunniesByTeam teamId - returns all bunnies from the specified team
        (sorted by name in descending order).
        """
        return reversed(self.bunnies_by_team[team_id])

    def list_bunnies_by_suffix(self, suffix):
        """
        ListBunniesBySuffix suffix - returns all bunnies ending with the
        specified suffix (sorted by the ASCII code of the reversed name in
        ascending order as a first criteria and by length in ascending order
        as a second criteria). Example Tpen < apen < aapen < bapen < bpen.
        """
        # Names are stored reversed, so a suffix query becomes a prefix query.
        return self.bunnies_by_suffix.values(''.join(reversed(suffix)))

    def detonate(self, bunny_name):
        # Room.detonate applies the damage and reports the casualties.
        if bunny_name not in self.bunny_names:
            raise Exception('Bunny does not exist!')
        bunny = self.bunny_names[bunny_name]
        room = self.rooms[bunny.room]
        dead_bunnies = room.detonate(
            bunny)  # detonate the bunny and get all the bunnies that have died
        for dead_bunny in dead_bunnies:
            self._delete_bunny(dead_bunny)

    def add_room(self, id):
        """
        Add roomId – adds a room to the structure. Rooms have unique ids.
        Rooms should be situated according to their id in ascending order.
        If a room with the given Id exists the command should throw an
        exception.
        """
        if id in self.rooms:
            raise Exception(
                'Room with id {id} is already registered!'.format(id=id))
        self.rooms_by_idx.add(id)
        self.rooms[id] = Room(id)

    def add_bunny(self, bunny_name, team_id, room_id):
        # Register a new bunny in all four indexes; team ids are 0..4.
        if room_id not in self.rooms or team_id > 4 or team_id < 0:
            raise Exception('Invalid room/team id!')
        if bunny_name in self.bunny_names:
            raise Exception('A bunny with the given name already exists!')
        bunny_obj = Bunny(name=bunny_name, teamid=team_id, room=room_id)
        # 1. Add to the room
        self.rooms[room_id].add_bunny(bunny_obj)
        # 2. Add to overall bunnies
        self.bunny_names[bunny_name] = bunny_obj
        # 3. Add to suffixes
        self.bunnies_by_suffix[bunny_obj.reversed_name] = bunny_obj
        # 4. Add to bunnies by team
        if bunny_obj.team not in self.bunnies_by_team:
            self.bunnies_by_team[bunny_obj.team] = SortedSet()
        self.bunnies_by_team[bunny_obj.team].add(bunny_obj)

    def remove_room(self, room_id):
        if room_id not in self.rooms:
            raise Exception(
                'A room with the id {id} does not exist!'.format(id=room_id))
        room = self.rooms[room_id]
        del self.rooms[room_id]
        self.rooms_by_idx.remove(room_id)
        # delete every bunny there
        for bunnies_from_team in room.bunnies.values():
            for bunny in bunnies_from_team.values():
                self._delete_bunny(bunny)

    def _move_bunny(self, bunny_name, prev=False):
        # Move a bunny to the adjacent room in ascending-room-id order.
        # NOTE(review): at the boundary, `0 if prev else len-1` resolves to
        # the SAME index the bunny came from (index -1 -> 0, index len ->
        # len-1), i.e. the move clamps rather than wraps. If wrap-around was
        # intended the two branch values look swapped — confirm.
        if bunny_name not in self.bunny_names:
            raise Exception()
        bunny = self.bunny_names[bunny_name]
        old_room_id = bunny.room
        old_room = self.rooms[old_room_id]
        old_room_index = self.rooms_by_idx.index(old_room_id)
        if prev:
            next_room_index = old_room_index - 1
        else:
            next_room_index = old_room_index + 1
        if next_room_index >= len(
                self.rooms_by_idx) or next_room_index < 0:  # is out of bounds
            next_room_index = 0 if prev else len(self.rooms_by_idx) - 1
        # get the new room id and assign it to the bunny
        new_room_id = self.rooms_by_idx[next_room_index]
        bunny.room = new_room_id
        new_room = self.rooms[new_room_id]
        # remove the bunny from the old room and move it to the new one
        old_room.remove_bunny(bunny)
        new_room.move_bunny_in(bunny)

    def _delete_bunny(self, bunny: Bunny):
        # Remove the bunny from every index except the Room (the caller has
        # already taken care of the room side).
        # 1.Remove from overall bunnies
        del self.bunny_names[bunny.name]
        # 2.Remove from suffixes
        del self.bunnies_by_suffix[bunny.reversed_name]
        # 3.Remove from bunnies by team
        self.bunnies_by_team[bunny.team].remove(bunny)
def cluster_(self, fX): """Compute complete dendrogram Parameters ---------- fX : (n_items, dimension) np.array Embeddings. Returns ------- dendrogram : list of (i, j, distance) tuples Dendrogram. """ N = len(fX) # clusters contain the identifier of each cluster clusters = SortedSet(np.arange(N)) # labels[i] = c means ith item belongs to cluster c labels = np.array(np.arange(N)) squared = squareform(pdist(fX, metric=self.metric)) distances = ValueSortedDict() for i, j in itertools.combinations(range(N), 2): distances[i, j] = squared[i, j] dendrogram = [] for _ in range(N-1): # find most similar clusters (c_i, c_j), d = distances.peekitem(index=0) # keep track of this iteration dendrogram.append((c_i, c_j, d)) # index of clusters in 'clusters' and 'fX' i = clusters.index(c_i) j = clusters.index(c_j) # merge items of cluster c_j into cluster c_i labels[labels == c_j] = c_i # update c_i representative fX[i] += fX[j] # remove c_j cluster fX[j:-1, :] = fX[j+1:, :] fX = fX[:-1] # remove distances to c_j cluster for c in clusters[:j]: distances.pop((c, c_j)) for c in clusters[j+1:]: distances.pop((c_j, c)) clusters.remove(c_j) if len(clusters) < 2: continue # compute distance to new c_i cluster new_d = cdist(fX[i, :].reshape((1, -1)), fX, metric=self.metric).squeeze() for c_k, d in zip(clusters, new_d): if c_k < c_i: distances[c_k, c_i] = d elif c_k > c_i: distances[c_i, c_k] = d return dendrogram
class SparseTimeSeriesDataSet:
    # A dataset designed for dealing with sparse time series data that needs
    # to be kept in sync in time.
    #
    # Modes for add():
    #   'strict'            - new data may only use already-known timestamps
    #   'remove_difference' - unknown timestamps are dropped from new data
    #   'union'             - unknown timestamps are merged into the known set

    def __init__(self, unique_timestamps=None, minimum_time_between_timestamps=None, mode='strict'):
        # possible modes are strict, remove_difference, union
        if unique_timestamps is not None:
            self.unique_timestamps = SortedSet(unique_timestamps)
        else:
            self.unique_timestamps = SortedSet()
        self.mode = mode
        self.all_raw_data = {}  # id -> raw rows exactly as passed to add()
        self.timestamp_indexed_data = {}  # id -> SortedDict(timestamp -> value(s))
        self.minimum_time_between_timestamps = minimum_time_between_timestamps
        self.check_minimum_timestamp_interval()

    def __len__(self):
        return len(self.unique_timestamps)

    @classmethod
    def sample_data_at_intervals(cls, start_timestamp, end_timestamp, interval, data):
        """Sample *data* every *interval* between the bounds (inclusive),
        extending the previous datapoint when one is missing."""
        timestamps = SortedList([x[0] for x in data])
        start_timestamp = int(start_timestamp)
        end_timestamp = int(end_timestamp)
        assert (timestamps[0] <= start_timestamp)
        assert (timestamps[-1] >= end_timestamp)
        sampled_data = []
        for timestamp in range(start_timestamp, end_timestamp + 1, interval):
            # Last datapoint at or before `timestamp`.
            index = timestamps.bisect_right(timestamp) - 1
            new_datapoint = data[index].copy()
            new_datapoint[0] = timestamp
            sampled_data.append(new_datapoint)
        return sampled_data

    @property
    def ids(self):
        return list(self.all_raw_data.keys())

    @property
    def first_timestamp(self):
        return self.unique_timestamps[0]

    def first_timestamp_for_id(self, id):
        return self.all_raw_data[id][0][0]

    @property
    def last_timestamp(self):
        return self.unique_timestamps[-1]

    def last_timestamp_for_id(self, id):
        return self.all_raw_data[id][-1][0]

    def first_unpadded_index_for_id(self, id):
        first_timestamp = self.first_timestamp_for_id(id)
        return self.unique_timestamps.index(first_timestamp)

    def last_unpadded_index_for_id(self, id):
        last_timestamp = self.last_timestamp_for_id(id)
        return self.unique_timestamps.index(last_timestamp)

    def check_minimum_timestamp_interval(self):
        """Raise if two consecutive timestamps are closer than allowed."""
        if self.minimum_time_between_timestamps is not None:
            prev_timestamp = 0
            for timestamp in self.unique_timestamps:
                if timestamp - prev_timestamp < self.minimum_time_between_timestamps:
                    raise InvalidTimestampsInDataError(
                        "Found timestamps that have less than the required {} between them".format(
                            self.minimum_time_between_timestamps))
                prev_timestamp = timestamp

    def add(self, id: str, data):
        """Add rows of the form [timestamp, value, ...] for *id*, enforcing
        the dataset's timestamp mode."""
        if len(data) == 0:
            raise ValueError("Tried to add empty data for id {}".format(id))
        if id in self.all_raw_data and self.all_raw_data[id] == data:
            print("Data for id {} already added.".format(id))
            return
        self.all_raw_data[id] = data
        if len(data[0]) > 2:
            # we have multidimensional data
            timestamp_indexed_data = SortedDict([[int(x[0]), x[1:]] for x in data])
        else:
            timestamp_indexed_data = SortedDict([[int(x[0]), x[1]] for x in data])
        new_timestamps = {x[0] for x in data}
        difference = new_timestamps.difference(self.unique_timestamps)
        if self.mode == 'strict':
            if len(difference) != 0:
                raise InvalidTimestampsInDataError(
                    "Tried to add new data with id {} that includes timestamps that are not in the set of allowed timestamps. "
                    "Difference = {}".format(id, difference))
        elif self.mode == 'remove_difference':
            # NOTE(review): keys were built with int(x[0]); this del assumes
            # the raw timestamps are already ints — confirm.
            for timestamp_to_remove in difference:
                del timestamp_indexed_data[timestamp_to_remove]
        elif self.mode == 'union':
            self.unique_timestamps = self.unique_timestamps.union(new_timestamps)
            self.check_minimum_timestamp_interval()
        if len(timestamp_indexed_data) == 0:
            raise NotEnoughInputData(
                "The data being added has zero length. If the mode is remove_difference, then this means that "
                "the new data has no timestamps in common with the required timestamps")
        self.timestamp_indexed_data[id] = timestamp_indexed_data

    def get_left_and_right_padding_required(self, ids):
        """For each id, how many timestamps it is missing at either end."""
        padding_required = []
        for id in ids:
            first_timestamp_for_id = self.first_timestamp_for_id(id)
            last_timestamp_for_id = self.last_timestamp_for_id(id)
            left_padding = self.unique_timestamps.index(first_timestamp_for_id)
            right_padding = len(self) - self.unique_timestamps.index(last_timestamp_for_id) - 1
            assert (self.all_raw_data[id][0][0] == self.unique_timestamps[left_padding])
            assert (self.all_raw_data[id][-1][0] == self.unique_timestamps[-(right_padding + 1)])
            padding_required.append([left_padding, right_padding])
        return padding_required

    def get_data_extend_missing_internal(self, id: str):
        # This function doesn't pad the left or right of the data, but it will
        # fill in any missing data in the middle using the previous value.
        # NOTE: the fill is written into self.timestamp_indexed_data[id] in place.
        timestamp_indexed_data = self.timestamp_indexed_data[id]
        timestamps_in_this_data = set(timestamp_indexed_data.keys())
        missing_timestamps = self.unique_timestamps - timestamps_in_this_data
        if len(missing_timestamps) > 0:
            for timestamp in missing_timestamps:
                entry_index = timestamp_indexed_data.bisect_right(timestamp)
                if entry_index != 0 and entry_index < len(timestamp_indexed_data):
                    # only pad in the middle of the data and not at the end
                    current_padded_value = timestamp_indexed_data.peekitem(entry_index - 1)[1]
                    timestamp_indexed_data[timestamp] = current_padded_value
        first_value = timestamp_indexed_data.peekitem(0)[1]
        if isinstance(first_value, (list, tuple)):
            # Multidimensional values: flatten back to [timestamp, v1, v2, ...].
            to_return = [[x[0], *x[1]] for x in timestamp_indexed_data.items()]
        else:
            to_return = list(timestamp_indexed_data.items())
        return to_return

    def get_padded_data_in_sync(self, padding_val="extend"):
        """Return id -> SortedDict covering every timestamp in
        self.unique_timestamps.

        Missing values in the middle or at the end are filled by extending the
        previous value; *padding_val* determines how to pad the beginning when
        there is no value before it ('extend' copies the first known value).
        """
        padded_timestamp_indexed_data = {}
        for ric, timestamp_indexed_data in self.timestamp_indexed_data.items():
            # BUG FIX: copy instead of aliasing the stored SortedDict, so
            # the padding below no longer silently mutates
            # self.timestamp_indexed_data[ric].
            padded_timestamp_indexed_data[ric] = SortedDict(timestamp_indexed_data)
            timestamps_in_this_data = set(timestamp_indexed_data.keys())
            missing_timestamps = self.unique_timestamps - timestamps_in_this_data
            if len(missing_timestamps) > 0:
                for timestamp in missing_timestamps:
                    entry_index = padded_timestamp_indexed_data[ric].bisect_right(timestamp)
                    if entry_index == 0:
                        # Nothing before this timestamp: pad the beginning.
                        if padding_val == 'extend':
                            current_padded_value = padded_timestamp_indexed_data[ric].peekitem(entry_index)[1]
                        else:
                            current_padded_value = padding_val
                    else:
                        current_padded_value = padded_timestamp_indexed_data[ric].peekitem(entry_index - 1)[1]
                    padded_timestamp_indexed_data[ric][timestamp] = current_padded_value
        return padded_timestamp_indexed_data

    def get_start_and_end_index_for_concat_data(self, keys):
        """Return [start, stop) index pairs locating each id's rows inside the
        concatenation produced by concat_data_unpadded(keys)."""
        start_stop = []
        current_position = 0
        for id in keys:
            if id in self.timestamp_indexed_data:
                length_of_data = len(self.timestamp_indexed_data[id])
                start_stop.append([current_position, current_position + length_of_data])
                # BUG FIX: the running offset must accumulate; it was
                # previously overwritten (current_position = length_of_data),
                # which produced wrong ranges from the third key onward.
                current_position += length_of_data
            else:
                print("warning: tried to concat data for keys {} but key {} is missing".format(keys, id))
        return start_stop

    def concat_data_unpadded(self, keys, as_numpy=True, with_timestamps=True):
        """Concatenate the (unpadded) data of *keys* into one array/list."""
        data_to_concat = []
        for id in keys:
            if id in self.timestamp_indexed_data:
                if with_timestamps:
                    data_to_concat.append(np.squeeze(self.timestamp_indexed_data[id].items()[:]))
                else:
                    data_to_concat.append(np.squeeze(self.timestamp_indexed_data[id].values()[:]))
            else:
                print("warning: tried to concat data for keys {} but key {} is missing".format(keys, id))
        if as_numpy:
            return np.concatenate(data_to_concat)
        else:
            return np.concatenate(data_to_concat).tolist()
class MySortedSet: """Custom class to abstract redis sorted sets""" def __init__(self): self.members = SortedSet(key=self.sortedset_key) self.scoremap = {} def sortedset_key(self, x): return self.scoremap[x] def update(self, iterable, ch_flag=False): # Emulates set update scores, members = zip(*iterable) exist_count = 0 for ikey, key in enumerate(members): if key in self.scoremap: if ch_flag: if self.scoremap[key] == scores[ikey]: exist_count += 1 else: exist_count += 1 self.scoremap[key] = scores[ikey] for member in members: try: self.members.remove(member) except KeyError: continue except ValueError: continue self.members.update(members) return len(members) - exist_count def incr_update(self, iterable): # Increments the scores for already existing keys scores, members = zip(*iterable) for ikey, key in enumerate(members): if key in self.scoremap: self.scoremap[key] += scores[ikey] else: self.scoremap[key] = scores[ikey] for member in members: try: self.members.remove(member) except KeyError: continue except ValueError: continue self.members.update(members) return self.scoremap[members[-1]] def rank(self, member): try: return self.members.index(member) except KeyError: return '(nil)' def range(self, start, end, withscores): range_members = self.members[start:end] if withscores: range_scores = [self.scoremap[member] for member in range_members] return list(zip(range_members, range_scores)) else: return range_members