Beispiel #1
0
class Trie(ABCTrie):
    """Wrapper that stores its entries in a ``DATrie``.

    The alphabet given to ``DATrie`` is the set of characters found in the
    keys of the initial ``data``.  Every query method below simply forwards
    to the wrapped ``DATrie`` object held in ``self._data``.
    """

    def __init__(self, data):
        """Build the trie from ``data``.

        :param data: object providing ``keys()`` and ``items()``; every key
            must be a ``unicode`` instance.
        :raises TypeError: as soon as a non-``unicode`` key is met.
        """
        alphabet = set()
        for key in data.keys():
            if not isinstance(key, unicode):
                raise TypeError(u"All keys must be strings")
            # Adds each character of the key to the alphabet.
            alphabet.update(key)

        self._data = DATrie(u"".join(alphabet))
        for key, value in data.items():
            self._data[key] = value

    __init__.func_annotations = {}

    def __contains__(self, key):
        """Return whether ``key`` is in the wrapped trie."""
        return key in self._data

    __contains__.func_annotations = {}

    def __len__(self):
        """Return ``len()`` of the wrapped trie."""
        return len(self._data)

    __len__.func_annotations = {}

    def __iter__(self):
        """Iteration is not supported; always raises NotImplementedError."""
        raise NotImplementedError()

    __iter__.func_annotations = {}

    def __getitem__(self, key):
        """Return the wrapped trie's entry for ``key``."""
        return self._data[key]

    __getitem__.func_annotations = {}

    def keys(self, prefix=None):
        """Forward to ``keys(prefix)`` on the wrapped trie."""
        return self._data.keys(prefix)

    keys.func_annotations = {}

    def has_keys_with_prefix(self, prefix):
        """Forward to ``has_keys_with_prefix(prefix)`` on the wrapped trie."""
        return self._data.has_keys_with_prefix(prefix)

    has_keys_with_prefix.func_annotations = {}

    def longest_prefix(self, prefix):
        """Forward to ``longest_prefix(prefix)`` on the wrapped trie."""
        return self._data.longest_prefix(prefix)

    longest_prefix.func_annotations = {}

    def longest_prefix_item(self, prefix):
        """Forward to ``longest_prefix_item(prefix)`` on the wrapped trie."""
        return self._data.longest_prefix_item(prefix)

    longest_prefix_item.func_annotations = {}
Beispiel #2
0
    def start_tracking_deletions(self):
        """
        Turn on deletion tracking.

        While tracking is on, deleted subtrees are remembered so that
        update_hwm can skip updates to keys that have subsequently been
        deleted.

        Every call should be matched by a later stop_tracking_deletions()
        call, which releases the tracking data structures.
        """
        _log.info("Started tracking deletions")
        # Start from a new deletions trie and forget any previously
        # recorded latest-deletion index.
        self._deletion_hwms = Trie(TRIE_CHARS)
        self._latest_deletion = None
Beispiel #3
0
    def __init__(self):
        """Set up the per-key HWM trie with deletion tracking switched off."""
        # Highest etcd index seen so far for each key, held in a trie.  The
        # trie implementation needs its character set fixed up front, so the
        # expected characters are allowed explicitly and anything else is
        # encoded before being used as a key.
        self._hwms = Trie(TRIE_CHARS)

        # A Trie while deletions are being tracked; None at all other times.
        self._deletion_hwms = None
        # Optimization: the highest etcd index at which a deletion has been
        # seen.  Events newer than this can skip the expensive lookup in the
        # _deletion_hwms trie.
        self._latest_deletion = None
Beispiel #4
0
class Trie(ABCTrie):
    """Thin facade over a ``DATrie`` built from an initial set of entries.

    The characters appearing in the initial keys form the alphabet passed to
    ``DATrie``; lookups and prefix queries are delegated to that object.
    """

    def __init__(self, data):
        """Populate the trie from ``data``.

        :param data: object exposing ``keys()`` and ``items()`` whose keys
            are all ``unicode`` instances.
        :raises TypeError: on the first key that is not ``unicode``.
        """
        seen_chars = set()
        for key in data.keys():
            if not isinstance(key, unicode):
                raise TypeError(u"All keys must be strings")
            # Collect every character of this key.
            seen_chars.update(key)

        self._data = DATrie(u"".join(seen_chars))
        for key, value in data.items():
            self._data[key] = value
    __init__.func_annotations = {}

    def __contains__(self, key):
        """Membership test, answered by the underlying trie."""
        return key in self._data
    __contains__.func_annotations = {}

    def __len__(self):
        """Length as reported by the underlying trie."""
        return len(self._data)
    __len__.func_annotations = {}

    def __iter__(self):
        """Not supported: unconditionally raises NotImplementedError."""
        raise NotImplementedError()
    __iter__.func_annotations = {}

    def __getitem__(self, key):
        """Item lookup, answered by the underlying trie."""
        return self._data[key]
    __getitem__.func_annotations = {}

    def keys(self, prefix=None):
        """Delegate to the underlying trie's ``keys(prefix)``."""
        return self._data.keys(prefix)
    keys.func_annotations = {}

    def has_keys_with_prefix(self, prefix):
        """Delegate to the underlying trie's ``has_keys_with_prefix``."""
        return self._data.has_keys_with_prefix(prefix)
    has_keys_with_prefix.func_annotations = {}

    def longest_prefix(self, prefix):
        """Delegate to the underlying trie's ``longest_prefix``."""
        return self._data.longest_prefix(prefix)
    longest_prefix.func_annotations = {}

    def longest_prefix_item(self, prefix):
        """Delegate to the underlying trie's ``longest_prefix_item``."""
        return self._data.longest_prefix_item(prefix)
    longest_prefix_item.func_annotations = {}
Beispiel #5
0
def trie_unpickler(bytes):
    """Rebuild a ``Trie`` from its serialized byte string.

    ``Trie.read`` is given a file object, so the payload is spilled to a
    temporary file, read back through ``Trie.read`` and the file is removed
    again whatever happens.

    :param bytes: serialized trie contents (the parameter name shadows the
        builtin but is kept for interface compatibility).
    :return: whatever ``Trie.read`` returns for the written file.
    """
    handle, path = tempfile.mkstemp()
    try:
        # mkstemp() hands back an already-open OS-level descriptor.  Wrap it
        # so the payload is written through it and it is closed afterwards,
        # instead of leaking it and opening the path a second time.
        with os.fdopen(handle, 'wb') as tmp:
            tmp.write(bytes)
        with open(path, 'rb') as tmp:
            return Trie.read(tmp)
    finally:
        # Covers failures while writing as well as while reading.
        os.unlink(path)
Beispiel #6
0
    def __init__(self, data):
        """Build the backing ``DATrie`` from ``data``.

        The alphabet passed to ``DATrie`` is the set of characters found in
        the keys of ``data``; every entry of ``data`` is then copied in.

        :param data: object providing ``keys()`` and ``items()``.
        :raises TypeError: on the first key that is not a ``text_type``.
        """
        alphabet = set()
        for key in data.keys():
            if not isinstance(key, text_type):
                raise TypeError("All keys must be strings")
            # Adds each character of the key to the alphabet.
            alphabet.update(key)

        self._data = DATrie("".join(alphabet))
        for key, value in data.items():
            self._data[key] = value
Beispiel #7
0
    def __init__(self, data):
        """Create the wrapped ``DATrie`` and load ``data`` into it.

        All keys are validated and their characters gathered first; the
        resulting character set becomes the ``DATrie`` alphabet.

        :param data: object exposing ``keys()`` and ``items()``.
        :raises TypeError: if a key is not an instance of ``text_type``.
        """
        seen_chars = set()
        for key in data.keys():
            if not isinstance(key, text_type):
                raise TypeError("All keys must be strings")
            # Collect every character of this key.
            seen_chars.update(key)

        self._data = DATrie("".join(seen_chars))
        for key, value in data.items():
            self._data[key] = value
Beispiel #8
0
    def start_tracking_deletions(self):
        """
        Begin recording which subtrees get deleted.

        The recorded deletions let update_hwm skip updates to keys that
        have subsequently been deleted.

        Pair this with a call to stop_tracking_deletions(), which frees the
        tracking data structures again.
        """
        _log.info("Started tracking deletions")
        # A new deletions trie replaces whatever was there before, and the
        # latest-deletion index is reset along with it.
        self._deletion_hwms = Trie(TRIE_CHARS)
        self._latest_deletion = None
Beispiel #9
0
    def __init__(self):
        """Initialise the HWM trie; deletion tracking starts disabled."""
        # Trie mapping each key to the highest etcd index at which it has
        # been seen.  Because the trie implementation forces a fixed
        # character set, the characters we expect are allowed explicitly and
        # any unexpected ones get encoded.
        self._hwms = Trie(TRIE_CHARS)

        # Holds a Trie only while deletions are being tracked, else None.
        self._deletion_hwms = None
        # Optimization: highest etcd index of any deletion seen.  It lets
        # events that come after the deletion avoid an expensive lookup in
        # the _deletion_hwms trie.
        self._latest_deletion = None
Beispiel #10
0
class Trie(ABCTrie):
    """Wrapper that keeps its entries in a ``DATrie``.

    The ``DATrie`` alphabet is derived from the characters of the initial
    keys.  All query methods forward to the wrapped object in ``self._data``.
    """

    def __init__(self, data):
        """Build the trie from ``data``.

        :param data: object providing ``keys()`` and ``items()``; every key
            must be a ``text_type`` instance.
        :raises TypeError: as soon as a key of another type is met.
        """
        alphabet = set()
        for key in data.keys():
            if not isinstance(key, text_type):
                raise TypeError("All keys must be strings")
            # Adds each character of the key to the alphabet.
            alphabet.update(key)

        self._data = DATrie("".join(alphabet))
        for key, value in data.items():
            self._data[key] = value

    def __contains__(self, key):
        """Return whether ``key`` is in the wrapped trie."""
        return key in self._data

    def __len__(self):
        """Return ``len()`` of the wrapped trie."""
        return len(self._data)

    def __iter__(self):
        """Iteration is not supported; always raises NotImplementedError."""
        raise NotImplementedError()

    def __getitem__(self, key):
        """Return the wrapped trie's entry for ``key``."""
        return self._data[key]

    def keys(self, prefix=None):
        """Forward to ``keys(prefix)`` on the wrapped trie."""
        return self._data.keys(prefix)

    def has_keys_with_prefix(self, prefix):
        """Forward to ``has_keys_with_prefix(prefix)`` on the wrapped trie."""
        return self._data.has_keys_with_prefix(prefix)

    def longest_prefix(self, prefix):
        """Forward to ``longest_prefix(prefix)`` on the wrapped trie."""
        return self._data.longest_prefix(prefix)

    def longest_prefix_item(self, prefix):
        """Forward to ``longest_prefix_item(prefix)`` on the wrapped trie."""
        return self._data.longest_prefix_item(prefix)
Beispiel #11
0
class Trie(ABCTrie):
    """Thin facade over a ``DATrie`` built from an initial set of entries.

    The characters appearing in the initial keys form the alphabet passed to
    ``DATrie``; lookups and prefix queries are delegated to that object.
    """

    def __init__(self, data):
        """Populate the trie from ``data``.

        :param data: object exposing ``keys()`` and ``items()`` whose keys
            are all ``text_type`` instances.
        :raises TypeError: on the first key that is not a ``text_type``.
        """
        seen_chars = set()
        for key in data.keys():
            if not isinstance(key, text_type):
                raise TypeError("All keys must be strings")
            # Collect every character of this key.
            seen_chars.update(key)

        self._data = DATrie("".join(seen_chars))
        for key, value in data.items():
            self._data[key] = value

    def __contains__(self, key):
        """Membership test, answered by the underlying trie."""
        return key in self._data

    def __len__(self):
        """Length as reported by the underlying trie."""
        return len(self._data)

    def __iter__(self):
        """Not supported: unconditionally raises NotImplementedError."""
        raise NotImplementedError()

    def __getitem__(self, key):
        """Item lookup, answered by the underlying trie."""
        return self._data[key]

    def keys(self, prefix=None):
        """Delegate to the underlying trie's ``keys(prefix)``."""
        return self._data.keys(prefix)

    def has_keys_with_prefix(self, prefix):
        """Delegate to the underlying trie's ``has_keys_with_prefix``."""
        return self._data.has_keys_with_prefix(prefix)

    def longest_prefix(self, prefix):
        """Delegate to the underlying trie's ``longest_prefix``."""
        return self._data.longest_prefix(prefix)

    def longest_prefix_item(self, prefix):
        """Delegate to the underlying trie's ``longest_prefix_item``."""
        return self._data.longest_prefix_item(prefix)
Beispiel #12
0
class HighWaterTracker(object):
    """
    Tracks the highest etcd index for which we've seen a particular
    etcd key.

    This class is expected to be used as follows:

    Starting with a resync, while also merging events from our watch on etcd:

    * Call start_tracking_deletions() to enable resolution between events
      and the snapshot.
    * Repeatedly call update_hwm() and store_deletion(), feeding in the
      data from the snapshot and event stream.
    * At the end of the snapshot processing, call stop_tracking_deletions()
      to discard the tracking metadata (which would otherwise grow
      indefinitely).
    * Call remove_old_keys() to find and delete any keys that have not been
      seen since before the snapshot was started, and hence must have been
      deleted before the snapshot was taken.

    While in sync:

    * feed in events with update_hwm() and store_deletion().

    At any point, if a new resync is required restart from
    "Call start_tracking_deletions()..."

    """
    def __init__(self):
        # We use a trie to track the highest etcd index at which we've seen
        # each key.  The trie implementation forces a fixed character set;
        # we explicitly allow the characters we expect and encode any others
        # that we're not expecting.
        self._hwms = Trie(TRIE_CHARS)

        # Set to a Trie while we're tracking deletions.  None otherwise.
        self._deletion_hwms = None
        # Optimization: tracks the highest etcd index at which we've seen a
        # deletion.  This allows us to skip an expensive lookup in the
        # _deletion_hwms trie for events that come after the deletion.
        # None until the first deletion is stored.
        self._latest_deletion = None

    def start_tracking_deletions(self):
        """
        Starts tracking which subtrees have been deleted so that update_hwm
        can skip updates to keys that have subsequently been deleted.

        Should be paired with a call to stop_tracking_deletions() to release
        the associated tracking data structures.
        """
        _log.info("Started tracking deletions")
        self._deletion_hwms = Trie(TRIE_CHARS)
        self._latest_deletion = None

    def stop_tracking_deletions(self):
        """
        Stops deletion tracking and frees up the associated resources.

        Calling this asserts that subsequent calls to update_hwm() will only
        use HWMs after any stored deletes.
        """
        _log.info("Stopped tracking deletions")
        self._deletion_hwms = None
        self._latest_deletion = None

    def update_hwm(self, key, new_mod_idx):
        """
        Updates the HWM for a key if the new value is greater than the old.
        If deletion tracking is enabled, resolves deletions so that updates
        to subtrees that have been deleted are skipped iff the deletion is
        after the update in HWM order.

        :param key: the (unencoded) etcd key.
        :param new_mod_idx: the etcd index at which the key was seen.
        :return int|NoneType: the old HWM of the key (or the HWM at which it
                was deleted) or None if it did not previously exist.
        """
        _log.debug("Updating HWM for %s to %s", key, new_mod_idx)
        key = encode_key(key)
        if (self._deletion_hwms is not None and
                # Optimization: avoid expensive lookup if this update comes
                # after all deletions.  No deletion stored yet (None) means
                # there is nothing to look up; the explicit check avoids
                # ordering an int against None, which Python 3 rejects.
                self._latest_deletion is not None and
                new_mod_idx < self._latest_deletion):
            # We're tracking deletions, check that this key hasn't been
            # deleted.
            del_hwm = self._deletion_hwms.longest_prefix_value(key, None)
            # None means no deleted prefix covers this key.
            if del_hwm is not None and new_mod_idx < del_hwm:
                _log.debug("Key %s previously deleted, skipping", key)
                return del_hwm
        try:
            old_hwm = self._hwms[key]  # Trie doesn't have get().
        except KeyError:
            old_hwm = None
        # A key we have never seen (None) always takes the new index.
        if old_hwm is None or old_hwm < new_mod_idx:
            _log.debug("Key %s HWM updated to %s, previous %s", key,
                       new_mod_idx, old_hwm)
            self._hwms[key] = new_mod_idx
        return old_hwm

    def store_deletion(self, key, deletion_mod_idx):
        """
        Store that a given key (or directory) was deleted at a given HWM.

        :param key: the (unencoded) etcd key or directory that was deleted.
        :param deletion_mod_idx: the etcd index of the deletion.
        :return: List of known keys that were deleted.  This will be the
                 leaves only when a subtree is being deleted.
        """
        _log.debug("Key %s deleted", key)
        key = encode_key(key)
        # Keep the maximum deletion index seen; spelled out rather than
        # using max() so that the initial None is never compared to an int.
        if (self._latest_deletion is None or
                deletion_mod_idx > self._latest_deletion):
            self._latest_deletion = deletion_mod_idx
        if self._deletion_hwms is not None:
            _log.debug("Tracking deletion in deletions trie")
            self._deletion_hwms[key] = deletion_mod_idx
        deleted_keys = []
        # Drop every tracked key that lives under the deleted prefix.
        for child_key, _child_mod in self._hwms.items(key):
            del self._hwms[child_key]
            deleted_keys.append(decode_key(child_key))
        _log.debug("Found %s keys deleted under %s", len(deleted_keys), key)
        return deleted_keys

    def remove_old_keys(self, hwm_limit):
        """
        Deletes and returns all keys that have HWMs less than hwm_limit.

        :param hwm_limit: keys whose HWM is below this index are removed.
        :return: list of keys that were deleted.
        """
        assert not self._deletion_hwms, \
            "Delete tracking incompatible with remove_old_keys()"
        _log.info("Removing keys that are older than %s", hwm_limit)
        old_keys = []
        state = datrie.State(self._hwms)
        state.walk(u"")
        it = datrie.Iterator(state)
        # Collect first, delete afterwards, so the trie is not modified
        # while it is being walked.
        while it.next():
            value = it.data()
            if value < hwm_limit:
                old_keys.append(it.key())
        for old_key in old_keys:
            del self._hwms[old_key]
        _log.info("Deleted %s old keys", len(old_keys))
        # Build a real list: map() is a lazy iterator on Python 3.
        return [decode_key(old_key) for old_key in old_keys]

    def __len__(self):
        """Return the number of keys currently tracked."""
        return len(self._hwms)
Beispiel #13
0
 def __init__(self, trie=None, untrieable=None):
   """Store the trie and the companion mapping for this instance.

   :param trie: trie to use; when omitted a new ``Trie(string.printable)``
       is created.
   :param untrieable: companion mapping; when omitted a new empty dict is
       created.
   """
   # Defaults are created per call.  Default argument values are evaluated
   # once, at definition time, so the former ``Trie(...)`` / ``{}`` defaults
   # were shared (and mutated) across every instance built without arguments.
   self.trie = Trie(string.printable) if trie is None else trie
   self.untrieable = {} if untrieable is None else untrieable
   self.max_index = 2 ** 28
Beispiel #14
0
 def load(cls, prefix):
   """Rebuild an instance from the two files stored under ``prefix``.

   The trie is read from ``prefix + ".trie"`` with ``Trie.load`` and the
   companion object is unpickled from ``prefix + ".utrie"``; both are then
   passed to ``cls``.
   """
   trie = Trie.load(prefix + ".trie")
   utrie_path = prefix + ".utrie"
   # NOTE(review): unpickling runs arbitrary code from the file, so this
   # should only ever be pointed at trusted files.
   with open(utrie_path, 'rb') as in_f:
     untrieable = pickle.load(in_f)
   return cls(trie, untrieable)
Beispiel #15
0
# Need a fast trie (prefix tree) data structure. Choose from 3 libraries with (almost) compatible APIs.
# The backend is selected by hand-editing the `if False` / `elif False`
# conditions below; as written, the `else` branch (dawg) is the one that runs.
# Each backend gets its own dump file name so the dumps are kept apart.
if False:
    from datrie import Trie  # pip install datrie
    trie_path = 'datrie.dump'
elif False:
    from marisa_trie import Trie  # pip install marisa-trie
    trie_path = 'marisa_trie.dump'
else:
    from dawg import CompletionDAWG as Trie  # pip install dawg
    trie_path = 'dawg.dump'

if os.path.exists(trie_path):
    # A dump from an earlier run exists: load it.
    if Trie.__module__ == 'dawg':
        # For dawg, load() is called on an instance...
        trie = Trie()
        trie.load(trie_path)
    else:
        # ...while for the other backends it is called on the class.
        trie = Trie.load(trie_path)
else:
    # No dump yet: read the word list (fetching it first if necessary).
    dict_path = "garbled_email_dictionary.txt"
    if not os.path.exists(dict_path):
        # Download Code Jam's dictionary into dict_path.
        dict_url = "https://code.google.com/codejam/contest/static/garbled_email_dictionary.txt"
        from urllib.request import urlretrieve
        urlretrieve(dict_url, dict_path)

    # One word per line; surrounding whitespace is stripped.
    with open(dict_path) as f:
        words = [line.strip() for line in f.readlines()]

    if Trie.__module__ == 'datrie':
Beispiel #16
0
class HighWaterTracker(object):
    """
    Tracks the highest etcd index for which we've seen a particular
    etcd key.

    This class is expected to be used as follows:

    Starting with a resync, while also merging events from our watch on etcd:

    * Call start_tracking_deletions() to enable resolution between events
      and the snapshot.
    * Repeatedly call update_hwm() and store_deletion(), feeding in the
      data from the snapshot and event stream.
    * At the end of the snapshot processing, call stop_tracking_deletions()
      to discard the tracking metadata (which would otherwise grow
      indefinitely).
    * Call remove_old_keys() to find and delete any keys that have not been
      seen since before the snapshot was started, and hence must have been
      deleted before the snapshot was taken.

    While in sync:

    * feed in events with update_hwm() and store_deletion().

    At any point, if a new resync is required restart from
    "Call start_tracking_deletions()..."

    """
    def __init__(self):
        # We use a trie to track the highest etcd index at which we've seen
        # each key.  The trie implementation forces a fixed character set;
        # we explicitly allow the characters we expect and encode any others
        # that we're not expecting.
        self._hwms = Trie(TRIE_CHARS)

        # Set to a Trie while we're tracking deletions.  None otherwise.
        self._deletion_hwms = None
        # Optimization: tracks the highest etcd index at which we've seen a
        # deletion.  This allows us to skip an expensive lookup in the
        # _deletion_hwms trie for events that come after the deletion.
        # None until the first deletion is stored.
        self._latest_deletion = None

    def start_tracking_deletions(self):
        """
        Starts tracking which subtrees have been deleted so that update_hwm
        can skip updates to keys that have subsequently been deleted.

        Should be paired with a call to stop_tracking_deletions() to release
        the associated tracking data structures.
        """
        _log.info("Started tracking deletions")
        self._deletion_hwms = Trie(TRIE_CHARS)
        self._latest_deletion = None

    def stop_tracking_deletions(self):
        """
        Stops deletion tracking and frees up the associated resources.

        Calling this asserts that subsequent calls to update_hwm() will only
        use HWMs after any stored deletes.
        """
        _log.info("Stopped tracking deletions")
        self._deletion_hwms = None
        self._latest_deletion = None

    def update_hwm(self, key, new_mod_idx):
        """
        Updates the HWM for a key if the new value is greater than the old.
        If deletion tracking is enabled, resolves deletions so that updates
        to subtrees that have been deleted are skipped iff the deletion is
        after the update in HWM order.

        :param key: the (unencoded) etcd key.
        :param new_mod_idx: the etcd index at which the key was seen.
        :return int|NoneType: the old HWM of the key (or the HWM at which it
                was deleted) or None if it did not previously exist.
        """
        _log.debug("Updating HWM for %s to %s", key, new_mod_idx)
        key = encode_key(key)
        if (self._deletion_hwms is not None and
                # Optimization: avoid expensive lookup if this update comes
                # after all deletions.  No deletion stored yet (None) means
                # there is nothing to look up; the explicit check avoids
                # ordering an int against None, which Python 3 rejects.
                self._latest_deletion is not None and
                new_mod_idx < self._latest_deletion):
            # We're tracking deletions, check that this key hasn't been
            # deleted.
            del_hwm = self._deletion_hwms.longest_prefix_value(key, None)
            # None means no deleted prefix covers this key.
            if del_hwm is not None and new_mod_idx < del_hwm:
                _log.debug("Key %s previously deleted, skipping", key)
                return del_hwm
        try:
            old_hwm = self._hwms[key]  # Trie doesn't have get().
        except KeyError:
            old_hwm = None
        # A key we have never seen (None) always takes the new index.
        if old_hwm is None or old_hwm < new_mod_idx:
            _log.debug("Key %s HWM updated to %s, previous %s",
                       key, new_mod_idx, old_hwm)
            self._hwms[key] = new_mod_idx
        return old_hwm

    def store_deletion(self, key, deletion_mod_idx):
        """
        Store that a given key (or directory) was deleted at a given HWM.

        :param key: the (unencoded) etcd key or directory that was deleted.
        :param deletion_mod_idx: the etcd index of the deletion.
        :return: List of known keys that were deleted.  This will be the
                 leaves only when a subtree is being deleted.
        """
        _log.debug("Key %s deleted", key)
        key = encode_key(key)
        # Keep the maximum deletion index seen; spelled out rather than
        # using max() so that the initial None is never compared to an int.
        if (self._latest_deletion is None or
                deletion_mod_idx > self._latest_deletion):
            self._latest_deletion = deletion_mod_idx
        if self._deletion_hwms is not None:
            _log.debug("Tracking deletion in deletions trie")
            self._deletion_hwms[key] = deletion_mod_idx
        deleted_keys = []
        # Drop every tracked key that lives under the deleted prefix.
        for child_key, _child_mod in self._hwms.items(key):
            del self._hwms[child_key]
            deleted_keys.append(decode_key(child_key))
        _log.debug("Found %s keys deleted under %s", len(deleted_keys), key)
        return deleted_keys

    def remove_old_keys(self, hwm_limit):
        """
        Deletes and returns all keys that have HWMs less than hwm_limit.

        :param hwm_limit: keys whose HWM is below this index are removed.
        :return: list of keys that were deleted.
        """
        assert not self._deletion_hwms, \
            "Delete tracking incompatible with remove_old_keys()"
        _log.info("Removing keys that are older than %s", hwm_limit)
        old_keys = []
        state = datrie.State(self._hwms)
        state.walk(u"")
        it = datrie.Iterator(state)
        # Collect first, delete afterwards, so the trie is not modified
        # while it is being walked.
        while it.next():
            value = it.data()
            if value < hwm_limit:
                old_keys.append(it.key())
        for old_key in old_keys:
            del self._hwms[old_key]
        _log.info("Deleted %s old keys", len(old_keys))
        # Build a real list: map() is a lazy iterator on Python 3.
        return [decode_key(old_key) for old_key in old_keys]

    def __len__(self):
        """Return the number of keys currently tracked."""
        return len(self._hwms)