Beispiel #1
0
    def _new_lsh_index(self):
        """Build a MinHash LSH index from streamed Timesketch events.

        Events are streamed from the datastore using the query, index and
        field taken from ``self._config``. For every event a minhash is
        computed from the configured field of the event's ``_source`` and
        inserted into the LSH index.

        Returns:
            A tuple ``(lsh, minhashes)`` where ``lsh`` is the populated
            MinHashLSH instance and ``minhashes`` is a dictionary mapping
            ``(event_id, event_type, index_name)`` tuples to the minhash
            computed for that event.
        """
        config = self._config
        index = MinHashLSH(config.threshold, config.num_perm)
        signatures = {}

        # Stream the matching events; only the configured field is requested.
        event_stream = self._datastore.search_stream(
            query_string=config.query,
            query_filter={},
            indices=[config.index],
            return_fields=[config.field])

        with index.insertion_session() as session:
            for hit in event_stream:
                doc_id, index_name, doc_type = (
                    hit['_id'], hit['_index'], hit['_type'])
                text = hit['_source'][config.field]

                # The same tuple identifies the event in both the LSH index
                # and the returned dictionary.
                event_key = (doc_id, doc_type, index_name)
                signature = self._minhash_from_text(text)
                signatures[event_key] = signature
                session.insert(event_key, signature)

        return index, signatures
Beispiel #2
0
def new_lsh_index(events,
                  field,
                  delimiters=None,
                  num_perm=None,
                  threshold=None):
    """Build a MinHash LSH index from a set of Timesketch events.

    Args:
        events: list or an iterator of Event objects.
        field: string denoting the event field to use for the LSH.
        delimiters: list of strings used as delimiters for splitting text
            into words. Defaults to DEFAULT_DELIMITERS when None.
        num_perm: number of random permutation functions used by MinHash to
            be indexed. Defaults to DEFAULT_PERMUTATIONS when None.
        threshold: a float for the Jaccard similarity threshold between 0.0
            and 1.0. The initialized MinHash LSH will be optimized for the
            threshold by minimizing the false positive and false negative.
            Defaults to DEFAULT_THRESHOLD when None.

    Returns:
        A tuple ``(lsh, minhashes)`` where ``lsh`` is the populated
        MinHashLSH instance and ``minhashes`` is a dictionary mapping
        ``(event_id, event_type, index_name)`` tuples to the minhash
        computed for that event.
    """
    # Substitute the module-level defaults for any argument left unset.
    delimiters = DEFAULT_DELIMITERS if delimiters is None else delimiters
    num_perm = DEFAULT_PERMUTATIONS if num_perm is None else num_perm
    threshold = DEFAULT_THRESHOLD if threshold is None else threshold

    signatures = {}
    index = MinHashLSH(threshold, num_perm)

    with index.insertion_session() as session:
        for item in events:
            # The same tuple identifies the event in both the LSH index and
            # the returned dictionary.
            event_key = (item.event_id, item.event_type, item.index_name)
            text = item.source[field]
            signature = minhash_from_text(text, num_perm, delimiters)
            signatures[event_key] = signature
            session.insert(event_key, signature)

    return index, signatures
Beispiel #3
0
 def test_insertion_session(self):
     """Check that keys added through an insertion session are indexed.

     Two MinHashes, each updated with a single value, are inserted under
     the keys "a" and "b" via ``lsh.insertion_session()``. Afterwards both
     keys must appear in every hash table, be reported by ``in lsh``, and
     every bucket hash recorded for "a" must point at a bucket that
     contains "a".
     """
     lsh = MinHashLSH(threshold=0.5, num_perm=16)
     m1 = MinHash(16)
     m1.update("a".encode("utf8"))
     m2 = MinHash(16)
     m2.update("b".encode("utf8"))
     data = [("a", m1), ("b", m2)]
     with lsh.insertion_session() as session:
         for key, minhash in data:
             session.insert(key, minhash)
     # Specific assert methods are used instead of assertTrue so that a
     # failure reports the offending values rather than "False is not true".
     for t in lsh.hashtables:
         self.assertGreaterEqual(len(t), 1)
         # Flatten all buckets of this table into one list of stored keys.
         items = []
         for H in t:
             items.extend(t[H])
         self.assertIn("a", items)
         self.assertIn("b", items)
     self.assertIn("a", lsh)
     self.assertIn("b", lsh)
     # lsh.keys["a"] holds one bucket hash per hash table, in table order.
     for i, H in enumerate(lsh.keys["a"]):
         self.assertIn("a", lsh.hashtables[i][H])
Beispiel #4
0
def new_lsh_index(events, field, delimiters=None, num_perm=None,
                  threshold=None):
    """Index the minhashes of a set of Timesketch events in a new LSH.

    Args:
        events: list or an iterator of Event objects.
        field: string denoting the event field to use for the LSH.
        delimiters: list of strings used as delimiters for splitting text
            into words. DEFAULT_DELIMITERS is used when None.
        num_perm: number of random permutation functions used by MinHash to
            be indexed. DEFAULT_PERMUTATIONS is used when None.
        threshold: a float for the Jaccard similarity threshold between 0.0
            and 1.0. The initialized MinHash LSH will be optimized for the
            threshold by minimizing the false positive and false negative.
            DEFAULT_THRESHOLD is used when None.

    Returns:
        A tuple ``(lsh, minhashes)``: the populated MinHashLSH instance and
        a dictionary keyed by ``(event_id, event_type, index_name)`` tuples
        whose values are the minhashes computed for the matching events.
    """
    # Any tuning argument left as None falls back to its module default.
    delimiters = DEFAULT_DELIMITERS if delimiters is None else delimiters
    num_perm = DEFAULT_PERMUTATIONS if num_perm is None else num_perm
    threshold = DEFAULT_THRESHOLD if threshold is None else threshold

    hashes_by_key = {}
    lsh_index = MinHashLSH(threshold, num_perm)

    with lsh_index.insertion_session() as writer:
        for entry in events:
            # One tuple serves as the identifier in both the LSH index and
            # the returned lookup dictionary.
            entry_key = (entry.event_id, entry.event_type, entry.index_name)
            entry_hash = minhash_from_text(
                entry.source[field], num_perm, delimiters)
            hashes_by_key[entry_key] = entry_hash
            writer.insert(entry_key, entry_hash)

    return lsh_index, hashes_by_key