Example #1
0
def suffix_reverse_match_prepare_collection(suffix_collection, separator=""):
    '''
    Build the lookup object used for reverse suffix matching.

    suffix_collection is the collection of suffix strings to prepare, and
    must not be None.
    separator, when non-empty, must also appear before a matching suffix
    (for example, "." between domain components). It must not be None.

    The collection is handed to suffix_reverse_match_collate_collection(),
    and the size of the result (item count, and JSON-encoded byte size) is
    logged at info level.

    Returns the collated object. Callers must treat it as opaque and
    read-only.
    '''
    assert suffix_collection is not None
    assert separator is not None

    # The matcher relies on a binary search. That is efficient, even though
    # it doubles the RAM requirement: we are unlikely to be able to drop every
    # reference to the original strings after reversing them anyway.
    prepared = suffix_reverse_match_collate_collection(suffix_collection,
                                                       separator)

    # Uniquifying is deliberately skipped: it takes about 20 seconds for the
    # Alexa Top 1 million, and only finds 239 duplicates.
    #suffix_match_uniquify_collection(prepared, separator)

    # The JSON encoding measures transmission size, not RAM size
    item_count = len(prepared)
    encoded_size = format_bytes(len(json_serialise(prepared)))
    logging.info("Suffix match prepared {} items ({})".format(
        item_count, encoded_size))

    return prepared
Example #2
0
def exact_match_prepare_collection(exact_collection,
                                   existing_exacts=None,
                                   validate=True):
    '''
    Prepare a hashable object collection for efficient exact matching.
    Each object is passed through lower_if_hasattr() before it is stored
    (so strings are lowercased).

    exact_collection is the sized collection of hashable items to prepare.
    It must not be None.

    existing_exacts is a list of previously prepared collections.

    If existing_exacts is not None, append the new collection to
    existing_exacts, as well as returning the prepared collection.

    If multiple lists are prepared using the same existing_exacts, then
    the final lists are disjoint. Any duplicate items are ignored, and a
    warning is logged. (An item that appears in multiple input lists is
    output in the earliest list it appears in.)

    If validate is True, calls exact_match_validate_item() for each item in
    exact_collection. Validation runs against the full prepared set, before
    any items that are also in existing_exacts are removed.

    Returns a frozenset that can be passed to exact_match().
    This object must be treated as opaque and read-only.
    '''
    assert exact_collection is not None
    # Set matching uses a hash table, so it's more efficient
    exact_collection_lower = [
        lower_if_hasattr(obj) for obj in exact_collection
    ]
    exact_set = frozenset(exact_collection_lower)
    # Log a message if there were any duplicates
    if len(exact_collection) != len(exact_set):
        # Tally every item in one pass. Calling list.count() once per unique
        # item is quadratic, which is very slow on large lists.
        from collections import Counter
        occurrences = Counter(exact_collection_lower)
        # Iterate the set (not the tally) so the duplicates are reported in
        # the same order as before
        dups = [
            obj for obj in exact_set
            if occurrences[lower_if_hasattr(obj)] > 1
        ]
        logging.warning("Removing {} duplicates within this collection".format(
            summarise_list(dups)))
    # the encoded json measures transmission size, not RAM size
    logging.info("Exact match prepared {} items ({})".format(
        len(exact_set), format_bytes(len(json_serialise(list(exact_set))))))

    # Check that each item actually matches the list
    if validate:
        for item in exact_collection:
            exact_match_validate_item(exact_set, item, exact_collection)

    # Without earlier collections, there is nothing to make disjoint
    if existing_exacts is None:
        return exact_set

    # Remove any items that appear in earlier lists
    disjoint_exact_set = exact_set.difference(*existing_exacts)
    duplicate_exact_set = exact_set.difference(disjoint_exact_set)
    if len(duplicate_exact_set) > 0:
        logging.warning(
            "Removing {} duplicates that are also in an earlier collection"
            .format(summarise_list(duplicate_exact_set)))
    # Record this collection, so later collections are disjoint from it
    existing_exacts.append(disjoint_exact_set)
    return disjoint_exact_set
Example #3
0
def suffix_match_prepare_collection(suffix_collection,
                                    separator="",
                                    existing_suffixes=None,
                                    collection_tag=-1,
                                    validate=True):
    '''
    Prepare a collection of strings for efficient suffix matching.
    If specified, the separator is also required before the suffix.
    For example, domain suffixes use "." as a separator between components.

    The suffixes are stored in a tree of nested dicts, keyed by the
    components returned by suffix_match_split(). The last component of each
    stored suffix maps to collection_tag, rather than to a child dict.

    existing_suffixes is a previously prepared suffix data structure.

    If existing_suffixes is not None, add the new suffixes to
    existing_suffixes (it is modified in place), and return
    existing_suffixes. Each suffix is terminated
    with collection_tag, which can be used to distinguish between suffixes
    from different lists. collection_tag must be a non-dict type that is not
    None.

    If multiple lists are prepared using the same existing_suffixes, then the
    final suffix data structure is disjoint. Any duplicate or longer
    suffixes are eliminated, and a warning is logged. (A suffix that appears
    in multiple input lists is output in the earliest list it appears in.
    A shorter suffix in a later list replaces any longer suffixes in earlier
    lists.)

    If validate is True, calls suffix_match_validate_item() for each
    item in suffix_collection, immediately after that item is processed.

    Returns a tuple containing an object that can be passed to suffix_match(),
    and a boolean that is True if any duplicate domains were found (that is,
    if any item was ignored because an equal or shorter suffix was already
    present, or if any item replaced existing longer suffixes).
    The object must be treated as opaque and read-only.
    '''
    assert suffix_collection is not None
    assert is_collection_tag_valid(collection_tag)
    #assert type(existing_suffixes) == dict
    # Create a tree of suffix components using nested python dicts
    # the terminal object is the collection tag
    if existing_suffixes is None:
        suffix_obj = {}
    else:
        # reuse (and mutate) the caller's tree
        suffix_obj = existing_suffixes
    # items that were skipped, because the tree already contains an equal or
    # shorter suffix
    longer_suffix_list = []
    duplicates = False
    for insert_string in suffix_collection:
        #assert type(suffix_obj) == dict
        insert_list = suffix_match_split(insert_string, separator=separator)
        # prev_suffix_node is the parent of suffix_node. After the walk, it is
        # the dict that holds the entry for the final component
        prev_suffix_node = None
        suffix_node = suffix_obj
        # did we terminate the loop early due to a longer suffix?
        has_longer_suffix = False
        for insert_component in insert_list:
            # since we have stripped periods from the start and end, a double
            # dot is almost certainly a typo
            assert len(insert_component) > 0

            # we are allowed to add any child to the root
            # but we cannot add a child to an existing terminal object
            # because the existing tree contains a shorter suffix of
            # the string we are inserting
            #assert type(suffix_node) == dict
            next_suffix_node = suffix_node.get(insert_component)
            if (is_collection_tag_valid(next_suffix_node)):
                # there is an existing suffix that terminates here, and we are
                # about to insert a longer suffix. Instead, ignore the longer
                # suffix
                # (an exact duplicate also ends up here, on its final
                # component)
                has_longer_suffix = True
                longer_suffix_list.append(insert_string)
                break

            # walk the tree, adding an entry for this suffix
            # (a missing component gets a new empty dict)
            prev_suffix_node = suffix_node
            suffix_node = (next_suffix_node if next_suffix_node is not None
                           else suffix_node.setdefault(insert_component, {}))

        # we cannot have children in our terminal object
        # because the existing tree contains longer strings, and we are
        # a shorter suffix of all those existing strings
        if (not has_longer_suffix and not is_collection_tag_valid(suffix_node)
                and len(suffix_node) > 0):
            duplicates = True
            # the warning gets a summary, the debug log gets every child
            child_summary = summarise_list(suffix_node.keys())
            child_all = " ".join(suffix_node.keys())
            logging.warning(
                "Adding shorter suffix {} for collection {}, pruning existing children {}"
                .format(insert_string, collection_tag, child_summary))
            logging.debug(
                "Adding shorter suffix {} for collection {}, pruning existing children {}"
                .format(insert_string, collection_tag, child_all))

        # now, place (or replace) the end of the domain with the collection tag
        # this overwrites the dict stored for the final component, which
        # discards any children it had
        # NOTE(review): this relies on insert_component and prev_suffix_node
        # being set by the loop above. If suffix_match_split() ever returns an
        # empty list, insert_component is unbound (or left over from the
        # previous item) and prev_suffix_node is None - confirm that the split
        # never yields an empty list.
        if not has_longer_suffix:
            #assert prev_suffix_node is not None
            prev_suffix_node[insert_component] = collection_tag

        # Now check that each item actually matches one of the lists
        # Allow the lists to have overlaps, we'll check that later
        # This runs for every item, including the ignored longer suffixes
        if validate:
            suffix_match_validate_item(suffix_obj,
                                       insert_string,
                                       suffix_collection,
                                       separator=separator,
                                       expected_collection_tag=collection_tag,
                                       reject_overlapping_lists=False)

    # report all the ignored items at once, rather than once per item
    if len(longer_suffix_list) > 0:
        duplicates = True
        suffix_summary = summarise_list(longer_suffix_list)
        suffix_all = " ".join(longer_suffix_list)
        logging.warning(
            "Suffix match for {} ignored longer suffixes {}".format(
                collection_tag, suffix_summary))
        logging.debug("Suffix match for {} ignored longer suffixes {}".format(
            collection_tag, suffix_all))

    # the encoded json measures transmission size, not RAM size
    # the item count is the size of the input collection, and the byte size
    # is for the whole tree (including any existing suffixes)
    logging.info("Suffix match {} prepared {} items ({})".format(
        collection_tag, len(suffix_collection),
        format_bytes(len(json_serialise(suffix_obj)))))

    return (suffix_obj, duplicates)
Example #4
0
def exact_match_prepare_collection(exact_collection,
                                   existing_exacts=None,
                                   validate=True,
                                   match_onion_md5=False):
    '''
    Prepare a hashable object collection for efficient exact matching.
    Each object is passed through lower_if_hasattr() before it is stored
    (so strings are lowercased).

    exact_collection is the sized collection of hashable items to prepare.
    It must not be None.

    existing_exacts is a list of previously prepared collections.

    If existing_exacts is not None, append the new collection to
    existing_exacts, as well as returning the prepared collection.

    If multiple lists are prepared using the same existing_exacts, then
    the final lists are disjoint. Any duplicate items are ignored, and a
    warning is logged. (An item that appears in multiple input lists is
    output in the earliest list it appears in.)

    If validate is True, calls exact_match_validate_item() for each item in
    exact_collection. Validation runs against the full prepared set, before
    any items that are also in existing_exacts are removed.

    If match_onion_md5 is True, also try to validate matches for:
        hashlib.md5(item + '.onion').hexdigest()
    If item is a string, it is lowercased before hashing.
    Within this function, match_onion_md5 is only passed on to
    exact_match_validate_item(), and noted in the log messages.

    Returns a frozenset that can be passed to exact_match().
    This object must be treated as opaque and read-only.
    '''
    assert exact_collection is not None
    onion_match = " (and onion md5)" if match_onion_md5 else ""
    # Set matching uses a hash table, so it's more efficient
    exact_collection_lower = [
        lower_if_hasattr(obj) for obj in exact_collection
    ]
    exact_set = frozenset(exact_collection_lower)
    # Log a message if there were any duplicates
    if len(exact_collection) != len(exact_set):
        # Tally every item in one pass. Calling list.count() once per unique
        # item is quadratic, which is very slow on large lists.
        from collections import Counter
        occurrences = Counter(exact_collection_lower)
        # Iterate the set (not the tally) so the duplicates are reported in
        # the same order as before
        dups = [
            obj for obj in exact_set
            if occurrences[lower_if_hasattr(obj)] > 1
        ]
        logging.warning(
            "Removing {} duplicates{} within this collection".format(
                summarise_list(dups), onion_match))
    # the encoded json measures transmission size, not RAM size
    logging.info("Exact match prepared {}{} items ({})".format(
        len(exact_set), onion_match,
        format_bytes(len(json_serialise(list(exact_set))))))

    # Check that each item actually matches the list
    if validate:
        for item in exact_collection:
            exact_match_validate_item(exact_set,
                                      item,
                                      exact_collection,
                                      match_onion_md5=match_onion_md5)

    # Without earlier collections, there is nothing to make disjoint
    if existing_exacts is None:
        return exact_set

    # Remove any items that appear in earlier lists
    #
    # If an onion address is in a different format in an earlier list,
    # this code won't remove it.
    #
    # Depending on the exact configuration:
    # - the tally server will fail to validate the lists, or
    # - the data collectors will count events for the duplicate onion
    #   address against the first list that contains the address
    #   (in any format).
    disjoint_exact_set = exact_set.difference(*existing_exacts)
    duplicate_exact_set = exact_set.difference(disjoint_exact_set)
    if len(duplicate_exact_set) > 0:
        logging.warning(
            "Removing {} duplicates{} that are also in an earlier collection"
            .format(summarise_list(duplicate_exact_set), onion_match))
    # Record this collection, so later collections are disjoint from it
    existing_exacts.append(disjoint_exact_set)
    return disjoint_exact_set