def suffix_reverse_match_prepare_collection(suffix_collection, separator=""): ''' Prepare a collection of strings for efficient suffix matching. If specified, the separator is also required before the suffix. For example, domain suffixes use "." as a separator between components. Returns an object that can be passed to suffix_match(). This object must be treated as opaque and read-only. ''' assert suffix_collection is not None assert separator is not None # A binary search is efficient, even if it does double the RAM # requirement. And it's unlikely we could get rid of all the references to # the strings in the collection after reversing them, anyway. suffix_obj = suffix_reverse_match_collate_collection( suffix_collection, separator) # This takes about 20 seconds for the Alexa Top 1 million, and only finds # 239 duplicates. So maybe it's not worth doing. #suffix_match_uniquify_collection(suffix_obj, separator) # the encoded json measures transmission size, not RAM size logging.info("Suffix match prepared {} items ({})".format( len(suffix_obj), format_bytes(len(json_serialise(suffix_obj))))) return suffix_obj
def exact_match_prepare_collection(exact_collection, existing_exacts=None,
                                   validate=True):
    '''
    Prepare a hashable object collection for efficient exact matching.
    If the objects in the collection are strings, lowercases them.

    existing_exacts is a list of previously prepared collections.
    If existing_exacts is not None, append the new collection to
    existing_exacts, as well as returning the prepared collection.
    If multiple lists are prepared using the same existing_exacts, then the
    final lists are disjoint. Any duplicate items are ignored, and a
    warning is logged. (An item that appears in multiple input lists is
    output in the earliest list it appears in.)

    If validate is True, checks that exact_match() returns True for each
    item in exact_collection.

    Returns an object that can be passed to exact_match().
    This object must be treated as opaque and read-only.
    '''
    assert exact_collection is not None
    # Set matching uses a hash table, so it's more efficient
    exact_collection_lower = [lower_if_hasattr(obj)
                              for obj in exact_collection]
    exact_set = frozenset(exact_collection_lower)

    # Log a message if there were any duplicates
    # Finding each duplicate takes a lot longer
    if len(exact_collection) != len(exact_set):
        dups = [obj for obj in exact_set
                if exact_collection_lower.count(lower_if_hasattr(obj)) > 1]
        logging.warning("Removing {} duplicates within this collection"
                        .format(summarise_list(dups)))

    # the encoded json measures transmission size, not RAM size
    logging.info("Exact match prepared {} items ({})".format(
        len(exact_set),
        format_bytes(len(json_serialise(list(exact_set))))))

    # Check that each item actually matches the list
    if validate:
        for item in exact_collection:
            exact_match_validate_item(exact_set, item, exact_collection)

    if existing_exacts is None:
        return exact_set
    else:
        # Remove any items that appear in earlier lists
        disjoint_exact_set = exact_set.difference(*existing_exacts)
        duplicate_exact_set = exact_set.difference(disjoint_exact_set)
        if len(duplicate_exact_set) > 0:
            logging.warning(
                "Removing {} duplicates that are also in an earlier "
                "collection".format(summarise_list(duplicate_exact_set)))
        existing_exacts.append(disjoint_exact_set)
        return disjoint_exact_set
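# Illustrative usage (not part of the original module) of the function
# above, showing how existing_exacts keeps successively prepared lists
# disjoint. Validation is skipped here because it needs exact_match().
def _demo_exact_match_disjoint_lists():
    existing_exacts = []
    first = exact_match_prepare_collection(["a.example", "b.example"],
                                           existing_exacts=existing_exacts,
                                           validate=False)
    # "b.example" already appeared in the first list, so it is dropped
    # from the second prepared set, and a warning is logged
    second = exact_match_prepare_collection(["b.example", "c.example"],
                                            existing_exacts=existing_exacts,
                                            validate=False)
    assert "b.example" in first
    assert "b.example" not in second
    assert second == frozenset(["c.example"])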
def suffix_match_prepare_collection(suffix_collection, separator="", existing_suffixes=None, collection_tag=-1, validate=True): ''' Prepare a collection of strings for efficient suffix matching. If specified, the separator is also required before the suffix. For example, domain suffixes use "." as a separator between components. existing_suffixes is a previously prepared suffix data structure. If existing_suffixes is not None, add the new suffixes to existing_suffixes, and return existing_suffixes. Each suffix is terminated with collection_tag, which can be used to distinguish between suffixes from different lists. collection_tag must be a non-dict type that is not None. If multiple lists are prepared using the same existing_suffixes, then the final suffix data structure is disjoint. Any duplicate or longer suffixes are eliminated, and a warning is logged. (A suffix that appears in multiple input lists is output in the earliest list it appears in. A shorter suffix in a later list replaces any longer suffixes in earlier lists.) If validate is True, checks that suffix_match() returns True for each item in suffix_collection. Returns a tuple containing an object that can be passed to suffix_match(), and a boolean that is True if any duplicate domains were found. The object must be treated as opaque and read-only. ''' assert suffix_collection is not None assert is_collection_tag_valid(collection_tag) #assert type(existing_suffixes) == dict # Create a tree of suffix components using nested python dicts # the terminal object is an empty dict if existing_suffixes is None: suffix_obj = {} else: suffix_obj = existing_suffixes longer_suffix_list = [] duplicates = False for insert_string in suffix_collection: #assert type(suffix_obj) == dict insert_list = suffix_match_split(insert_string, separator=separator) prev_suffix_node = None suffix_node = suffix_obj # did we terminate the loop early due to a longer suffix? has_longer_suffix = False for insert_component in insert_list: # since we have stripped periods from the start and end, a double # dot is almost certainly a typo assert len(insert_component) > 0 # we are allowed to add any child to the root # but we cannot add a child to an existing terminal object # because the existing tree contains a shorter suffix of # the string we are inserting #assert type(suffix_node) == dict next_suffix_node = suffix_node.get(insert_component) if (is_collection_tag_valid(next_suffix_node)): # there is an existing suffix that terminates here, and we are # about to insert a longer suffix. 
Instead, ignore the longer # suffix has_longer_suffix = True longer_suffix_list.append(insert_string) break # walk the tree, adding an entry for this suffix prev_suffix_node = suffix_node suffix_node = (next_suffix_node if next_suffix_node is not None else suffix_node.setdefault(insert_component, {})) # we cannot have children in our terminal object # because the existing tree contains longer strings, and we are # a shorter suffix of all those existing strings if (not has_longer_suffix and not is_collection_tag_valid(suffix_node) and len(suffix_node) > 0): duplicates = True child_summary = summarise_list(suffix_node.keys()) child_all = " ".join(suffix_node.keys()) logging.warning( "Adding shorter suffix {} for collection {}, pruning existing children {}" .format(insert_string, collection_tag, child_summary)) logging.debug( "Adding shorter suffix {} for collection {}, pruning existing children {}" .format(insert_string, collection_tag, child_all)) # now, place (or replace) the end of the domain with the collection tag if not has_longer_suffix: #assert prev_suffix_node is not None prev_suffix_node[insert_component] = collection_tag # Now check that each item actually matches one of the lists # Allow the lists to have overlaps, we'll check that later if validate: suffix_match_validate_item(suffix_obj, insert_string, suffix_collection, separator=separator, expected_collection_tag=collection_tag, reject_overlapping_lists=False) if len(longer_suffix_list) > 0: duplicates = True suffix_summary = summarise_list(longer_suffix_list) suffix_all = " ".join(longer_suffix_list) logging.warning( "Suffix match for {} ignored longer suffixes {}".format( collection_tag, suffix_summary)) logging.debug("Suffix match for {} ignored longer suffixes {}".format( collection_tag, suffix_all)) # the encoded json measures transmission size, not RAM size logging.info("Suffix match {} prepared {} items ({})".format( collection_tag, len(suffix_collection), format_bytes(len(json_serialise(suffix_obj))))) return (suffix_obj, duplicates)
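# Illustrative usage (not part of the original module) of the function
# above, assuming suffix_match_split() yields components rightmost-first.
# It shows the nested-dict tree, the collection tags that terminate each
# suffix, and a shorter suffix in a later list replacing a longer one from
# an earlier list. Validation is skipped here because it needs
# suffix_match_validate_item().
def _demo_suffix_match_shorter_suffix_wins():
    existing = {}
    suffix_obj, dups = suffix_match_prepare_collection(
        ["www.example.com"], separator=".", existing_suffixes=existing,
        collection_tag=0, validate=False)
    # suffix_obj is now {'com': {'example': {'www': 0}}}, and dups is False
    suffix_obj, dups = suffix_match_prepare_collection(
        ["example.com"], separator=".", existing_suffixes=existing,
        collection_tag=1, validate=False)
    # the shorter suffix pruned the existing 'www' child, so the tree only
    # keeps the tag for the later list
    assert suffix_obj == {'com': {'example': 1}}
    assert dups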
def exact_match_prepare_collection(exact_collection, existing_exacts=None,
                                   validate=True, match_onion_md5=False):
    '''
    Prepare a hashable object collection for efficient exact matching.
    If the objects in the collection are strings, lowercases them.

    existing_exacts is a list of previously prepared collections.
    If existing_exacts is not None, append the new collection to
    existing_exacts, as well as returning the prepared collection.
    If multiple lists are prepared using the same existing_exacts, then the
    final lists are disjoint. Any duplicate items are ignored, and a
    warning is logged. (An item that appears in multiple input lists is
    output in the earliest list it appears in.)

    If validate is True, checks that exact_match() returns True for each
    item in exact_collection.

    If match_onion_md5 is True, also try to validate matches for:
        hashlib.md5(item + '.onion').hexdigest()
    If item is a string, it is lowercased before hashing.

    Returns an object that can be passed to exact_match().
    This object must be treated as opaque and read-only.
    '''
    assert exact_collection is not None
    onion_match = " (and onion md5)" if match_onion_md5 else ""
    # Set matching uses a hash table, so it's more efficient
    exact_collection_lower = [lower_if_hasattr(obj)
                              for obj in exact_collection]
    exact_set = frozenset(exact_collection_lower)

    # Log a message if there were any duplicates
    # Finding each duplicate takes a lot longer
    if len(exact_collection) != len(exact_set):
        dups = [obj for obj in exact_set
                if exact_collection_lower.count(lower_if_hasattr(obj)) > 1]
        logging.warning(
            "Removing {} duplicates{} within this collection".format(
                summarise_list(dups), onion_match))

    # the encoded json measures transmission size, not RAM size
    logging.info("Exact match prepared {}{} items ({})".format(
        len(exact_set), onion_match,
        format_bytes(len(json_serialise(list(exact_set))))))

    # Check that each item actually matches the list
    if validate:
        for item in exact_collection:
            exact_match_validate_item(exact_set, item, exact_collection,
                                      match_onion_md5=match_onion_md5)

    if existing_exacts is None:
        return exact_set
    else:
        # Remove any items that appear in earlier lists
        #
        # If an onion address is in a different format in an earlier list,
        # this code won't remove it.
        #
        # Depending on the exact configuration:
        # - the tally server will fail to validate the lists, or
        # - the data collectors will count events for the duplicate onion
        #   address against the first list that contains the address
        #   (in any format).
        disjoint_exact_set = exact_set.difference(*existing_exacts)
        duplicate_exact_set = exact_set.difference(disjoint_exact_set)
        if len(duplicate_exact_set) > 0:
            logging.warning(
                "Removing {} duplicates{} that are also in an earlier "
                "collection".format(summarise_list(duplicate_exact_set),
                                    onion_match))
        existing_exacts.append(disjoint_exact_set)
        return disjoint_exact_set
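# A short sketch (not part of the original module) of the hashed form the
# docstring above describes. Python 3's hashlib needs bytes, hence the
# encode(); the item is lowercased before ".onion" is appended and hashed.
# The address used in the commented usage below is a hypothetical example.
def _demo_onion_md5_form(item):
    import hashlib
    return hashlib.md5((item.lower() + ".onion").encode()).hexdigest()

# A list of such digests could then be prepared with match_onion_md5=True:
#     exact_match_prepare_collection(
#         [_demo_onion_md5_form("exampleonionaddr")],
#         validate=False, match_onion_md5=True)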