def test_retransform_message_indices(self):
    """
    A single range covering five messages with two distinct sync ends
    (12 and 14) is expected to be retransformed into two ranges, one per
    sync end, each starting at that sync end.
    """
    sync_ends = np.array([12, 12, 12, 14, 14])
    relative_range = CommonRange(0, 8, "1" * 8, score=1, field_type="length",
                                 message_indices={0, 1, 2, 3, 4})

    retransformed_ranges = FormatFinder.retransform_message_indices(
        [relative_range], [0, 1, 2, 3, 4], sync_ends)

    # two different sync ends
    self.assertEqual(len(retransformed_ranges), 2)

    # (expected start, messages sharing that sync end)
    expectations = ((12, {0, 1, 2}), (14, {3, 4}))
    for expected_start, expected_indices in expectations:
        expected = CommonRange(expected_start, 8, "1" * 8, score=1,
                               field_type="length",
                               message_indices=expected_indices)
        self.assertIn(expected, retransformed_ranges)
def test_create_message_types_2(self):
    """
    Three ranges with partially overlapping message indices are expected to
    yield five containers, one per distinct combination of ranges that
    occurs in at least one message.
    """
    length_rng = CommonRange(0, 8, "1" * 8, score=1, field_type="Length")
    length_rng.message_indices = {0, 2, 4, 6, 8, 12}

    address_rng = CommonRange(8, 8, "1" * 8, score=1, field_type="Address")
    address_rng.message_indices = {1, 2, 3, 4, 5, 12}

    seq_rng = CommonRange(16, 8, "1" * 8, score=1, field_type="Seq")
    seq_rng.message_indices = {1, 3, 5, 7, 12}

    message_types = FormatFinder.create_common_range_containers(
        {length_rng, address_rng, seq_rng})

    expected_containers = [
        CommonRangeContainer([length_rng], message_indices={0, 6, 8}),
        CommonRangeContainer([length_rng, address_rng], message_indices={2, 4}),
        CommonRangeContainer([length_rng, address_rng, seq_rng], message_indices={12}),
        CommonRangeContainer([address_rng, seq_rng], message_indices={1, 3, 5}),
        CommonRangeContainer([seq_rng], message_indices={7}),
    ]

    self.assertEqual(len(message_types), 5)
    for expected in expected_containers:
        self.assertIn(expected, message_types)
def test_handle_medium_overlapping_conflict(self):
    """
    Five ranges in one container are expected to be reduced to three by the
    conflict handling: the Length, Seq and Data ranges must survive, the
    lower scored Address and Type ranges must not.
    """
    length_rng = CommonRange(8, 8, "1" * 8, score=1, field_type="Length")
    address_rng = CommonRange(4, 10, "1" * 8, score=0.8, field_type="Address")
    seq_rng = CommonRange(15, 20, "1" * 8, score=1, field_type="Seq")
    type_rng = CommonRange(60, 80, "1" * 8, score=0.8, field_type="Type")
    data_rng = CommonRange(70, 90, "1" * 8, score=0.9, field_type="Data")

    container = CommonRangeContainer(
        [length_rng, address_rng, seq_rng, type_rng, data_rng])
    result = FormatFinder.handle_overlapping_conflict([container])

    self.assertEqual(len(result), 1)

    resolved = result[0]
    self.assertEqual(len(resolved), 3)
    for survivor in (length_rng, seq_rng, data_rng):
        self.assertIn(survivor, resolved)
def test_ensure_not_overlaps(self):
    """
    Exercise CommonRange.ensure_not_overlaps on a range spanning 4..11 for
    the cases: no overlap, overlap on the left, overlap on the right,
    full overlap and an overlap in the middle (which splits the range).
    """
    test_range = CommonRange(start=4, length=8, value="12345678")
    self.assertEqual(test_range.end, 11)

    # no overlapping: the range itself comes back unchanged
    for other_start, other_end in ((0, 3), (20, 24)):
        self.assertEqual(
            test_range,
            test_range.ensure_not_overlaps(other_start, other_end)[0])

    # overlapping on left
    trimmed = test_range.ensure_not_overlaps(2, 6)[0]
    self.assertEqual((trimmed.start, trimmed.end), (6, 11))

    # overlapping on right
    trimmed = test_range.ensure_not_overlaps(6, 14)[0]
    self.assertEqual((trimmed.start, trimmed.end), (4, 5))

    # full overlapping
    self.assertEqual(len(test_range.ensure_not_overlaps(3, 14)), 0)

    # overlapping in the middle
    pieces = test_range.ensure_not_overlaps(6, 9)
    self.assertEqual(len(pieces), 2)
    left, right = pieces[0], pieces[1]
    self.assertEqual((left.start, left.end), (4, 5))
    self.assertEqual((right.start, right.end), (10, 11))
def score_ranges(common_ranges_by_length: dict, n_gram_length: int):
    """
    Calculate score for the common ranges

    For every window length and every common range that is at least as long
    as the window, the window is slid over the range's value in steps of
    ``n_gram_length`` and the best scoring position (and byte order) is
    stored as a new CommonRange with field type "length".

    :param common_ranges_by_length: maps a length key to the common ranges
           belonging to it; the key is passed on to LengthEngine.score_bits
    :param n_gram_length: step width of the sliding window; also determines
           the window lengths that get evaluated
    :return: dict of the form ``{length: {window_length: [CommonRange, ...]}}``
    """
    # The window length must be smaller than common range's length
    # and is something like 8 in case of on 8 bit integer.
    # We make this generic so e.g. 4 bit integers are supported as well
    if n_gram_length == 8:
        window_lengths = [8, 16, 32, 64]
        byteorders = ["big", "little"]
    else:
        window_lengths = [n_gram_length * factor for factor in range(1, 5)]
        byteorders = ["big"]

    scored_ranges = {
        length: {window_length: [] for window_length in window_lengths}
        for length in common_ranges_by_length
    }

    for window_length in window_lengths:
        for length, common_ranges in common_ranges_by_length.items():
            for common_range in common_ranges:
                if common_range.length < window_length:
                    # Window would not fit into this range
                    continue

                bits = common_range.value
                best_score = best_start = -1
                best_byteorder = "big"

                last_start = len(bits) + 1 - window_length
                for start in range(0, last_start, n_gram_length):
                    for byteorder in byteorders:
                        score = LengthEngine.score_bits(
                            bits[start:start + window_length],
                            length,
                            position=start,
                            byteorder=byteorder)
                        # Strict comparison: the first best hit wins on ties
                        if score > best_score:
                            best_score = score
                            best_start = start
                            best_byteorder = byteorder

                best_end = best_start + window_length
                scored_ranges[length][window_length].append(
                    CommonRange(common_range.start + best_start,
                                window_length,
                                common_range.value[best_start:best_end],
                                score=best_score,
                                field_type="length",
                                message_indices=common_range.message_indices,
                                range_type=common_range.range_type,
                                byte_order=best_byteorder))

    return scored_ranges
def find_common_ranges(self, alpha=0.95, range_type="bit"):
    """
    Find all common ranges where at least alpha percent of numbers are equal

    A common range is a run of at least two directly neighbouring positions
    whose entry in ``self.data`` is greater than or equal to ``alpha``.

    :param range_type: one of bit/hex/byte, passed on to the created ranges
    :param alpha: threshold an entry of ``self.data`` has to reach
    :return: list of CommonRange in ascending order of their start
    """
    data_indices = np.argwhere(self.data >= alpha).flatten()

    if len(data_indices) < 2:
        return []

    # Positions (in data_indices) where a new run starts, i.e. where the
    # predecessor is not the direct neighbour
    run_boundaries = np.flatnonzero(np.diff(data_indices) != 1) + 1

    result = []
    for run in np.split(data_indices, run_boundaries):
        length = len(run)
        if length < 2:
            # Isolated positions do not form a range
            continue

        start = run[0]
        value = self.__get_value_for_common_range(start, length)
        result.append(
            CommonRange(start, length, value,
                        message_indices=set(self.__active_indices),
                        range_type=range_type))

    return result
def get_preamble_and_sync(preamble_starts, preamble_lengths, sync_ends,
                          message_type_indices):
    """
    Get preamble and sync common ranges based on the data

    For every given message index a preamble range and a synchronization
    range (from the preamble's end up to the sync end) is built. Ranges
    that compare equal to one already collected are merged by adding the
    message index to the collected range; ranges with a non-positive
    length are never added.

    :type preamble_starts: np.ndarray
    :type preamble_lengths: np.ndarray
    :type sync_ends: np.ndarray
    :type message_type_indices: list
    :rtype: set of CommonRange
    """
    assert len(preamble_starts) == len(preamble_lengths) == len(sync_ends)

    result = set()  # type: set[CommonRange]

    def merge_or_add(candidate, msg_index, candidate_length):
        # Reuse an equal range if we have one, otherwise keep the candidate
        # as long as it actually covers something
        known = next((rng for rng in result if candidate == rng), None)
        if known is not None:
            known.message_indices.add(msg_index)
        elif candidate_length > 0:
            result.add(candidate)

    for i in message_type_indices:
        preamble_start = preamble_starts[i]
        preamble_length = preamble_lengths[i]
        merge_or_add(
            CommonRange(preamble_start, preamble_length,
                        field_type="preamble", message_indices={i}),
            i, preamble_length)

        # Sync word sits between the end of the preamble and the sync end
        preamble_end = preamble_start + preamble_length
        sync_length = sync_ends[i] - preamble_end
        merge_or_add(
            CommonRange(preamble_end, sync_length,
                        field_type="synchronization", message_indices={i}),
            i, sync_length)

    return result
def test_create_message_types_1(self):
    """
    Two ranges that share exactly the same message indices are expected to
    end up together in one single container.
    """
    shared_indices = {0, 1, 2}

    length_rng = CommonRange(0, 8, "1" * 8, score=1, field_type="Length")
    length_rng.message_indices = set(shared_indices)

    address_rng = CommonRange(8, 8, "1" * 8, score=1, field_type="Address")
    address_rng.message_indices = set(shared_indices)

    message_types = FormatFinder.create_common_range_containers(
        {length_rng, address_rng})
    self.assertEqual(len(message_types), 1)

    expected = CommonRangeContainer([length_rng, address_rng],
                                    message_indices={0, 1, 2})
    self.assertEqual(message_types[0], expected)
def test_handle_easy_overlapping_conflict(self):
    """
    Easy conflict: two ranges at the same position, the first one has the
    higher score. Only the higher scored range is expected to remain and
    the container's message indices must stay untouched.
    """
    winner = CommonRange(8, 8, "1" * 8, score=1, field_type="Length")
    winner.message_indices = {0, 1, 2}

    loser = CommonRange(8, 8, "1" * 8, score=0.8, field_type="Address")
    loser.message_indices = {0, 1, 2}

    container = CommonRangeContainer([winner, loser],
                                     message_indices={0, 1, 2})
    result = FormatFinder.handle_overlapping_conflict([container])

    self.assertEqual(len(result), 1)

    resolved = result[0]
    self.assertEqual(len(resolved), 1)
    self.assertIn(winner, resolved)
    self.assertEqual(resolved.message_indices, {0, 1, 2})
def test_handle_no_overlapping_conflict(self):
    """
    Two ranges directly following each other (0..7 and 8..15) do not
    conflict, so both are expected to remain in the single container.
    """
    first = CommonRange(0, 8, "1" * 8, score=1, field_type="Length")
    first.message_indices = {0, 1, 2}

    second = CommonRange(8, 8, "1" * 8, score=1, field_type="Address")
    second.message_indices = {0, 1, 2}

    container = CommonRangeContainer([first, second],
                                     message_indices={0, 1, 2})

    # no conflict
    result = FormatFinder.handle_overlapping_conflict([container])
    self.assertEqual(len(result), 1)

    untouched = result[0]
    self.assertEqual(len(untouched), 2)
    self.assertIn(first, untouched)
    self.assertEqual(untouched.message_indices, {0, 1, 2})
    self.assertIn(second, untouched)
def choose_high_scored_ranges(self, scored_ranges: dict,
                              bitvectors_by_n_gram_length: dict,
                              minimum_score: float):
    """
    Pick one high scored range per length cluster.

    Note that ``scored_ranges`` is modified in place: every list of ranges
    is replaced by its best range reaching ``minimum_score`` or by ``None``
    if there is no such range.

    :param scored_ranges: ``{length: {window_length: [ranges]}}``
    :param bitvectors_by_n_gram_length: maps a length to the indices of the
           messages having this length
    :param minimum_score: ranges scoring below this value are ignored
    :return: dict mapping a length to the range chosen for it; empty if no
             range reaches the minimum score
    """
    # Set for every window length the highest scored range as candidate
    hits_per_window_length = defaultdict(int)
    for ranges_by_window_length in scored_ranges.values():
        for window_length, ranges in ranges_by_window_length.items():
            best = max((rng for rng in ranges if rng.score >= minimum_score),
                       key=lambda rng: rng.score,
                       default=None)
            ranges_by_window_length[window_length] = best
            if best is not None:
                hits_per_window_length[window_length] += 1

    if not hits_per_window_length:
        return dict()

    # Choose window length -> window length that has a result most often and choose greater on tie
    chosen_window_length = max(
        hits_per_window_length,
        key=lambda wl: (hits_per_window_length[wl], wl))

    # Choose all ranges with highest score per cluster if score surpasses the minimum score
    high_scores_by_length = {
        length: ranges_by_window_length[chosen_window_length]
        for length, ranges_by_window_length in scored_ranges.items()
        if ranges_by_window_length.get(chosen_window_length)
    }

    # If there are length clusters with only one message see if we can assign a range from other clusters
    for length, msg_indices in bitvectors_by_n_gram_length.items():
        if len(msg_indices) != 1:
            continue

        msg_index = msg_indices[0]
        bitvector = self.bitvectors[msg_index]

        top_score, top_range = 0, None
        for rng in high_scores_by_length.values():
            bits = bitvector[rng.start:rng.end + 1]
            if len(bits) == 0:
                # Range lies outside of this message
                continue

            score = self.score_bits(bits, length, rng.start)
            if score > top_score:
                top_range, top_score = rng, score

        if top_range is None:
            continue

        high_scores_by_length[length] = CommonRange(
            top_range.start,
            top_range.length,
            value=bitvector[top_range.start:top_range.end + 1],
            score=top_score,
            field_type="length",
            message_indices={msg_index},
            range_type="bit")

    return high_scores_by_length
def _py_find_field(self, messages, verbose=False):
    """
    Search the given messages for address fields and label them.

    The search runs in these steps:

    1. For each participant, compare every pair of its messages using
       ``self.xor_matrix`` and collect byte aligned ranges in which both
       messages are equal (only inside areas unlabeled in both message types).
    2. Score the collected ranges via ``self.find_candidates`` and pick the
       best pair with ``self.choose_candidate_pair``.
    3. Locate the chosen values inside the equal ranges, check for
       crossmatches (same position, different value) and cluster the
       affected messages into "default" and "ack".
    4. Assign message types and participant addresses and add auto created
       protocol labels to the message types of the affected messages.

    Returns ``None`` in every case; the result is applied to the messages.
    The method returns early if no candidate pair is found or the two
    chosen candidates differ in length.

    :type messages: list of urh.signalprocessing.Message.Message
    :param verbose: print intermediate results if True
    :return: None
    """
    msg_indices_per_participant = defaultdict(list)
    """:type : dict[urh.signalprocessing.Participant.Participant, list[int]] """

    for i, msg in enumerate(messages):
        msg_indices_per_participant[msg.participant].append(i)

    # Cluster participants
    equal_ranges_per_participant = defaultdict(list)
    """:type : dict[urh.signalprocessing.Participant.Participant, list[CommonRange]] """

    # Found ranges are rounded down to multiples of this many bits
    alignment = 8

    # Step 1: Find equal ranges for participants by evaluating the XOR matrix participant wise
    for participant, participant_msg_indices in msg_indices_per_participant.items():
        for i, msg_index in enumerate(participant_msg_indices):
            msg = messages[msg_index]
            bitvector_str = msg.decoded_bits_str

            # Compare with every later message of the same participant
            for other_index in participant_msg_indices[i+1:]:
                other_msg = messages[other_index]
                xor_vec = self.xor_matrix[msg_index, other_index][self.xor_matrix[msg_index, other_index] != -1]  # -1 = End of Vector

                # addresses are searched across message types, as we assume them to be in almost every message
                # therefore we need to consider message types of both messages we compare and ignore already labeled areas
                unlabeled_ranges = msg.message_type.unlabeled_ranges_with_other_mt(other_msg.message_type)
                for rng_start, rng_end in unlabeled_ranges:
                    start = 0
                    # The last 1 marks end of sequence, and prevents swallowing long zero sequences at the end
                    cmp_vector = np.append(xor_vec[rng_start:rng_end], 1)
                    # Every 1 in the XOR vector ends a run of equal bits
                    for end in np.where(cmp_vector == 1)[0]:
                        if end - start >= self.MIN_ADDRESS_LENGTH:
                            equal_range_start = alignment * ((rng_start + start) // alignment)
                            equal_range_end = alignment * ((rng_start + end) // alignment)
                            bits = bitvector_str[equal_range_start:equal_range_end]

                            # Did we already find this range?
                            cr = next((cr for cr in equal_ranges_per_participant[participant]
                                       if cr.start == equal_range_start
                                       and cr.end == equal_range_end
                                       and cr.bits == bits), None)

                            # If not: Create it
                            if cr is None:
                                cr = CommonRange(equal_range_start, equal_range_end, bits)
                                equal_ranges_per_participant[participant].append(cr)

                            cr.messages.add(msg_index)
                            cr.messages.add(other_index)
                        # NOTE(review): nesting reconstructed from flattened source;
                        # start is assumed to advance after every difference - confirm
                        start = end + alignment

    if verbose:
        print(constants.color.BOLD + "Result after Step 1" +constants.color.END)
        self.__print_ranges(equal_ranges_per_participant)

    # Step 2: Now we want to find our address candidates.
    # We do this by weighting them in order of LCS they share with each other
    scored_candidates = self.find_candidates([cr for crl in equal_ranges_per_participant.values() for cr in crl])
    """:type : dict[str, int] """

    try:
        highscored = next(self.choose_candidate_pair(scored_candidates))
        assert len(highscored[0]) == len(highscored[1])
    except (StopIteration, AssertionError):
        # No candidate pair at all or candidates of different length: give up
        return

    if verbose:
        print(scored_candidates)
        print(sorted(scored_candidates, key=scored_candidates.get, reverse=True))

    # Now get the common_ranges we need
    scored_candidates_per_participant = defaultdict(list)
    """:type : dict[urh.signalprocessing.Participant.Participant, list[CommonRange]] """

    for participant, ranges in equal_ranges_per_participant.items():
        for equal_range in ranges:
            for h in highscored:
                rng = equal_range.pos_of_hex(h)
                if rng is not None:
                    start, end = rng
                    bits = equal_range.bits[start:end]
                    # Translate the position inside the equal range to a position in the message
                    rel_start = equal_range.start + start
                    rel_end = rel_start + (end - start)
                    cr = next((cr for cr in scored_candidates_per_participant[participant]
                               if cr.start == rel_start
                               and cr.end == rel_end
                               and cr.bits == bits), None)
                    if cr is None:
                        cr = CommonRange(rel_start, rel_end, bits)
                        scored_candidates_per_participant[participant].append(cr)

                    cr.messages.update(equal_range.messages)

    # Now we have the highscored ranges per participant
    # If there is a crossmatch of the ranges we are good and found the addresses!
    # We have something like:
    #
    # Participant: Alice (A):                      Participant: Bob (B):
    # =======================                      =====================
    #
    # Range    Value   Messages                    Range    Value   Messages
    # -----    -----   --------                    -----    -----   --------
    # 72-96    1b6033  {1, 5, 9, 13, 17, 20}       72-96    78e289  {11, 3, 15, 7}
    # 88-112   1b6033  {2, 6, 10, 14, 18}          88-112   78e289  {4, 8, 12, 16, 19}
    # 112-136  78e289  {2, 6, 10, 14, 18}          112-136  1b6033  {0, 4, 8, 12, 16, 19}
    #
    # If the value doubles for the same participant in other range, then we need to create a new message type
    # We consider the default case (=default message type) to have addresses followed by each other
    # Furthermore, we assume if there is only one address per message type, it is the destination address
    clusters = {"default": defaultdict(set), "ack": defaultdict(set)}
    """:type: dict[str, dict[tuple[int.int],set[int]]]"""

    all_candidates = [cr for crl in scored_candidates_per_participant.values() for cr in crl]
    # Check for crossmatch and cluster in together and splitted addresses
    # Perform a merge by only saving the ranges and applying messages
    for candidate in sorted(all_candidates):
        if any(c.start == candidate.start and c.end == candidate.end and c.bits != candidate.bits for c in all_candidates):
            # Crossmatch! This is an address
            if any(c.start == candidate.end or c.end == candidate.start for c in all_candidates):
                # Another candidate is directly adjacent: addresses follow each other
                clusters["default"][(candidate.start, candidate.end)].update(candidate.messages)
            else:
                clusters["ack"][(candidate.start, candidate.end)].update(candidate.messages)

    # Flatten to the message indices per cluster name
    msg_clusters = {cname: set(i for s in ranges.values() for i in s) for cname, ranges in clusters.items()}

    # If there are no addresses in default message type prevent evaluating everything as ACK
    if not msg_clusters["default"]:
        msg_clusters["ack"] = set()
        # With the candidates cleared, no labels get added in the loop below
        scored_candidates_per_participant.clear()

    self.assign_messagetypes(messages, msg_clusters)

    # Now try to find the addresses of the participants to separate SRC and DST address later
    self.assign_participant_addresses(messages, list(scored_candidates_per_participant.keys()), highscored)

    for participant, ranges in scored_candidates_per_participant.items():
        for rng in ranges:
            for msg_index in rng.messages:
                msg = messages[msg_index]

                if msg.message_type.name == "ack":
                    # A single address in an ACK is treated as destination address
                    field_type = self.dst_field_type
                    name = self.dst_field_name
                elif msg.participant:
                    if rng.hex_value == msg.participant.address_hex:
                        name = self.src_field_name
                        field_type = self.src_field_type
                    else:
                        name = self.dst_field_name
                        field_type = self.dst_field_type
                else:
                    # Without a participant SRC and DST can not be told apart
                    name = "Address"
                    field_type = None

                # Do not add the same auto created label twice
                if not any(lbl.name == name and lbl.auto_created for lbl in msg.message_type):
                    msg.message_type.add_protocol_label(rng.start, rng.end - 1, name=name,
                                                        auto_created=True, type=field_type)
def find(self):
    """
    Find address fields in the message vectors.

    Address candidates (known addresses plus the result of
    ``self.find_addresses()``) are searched in every message vector.
    Matching positions are collected as common ranges per participant,
    scored via cross swap and ACK checks, and finally marked as
    "source address" or "destination address".

    :return: list of the high scored CommonRange objects of all
             participants, after ``__find_broadcast_fields`` has processed
             them. If no range is a source address, every score is
             multiplied by 0.95.
    """
    # tobytes() yields the same bytes as the deprecated tostring()
    addresses_by_participant = {
        p: [addr.tobytes()]
        for p, addr in self.known_addresses_by_participant.items()
    }
    addresses_by_participant.update(self.find_addresses())
    self._debug("Addresses by participant", addresses_by_participant)

    # Find the address candidates by participant in messages
    ranges_by_participant = defaultdict(list)  # type: dict[int, list[CommonRange]]

    addresses = [
        np.array(np.frombuffer(a, dtype=np.uint8))
        for address_list in addresses_by_participant.values()
        for a in address_list
    ]

    already_labeled_cols = array(
        "L", [e for rng in self.already_labeled for e in range(*rng)])

    # Find occurrences of address candidates in messages and create common ranges over matching positions
    for i, msg_vector in enumerate(self.msg_vectors):
        participant = self.participant_indices[i]
        for address in addresses:
            for index in awre_util.find_occurrences(msg_vector, address,
                                                    already_labeled_cols):
                common_ranges = ranges_by_participant[participant]
                rng = next((cr for cr in common_ranges
                            if cr.matches(index, address)), None)  # type: CommonRange
                if rng is not None:
                    rng.message_indices.add(i)
                else:
                    common_ranges.append(
                        CommonRange(index, len(address), address,
                                    message_indices={i},
                                    range_type="hex"))

    num_messages_by_participant = defaultdict(int)
    for participant in self.participant_indices:
        num_messages_by_participant[participant] += 1

    # Look for cross swapped values between participant clusters
    for p1, p2 in itertools.combinations(ranges_by_participant, 2):
        ranges1_set = set(ranges_by_participant[p1])
        ranges2_set = set(ranges_by_participant[p2])

        for rng1, rng2 in itertools.product(ranges_by_participant[p1],
                                            ranges_by_participant[p2]):
            if rng1 in ranges2_set and rng2 in ranges1_set:
                if self.cross_swap_check(rng1, rng2):
                    rng1.score += len(rng2.message_indices) / num_messages_by_participant[p2]
                    rng2.score += len(rng1.message_indices) / num_messages_by_participant[p1]
                elif self.ack_check(rng1, rng2):
                    # Add previous score in divisor to add bonus to ranges that apply to all messages
                    rng1.score += len(rng2.message_indices) / (
                        num_messages_by_participant[p2] + rng1.score)
                    rng2.score += len(rng1.message_indices) / (
                        num_messages_by_participant[p1] + rng2.score)

    if len(ranges_by_participant) == 1 and not self.src_field_present:
        for p, ranges in ranges_by_participant.items():
            for rng in sorted(ranges):
                try:
                    if np.array_equal(rng.value,
                                      self.known_addresses_by_participant[p]):
                        # Only one participant in this iteration and address already known -> Highscore
                        rng.score = 1
                        break  # Take only the first (leftmost) range
                except KeyError:
                    # No known address for this participant
                    pass

    high_scored_ranges_by_participant = defaultdict(list)

    address_length = self.__estimate_address_length(ranges_by_participant)

    # Get highscored ranges by participant
    for participant, common_ranges in ranges_by_participant.items():
        # Sort by negative score so ranges with highest score appear first
        # Secondary sort by tuple to ensure order when ranges have same score
        sorted_ranges = sorted(
            filter(lambda cr: cr.score > self.minimum_score, common_ranges),
            key=lambda cr: (-cr.score, cr))
        if len(sorted_ranges) == 0:
            # NOTE(review): an empty dict is stored here while a set is used below - confirm intended
            addresses_by_participant[participant] = dict()
            continue

        # Keep only address candidates of the estimated address length
        addresses_by_participant[participant] = {
            a for a in addresses_by_participant.get(participant, [])
            if len(a) == address_length
        }

        for rng in filter(lambda r: r.length == address_length, sorted_ranges):
            rng.score = min(rng.score, 1.0)
            high_scored_ranges_by_participant[participant].append(rng)

    # Now we find the most probable address for all participants
    self.__assign_participant_addresses(addresses_by_participant,
                                        high_scored_ranges_by_participant)

    # Eliminate participants for which we could not assign an address
    for participant, address in addresses_by_participant.copy().items():
        if address is None:
            del addresses_by_participant[participant]

    # Now we can separate SRC and DST
    for participant, ranges in high_scored_ranges_by_participant.items():
        try:
            address = addresses_by_participant[participant]
        except KeyError:
            high_scored_ranges_by_participant[participant] = []
            continue

        result = []

        for rng in sorted(ranges, key=lambda r: r.score, reverse=True):
            if rng.value.tobytes() == address:
                rng.field_type = "source address"
            else:
                rng.field_type = "destination address"

            if len(result) == 0:
                result.append(rng)
            else:
                subset = next(
                    (r for r in result
                     if rng.message_indices.issubset(r.message_indices)),
                    None)
                if subset is not None:
                    if rng.field_type == subset.field_type:
                        # Avoid adding same address type twice
                        continue

                    if rng.length != subset.length or (
                            rng.start != subset.end + 1
                            and rng.end + 1 != subset.start):
                        # Ensure addresses are next to each other
                        continue

                result.append(rng)

        high_scored_ranges_by_participant[participant] = result

    self.__find_broadcast_fields(high_scored_ranges_by_participant,
                                 addresses_by_participant)

    result = [
        rng
        for ranges in high_scored_ranges_by_participant.values()
        for rng in ranges
    ]

    # If we did not find a SRC address, lower the score a bit,
    # so DST fields do not win later e.g. again length fields in case of tie
    if not any(rng.field_type == "source address" for rng in result):
        for rng in result:
            rng.score *= 0.95

    return result
def find(self):
    """
    Find sequence number fields in the bitvectors.

    Every column of the difference matrix gets a score based on how often
    each difference occurs in it. Columns reaching ``self.minimum_score``
    become ranges of ``self.n_gram_length`` bits; a column directly next to
    an existing range with the same message indices extends that range
    (fixing its byte order) instead of creating a new one.

    :return: list of CommonRange with field type "sequence number" that
             contain more than two different values; empty list if there
             are fewer than three bitvectors
    """
    n = self.n_gram_length

    # We need at least 3 bitvectors to properly find a sequence number
    if len(self.bitvectors) < 3:
        return []

    diff_matrix = self.create_difference_matrix(self.bitvectors,
                                                self.n_gram_length)

    diff_frequencies_by_column = dict()
    for j in range(diff_matrix.shape[1]):
        unique, counts = np.unique(diff_matrix[:, j], return_counts=True)
        diff_frequencies_by_column[j] = dict(zip(unique, counts))

    self._debug("Diff_frequencies_by_column", diff_frequencies_by_column)

    # Already labeled columns can never be a sequence number
    scores_by_column = {
        column: 0 if column in self.already_labeled_cols
        else self.calc_score(frequencies)
        for column, frequencies in diff_frequencies_by_column.items()
    }

    self._debug("Scores by column", scores_by_column)

    result = []
    columns_by_score = sorted(scores_by_column, key=scores_by_column.get,
                              reverse=True)
    for candidate_column in columns_by_score:
        score = scores_by_column[candidate_column]
        if score < self.minimum_score:
            continue

        most_common_diff = self.get_most_frequent(
            diff_frequencies_by_column[candidate_column])

        # get all rows that have the most common difference or zero
        column_diffs = diff_matrix[:, candidate_column]
        rows = np.flatnonzero(
            (column_diffs == most_common_diff) | (column_diffs == 0))

        # For example, index 1 in diff matrix corresponds to index 1 and 2 of messages
        message_indices = set(rows) | set(rows + 1)

        first_bit, behind_last_bit = candidate_column * n, (candidate_column + 1) * n
        values = {
            self.bitvectors[i][first_bit:behind_last_bit].tobytes()
            for i in message_indices
        }

        matching_ranges = [
            r for r in result if r.message_indices == message_indices
        ]

        # Column directly behind an existing range -> extend it to the right
        left_neighbour = next(
            (r for r in matching_ranges
             if r.start == (candidate_column - 1) * n
             and (r.byte_order_is_unknown or r.byte_order == "big")),
            None)
        if left_neighbour is not None:
            left_neighbour.length += n
            left_neighbour.byte_order = "big"
            left_neighbour.values.extend(list(values))
            continue

        # Column directly before an existing range -> extend it to the left
        right_neighbour = next(
            (r for r in matching_ranges
             if r.start == (candidate_column + 1) * n
             and (r.byte_order_is_unknown or r.byte_order == "little")),
            None)
        if right_neighbour is not None:
            right_neighbour.start -= n
            right_neighbour.length += n
            right_neighbour.byte_order = "little"
            right_neighbour.values.extend(list(values))
            continue

        new_range = CommonRange(start=candidate_column * n, length=n,
                                score=score, field_type="sequence number",
                                message_indices=message_indices,
                                byte_order=None)
        new_range.values.extend(list(values))
        result.append(new_range)

    # At least three different values needed to reliably identify a sequence number
    return [rng for rng in result if len(set(rng.values)) > 2]