def _RunNm(self, paths_by_type):
  """Calls nm to get symbols and (for non-BC files) string addresses.

  Symbol names are merged into |self._paths_by_name| and non-empty encoded
  string-address dicts are appended to
  |self._encoded_string_addresses_by_path_chunks|.

  Args:
    paths_by_type: Object exposing |arch|, |obj| and |bc| path lists.
  """
  # Downstream functions rely upon .a not being grouped.
  batches = self._MakeBatches(paths_by_type.arch, None)
  # Combine object files and Bitcode files for nm.
  BATCH_SIZE = 50  # Arbitrarily chosen.
  batches.extend(
      self._MakeBatches(paths_by_type.obj + paths_by_type.bc, BATCH_SIZE))
  results = self._DoBulkFork(nm.RunNmOnIntermediates, batches)

  # Names are still mangled.
  all_paths_by_name = self._paths_by_name
  total_no_symbols = 0
  for encoded_syms, encoded_strs, num_no_symbols in results:
    total_no_symbols += num_no_symbols
    symbol_names_by_path = concurrent.DecodeDictOfLists(encoded_syms)
    for path, names in symbol_names_by_path.iteritems():
      for name in names:
        all_paths_by_name[name].append(path)

    # Skip empty chunks so that later consumers never see an empty dict.
    if encoded_strs != concurrent.EMPTY_ENCODED_DICT:
      self._encoded_string_addresses_by_path_chunks.append(encoded_strs)

  if total_no_symbols:
    # logging.warn() is a deprecated alias; logging.warning() is the
    # supported spelling and emits the identical record.
    logging.warning('nm found no symbols in %d objects.', total_no_symbols)
def AnalyzePaths(self, paths):
  """Runs nm jobs over |paths| and accumulates the results on |self|.

  Each path ending in '.a' becomes its own job; all other paths are grouped
  into fixed-size batches. Symbol names are merged into
  |self._paths_by_name| and non-empty encoded string-address dicts are
  appended to |self._encoded_string_addresses_by_path_chunks|.
  """
  tool_prefix = self._tool_prefix
  output_directory = self._output_directory

  job_params = []
  loose_objects = []
  for candidate in paths:
    # Note: _ResolveStringPieces relies upon .a not being grouped.
    if candidate.endswith('.a'):
      job_params.append((candidate, tool_prefix, output_directory))
    else:
      loose_objects.append(candidate)

  CHUNK_SIZE = 50  # Chosen arbitrarily.
  for start in xrange(0, len(loose_objects), CHUNK_SIZE):
    chunk = loose_objects[start:start + CHUNK_SIZE]
    job_params.append((chunk, tool_prefix, output_directory))

  # Order of the jobs doesn't matter since each job owns independent paths,
  # and our output is a dict where paths are the key.
  job_results = concurrent.BulkForkAndCall(_RunNmOnIntermediates, job_params)

  paths_by_name = self._paths_by_name
  string_chunks = self._encoded_string_addresses_by_path_chunks
  for encoded_syms, encoded_strs in job_results:
    decoded_syms = concurrent.DecodeDictOfLists(encoded_syms)
    for path, names in decoded_syms.iteritems():
      for name in names:
        paths_by_name[name].append(path)

    if encoded_strs != concurrent.EMPTY_ENCODED_DICT:
      string_chunks.append(encoded_strs)
  logging.debug('worker: AnalyzePaths() completed.')
def GetStringPositions(self):
  """Requests string results over the pipe and returns them decoded.

  Returns:
    A list with one decoded dict per received chunk, with each value passed
    through |_DecodePosition|.
  """
  request = (_MSG_GET_STRINGS,)
  self._pipe.send(request)
  self._pipe.recv()  # None
  logging.debug('Decoding string symbol results from forked process')
  encoded_chunks = self._pipe.recv()

  decoded_chunks = []
  for chunk in encoded_chunks:
    decoded_chunks.append(
        concurrent.DecodeDictOfLists(chunk, value_transform=_DecodePosition))
  return decoded_chunks
def ResolveStringPiecesIndirect(encoded_string_addresses_by_path, string_data,
                                tool_prefix, output_directory):
  """Resolves string literals referenced by address within object files.

  Decodes the per-path string addresses, reads the string sections they
  point into, and annotates |string_data| with every literal found.

  Returns:
    A list of encoded dicts, one per element returned by
    |_AnnotateStringData|.
  """
  addresses_by_path = concurrent.DecodeDictOfLists(
      encoded_string_addresses_by_path)

  # Assign |target| as archive path, or a list of object paths.
  sample_path = next(addresses_by_path.iterkeys())
  target = _ExtractArchivePath(sample_path) or addresses_by_path.keys()

  # Run readelf to find location of .rodata within the .o files.
  positions_by_path = _LookupStringSectionPositions(
      target, tool_prefix, output_directory)
  # Load the .rodata sections (from object files) as strings.
  sections_by_path = _ReadStringSections(
      target, output_directory, positions_by_path)

  # Lazily yields (path, literal) pairs for every path that has addresses.
  path_and_values = (
      (path, literal)
      for path, addresses in addresses_by_path.iteritems()
      for literal in _IterStringLiterals(
          path, addresses, sections_by_path.get(path)))

  annotated = _AnnotateStringData(string_data, path_and_values)
  return [concurrent.EncodeDictOfLists(entry) for entry in annotated]
def Get(self):
  """Reads and decodes the result written to the subprocess's stdout.

  The stream is read as: 8 hex characters giving the length of the encoded
  keys, then the encoded keys, then the encoded values up to EOF.
  """
  assert self._process.stdin.closed
  logging.debug('Decoding nm results from forked process')
  out_stream = self._process.stdout
  keys_length = int(out_stream.read(8), 16)
  keys_blob = out_stream.read(keys_length)
  values_blob = out_stream.read()
  return concurrent.DecodeDictOfLists(keys_blob, values_blob)
def testEncodeDictOfLists_Join_Empty(self):
  """Joining two encoded empty dicts decodes back to an empty dict."""
  test_dict1 = {}
  test_dict2 = {}
  expected = {}
  encoded1 = concurrent.EncodeDictOfLists(test_dict1)
  encoded2 = concurrent.EncodeDictOfLists(test_dict2)
  encoded = concurrent.JoinEncodedDictOfLists([encoded1, encoded2])
  decoded = concurrent.DecodeDictOfLists(encoded)
  # assertEquals is a deprecated alias of assertEqual.
  self.assertEqual(expected, decoded)
def ResolveStringPieces(encoded_string_addresses_by_path, string_data,
                        tool_prefix, output_directory):
  """Locates object-file string literals within |string_data|.

  For every string literal yielded by |_IterStringLiterals|, searches each
  entry of |string_data| in order. A match at offset 0 or directly after a
  '\\0' byte is preferred; if none exists, the first other match found is
  used. Each hit is recorded as the string "offset:length" under the
  literal's path, in the dict that corresponds to the matched |string_data|
  entry.

  Args:
    encoded_string_addresses_by_path: Encoded dict, decoded with
        |concurrent.DecodeDictOfLists|, keyed by path.
    string_data: Sequence of searchable strings (presumably the contents of
        the ELF string sections -- confirm against caller).
    tool_prefix: Forwarded to |_LookupStringSectionPositions|.
    output_directory: Forwarded to |_LookupStringSectionPositions| and
        |_ReadStringSections|.

  Returns:
    A list of encoded dicts, one per entry of |string_data|.
  """
  string_addresses_by_path = concurrent.DecodeDictOfLists(
      encoded_string_addresses_by_path)
  # Assign |target| as archive path, or a list of object paths.
  # NOTE(review): next() raises StopIteration if the decoded dict is empty.
  any_path = next(string_addresses_by_path.iterkeys())
  target = _ExtractArchivePath(any_path)
  if not target:
    target = string_addresses_by_path.keys()

  # Run readelf to find location of .rodata within the .o files.
  section_positions_by_path = _LookupStringSectionPositions(
      target, tool_prefix, output_directory)
  # Load the .rodata sections (from object files) as strings.
  string_sections_by_path = _ReadStringSections(
      target, output_directory, section_positions_by_path)

  # list of elf_positions_by_path.
  ret = [collections.defaultdict(list) for _ in string_data]
  # Brute-force search of strings within ** merge strings sections.
  # This is by far the slowest part of AnalyzeStringLiterals().
  # TODO(agrieve): Pre-process string_data into a dict of literal->address (at
  #     least for ascii strings).
  for path, object_addresses in string_addresses_by_path.iteritems():
    for value in _IterStringLiterals(
        path, object_addresses, string_sections_by_path.get(path)):
      # Fallback (non-exact) match, remembered across all |string_data|
      # entries for this |value|.
      first_match = -1
      first_match_dict = None
      for target_dict, data in itertools.izip(ret, string_data):
        # Set offset so that it will be 0 when len(value) is added to it below.
        offset = -len(value)
        while True:
          # Resume searching just past the previous (rejected) match.
          offset = data.find(value, offset + len(value))
          if offset == -1:
            break
          # Preferring exact matches (those following \0) over substring matches
          # significantly increases accuracy (although shows that linker isn't
          # being optimal).
          if offset == 0 or data[offset - 1] == '\0':
            break
          if first_match == -1:
            first_match = offset
            first_match_dict = target_dict
        # An exact match ends the search; |target_dict| stays bound to the
        # dict for the entry that matched.
        if offset != -1:
          break
      # NOTE(review): if |string_data| is empty the loop above never runs and
      # |offset| is unbound here -- confirm callers never pass it empty.
      if offset == -1:
        # Exact match not found, so take suffix match if it exists.
        offset = first_match
        target_dict = first_match_dict
      # Missing strings happen when optimizations make them unused.
      if offset != -1:
        # Encode tuple as a string for easier marshalling.
        target_dict[path].append(
            str(offset) + ':' + str(len(value)))

  return [concurrent.EncodeDictOfLists(x) for x in ret]
def testEncodeDictOfLists_JoinMultiple(self):
  """Joining encodings (one of them empty) decodes to the merged dict."""
  test_dict1 = {'key1': ['a']}
  test_dict2 = {'key2': ['b']}
  expected = {'key1': ['a'], 'key2': ['b']}
  encoded1 = concurrent.EncodeDictOfLists(test_dict1)
  # An empty dict in the middle must not disturb its neighbours.
  encoded2 = concurrent.EncodeDictOfLists({})
  encoded3 = concurrent.EncodeDictOfLists(test_dict2)
  encoded = concurrent.JoinEncodedDictOfLists([encoded1, encoded2, encoded3])
  decoded = concurrent.DecodeDictOfLists(encoded)
  # assertEquals is a deprecated alias of assertEqual.
  self.assertEqual(expected, decoded)
def ResolveStringPieces(encoded_strings_by_path, string_data):
  """Annotates |string_data| with the strings listed for each path.

  Returns:
    A list of encoded dicts, one per element returned by
    |_AnnotateStringData|.
  """
  # ast.literal_eval() undoes repr() applied to strings.
  decoded_strings_by_path = concurrent.DecodeDictOfLists(
      encoded_strings_by_path, value_transform=ast.literal_eval)

  # Lazily flattens {path: [strings]} into (path, string) pairs.
  path_and_values = (
      (path, literal)
      for path, literals in decoded_strings_by_path.iteritems()
      for literal in literals)

  annotated = _AnnotateStringData(string_data, path_and_values)
  return [concurrent.EncodeDictOfLists(entry) for entry in annotated]
def testAnalyzer(self):
  """Checks the strings extracted from 'test.o' for two char width limits.

  Runs once with |_CHAR_WIDTH_LIMIT| set to 2 (4-byte strings excluded) and
  once with it set to 4 (4-byte strings included).
  """
  # Save global param in bcanalyzer.
  saved_char_width_limit = bcanalyzer._CHAR_WIDTH_LIMIT

  try:
    for width_limit, include_4byte_strings in [(2, False), (4, True)]:
      # Tweak global param in bcanalyzer.
      bcanalyzer._CHAR_WIDTH_LIMIT = width_limit

      encoded_results = bcanalyzer.RunBcAnalyzerOnIntermediates(
          ['test.o'], _TEST_TOOL_PREFIX, _TEST_OUTPUT_DIR)
      results = concurrent.DecodeDictOfLists(
          encoded_results, value_transform=ast.literal_eval)
      # list() keeps the comparison valid even where keys() is not a list.
      self.assertEqual(['test.o'], list(results.keys()))
      str_list = results['test.o']
      # See mock_bcanalyzer.py for details on the C++ test file.
      expected = []
      expected.append(_MakeString(8, ['Test1a', 0]))
      expected.append(_MakeString(8, ['Test1b', 0]))
      expected.append(_MakeString(8, ['Test2a', 0]))
      expected.append(_MakeString(8, ['Test2b', 0]))
      expected.append(_MakeString(16, ['Test3a', 0]))
      expected.append(_MakeString(16, ['Test3b', 0]))
      if include_4byte_strings:
        expected.append(_MakeString(32, ['Test4a', 0]))
        expected.append(_MakeString(32, ['Test4b', 0]))
      expected.append(_MakeString(8, [1, 0, 0, 1, 1, 0]))
      expected.append(_MakeString(8, [1, 0, 0, 1, 1, 1]))
      expected.append(_MakeString(8, ['Test5a', 0]))
      expected.append(_MakeString(8, ['Test5b', 1]))
      expected.append(_MakeString(16, ['Test6a', 0]))
      expected.append(_MakeString(16, ['Test6b', 1]))
      if include_4byte_strings:
        expected.append(_MakeString(32, ['Test7a', 0]))
        expected.append(_MakeString(32, ['Test7b', 1]))
      expected.append(_MakeString(8, ['Test8a', 0]))
      expected.append(_MakeString(8, ['Test8b', 0]))
      # Exclude |{u8a, u8b, u16a, u16b, u32a, u32b, u64a, u64b}|.
      # Exclude |{s8empty, s16empty, s32empty}|.
      expected.append(_MakeString(8, ['1a', 0]))
      # Exclude |zeros|, which should be in .bss section.
      self.assertEqual(expected, str_list)
  finally:
    # Restore global param in bcanalyzer, even when an assertion above fails,
    # so that the tweaked limit cannot leak into other tests.
    bcanalyzer._CHAR_WIDTH_LIMIT = saved_char_width_limit
def AnalyzePaths(self, paths):
  """Collects symbol names for |paths| and records them as a new batch.

  Each path ending in '.a' becomes its own job; all other paths are grouped
  into fixed-size batches. The resulting name -> paths mapping is appended
  to |self._batches|.
  """
  tool_prefix = self._tool_prefix
  output_directory = self._output_directory

  job_params = []
  loose_objects = []
  for candidate in paths:
    if candidate.endswith('.a'):
      job_params.append((candidate, tool_prefix, output_directory))
    else:
      loose_objects.append(candidate)

  CHUNK_SIZE = 50  # Chosen arbitrarily.
  for start in xrange(0, len(loose_objects), CHUNK_SIZE):
    chunk = loose_objects[start:start + CHUNK_SIZE]
    job_params.append((chunk, tool_prefix, output_directory))

  merged_paths_by_name = collections.defaultdict(list)
  job_results = concurrent.BulkForkAndCall(_BatchCollectNames, job_params)
  for encoded_result in job_results:
    decoded = concurrent.DecodeDictOfLists(*encoded_result)
    for path, names in decoded.iteritems():
      for name in names:
        merged_paths_by_name[name].append(path)
  self._batches.append(merged_paths_by_name)
def decode(encoded):
  """Decodes |encoded| into a dict of lists, passing int as key_transform."""
  decoded = concurrent.DecodeDictOfLists(encoded, key_transform=int)
  return decoded
def testEncodeDictOfLists_Join_Singl(self):
  """Joining a single encoded dict decodes back to the original dict."""
  test_dict1 = {'key1': ['a']}
  encoded1 = concurrent.EncodeDictOfLists(test_dict1)
  encoded = concurrent.JoinEncodedDictOfLists([encoded1])
  decoded = concurrent.DecodeDictOfLists(encoded)
  # assertEquals is a deprecated alias of assertEqual.
  self.assertEqual(test_dict1, decoded)
def testEncodeDictOfLists_ValueTransform(self):
  """Decoding with value_transform=int turns string values into ints."""
  test_dict = {'a': ['0', '1', '2'], 'b': ['3', '4']}
  expected = {'a': [0, 1, 2], 'b': [3, 4]}
  encoded = concurrent.EncodeDictOfLists(test_dict)
  decoded = concurrent.DecodeDictOfLists(encoded, value_transform=int)
  # assertEquals is a deprecated alias of assertEqual.
  self.assertEqual(expected, decoded)
def testEncodeDictOfLists_KeyTransform(self):
  """Int keys round-trip when encoded with str and decoded with int."""
  test_dict = {0: ['a', 'b', 'c'], 9: ['a', 'b']}
  encoded = concurrent.EncodeDictOfLists(test_dict, key_transform=str)
  decoded = concurrent.DecodeDictOfLists(encoded, key_transform=int)
  # assertEquals is a deprecated alias of assertEqual.
  self.assertEqual(test_dict, decoded)
def testEncodeDictOfLists_AllStrings(self):
  """A dict with string keys and string values round-trips unchanged."""
  test_dict = {'foo': ['a', 'b', 'c'], 'foo2': ['a', 'b']}
  encoded = concurrent.EncodeDictOfLists(test_dict)
  decoded = concurrent.DecodeDictOfLists(encoded)
  # assertEquals is a deprecated alias of assertEqual.
  self.assertEqual(test_dict, decoded)
def GetStringPositions(self):
  """Returns the stored encoded string positions, decoded.

  Returns:
    A list with one decoded dict per stored encoded entry, with each value
    passed through |_DecodePosition|.
  """
  decoded_positions = []
  for encoded in self._list_of_encoded_elf_string_positions_by_path:
    decoded_positions.append(
        concurrent.DecodeDictOfLists(encoded, value_transform=_DecodePosition))
  return decoded_positions
def GetSymbolNames(self):
  """Requests symbol names over the pipe and returns them decoded."""
  request = (_MSG_GET_SYMBOL_NAMES,)
  self._pipe.send(request)
  self._pipe.recv()  # None
  logging.debug('Decoding nm results from forked process')
  return concurrent.DecodeDictOfLists(self._pipe.recv())
def testEncodeDictOfLists_EmptyValue(self):
  """A key whose value is an empty list round-trips unchanged."""
  test_dict = {'foo': []}
  encoded = concurrent.EncodeDictOfLists(test_dict)
  decoded = concurrent.DecodeDictOfLists(encoded)
  # assertEquals is a deprecated alias of assertEqual.
  self.assertEqual(test_dict, decoded)