Ejemplo n.º 1
0
def CollectAliasesByAddress(elf_path, tool_prefix):
  """Runs nm on |elf_path| and returns a dict of address->[names].

  Only symbols whose nm type character is 't', 'T' or 'W', whose name passes
  _IsRelevantNmName(), and whose address is non-zero are considered.

  Args:
    elf_path: Path of the ELF file that is passed to nm.
    tool_prefix: Passed to path_util.GetNmPath() to locate the nm binary.

  Returns:
    A dict mapping each address that has more than one distinct name (after
    demangle.DemangleSetsInDicts()) to the sorted list of those names.

  Raises:
    subprocess.CalledProcessError: If nm exits with a non-zero status.
  """
  # Constructors often show up twice, so use sets to ensure no duplicates.
  names_by_address = collections.defaultdict(set)

  # About 60mb of output, but piping takes ~30s, and loading it into RAM
  # directly takes 3s.
  args = [path_util.GetNmPath(tool_prefix), '--no-sort', '--defined-only',
          elf_path]
  # universal_newlines=True makes check_output() return text instead of bytes
  # on Python 3 (the flag is accepted by Python 2.7 too), so the str-based
  # slicing and comparisons below work on either interpreter.
  output = subprocess.check_output(args, universal_newlines=True)
  for line in output.splitlines():
    # Each line looks like: "<hex address> <type char> <name>".
    space_idx = line.find(' ')
    address_str = line[:space_idx]
    section = line[space_idx + 1]
    mangled_name = line[space_idx + 3:]

    # To verify that rodata does not have aliases:
    #   nm --no-sort --defined-only libchrome.so > nm.out
    #   grep -v '\$' nm.out | grep ' r ' | sort | cut -d' ' -f1 > addrs
    #   wc -l < addrs; uniq < addrs | wc -l
    if section not in 'tTW' or not _IsRelevantNmName(mangled_name):
      continue

    address = int(address_str, 16)
    if not address:
      continue
    names_by_address[address].add(mangled_name)

  # Demangle all names.
  names_by_address = demangle.DemangleSetsInDicts(names_by_address, tool_prefix)

  # Since this is run in a separate process, minimize data passing by returning
  # only aliased symbols.
  # Also: Sort to ensure stable ordering.
  # .items() (rather than the Python 2-only .iteritems()) works on both
  # Python 2 and Python 3.
  return {k: sorted(v) for k, v in names_by_address.items() if len(v) > 1}
Ejemplo n.º 2
0
def RunNmOnIntermediates(target, tool_prefix, output_directory):
    """Runs nm on intermediate files and returns the encoded, parsed output.

    Args:
        target: Either a single path to a .a (as a string), or a list of .o
            paths.
        tool_prefix: Passed to path_util.GetNmPath() to locate the nm binary.
        output_directory: Working directory that nm is run from.

    Returns:
        A 3-tuple of (encoded_symbol_names_by_path,
        encoded_string_addresses_by_path, num_no_symbols). The first two are
        produced by parallel.EncodeDictOfLists() (or are
        parallel.EMPTY_ENCODED_DICT when nm printed nothing to stdout), and
        num_no_symbols is the number of lines nm wrote to stderr.

    Raises:
        AssertionError: If nm exits with a non-zero status, or if nm's output
            for an archive does not start with a blank line.
    """
    is_archive = isinstance(target, str)
    args = [path_util.GetNmPath(tool_prefix), '--no-sort', '--defined-only']
    if is_archive:
        args.append(target)
    else:
        args.extend(target)
    # pylint: disable=unexpected-keyword-arg
    proc = subprocess.Popen(args,
                            cwd=output_directory,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            encoding='utf-8')
    # llvm-nm can print 'no symbols' to stderr. Capture and count the number of
    # lines, to be returned to the caller.
    stdout, stderr = proc.communicate()
    assert proc.returncode == 0, 'NM failed: ' + ' '.join(args)
    num_no_symbols = len(stderr.splitlines())
    lines = stdout.splitlines()
    # Empty .a file has no output.
    if not lines:
        # Return the same 3-tuple shape as the normal path below so that
        # callers can unpack the result uniformly.
        return (parallel.EMPTY_ENCODED_DICT, parallel.EMPTY_ENCODED_DICT,
                num_no_symbols)
    # When several files are listed, the output starts with a blank line
    # followed by a "<path>:" header for the first file.
    is_multi_file = not lines[0]
    lines = iter(lines)
    if is_multi_file:
        next(lines)
        path = next(lines)[:-1]  # Path ends with a colon.
    else:
        assert not is_archive
        path = target[0]

    symbol_names_by_path = {}
    string_addresses_by_path = {}
    while path:
        if is_archive:
            # E.g. foo/bar.a(baz.o)
            path = '%s(%s)' % (target, path)

        mangled_symbol_names, string_addresses = _ParseOneObjectFileNmOutput(
            lines)
        symbol_names_by_path[path] = mangled_symbol_names
        if string_addresses:
            string_addresses_by_path[path] = string_addresses
        # The next line is the following file's "<path>:" header. Once the
        # iterator is exhausted, the ':' default becomes '' and ends the loop.
        path = next(lines, ':')[:-1]

    # The multiprocess API uses pickle, which is ridiculously slow. More than 2x
    # faster to use join & split.
    # TODO(agrieve): We could use path indices as keys rather than paths to cut
    #     down on marshalling overhead.
    return (parallel.EncodeDictOfLists(symbol_names_by_path),
            parallel.EncodeDictOfLists(string_addresses_by_path),
            num_no_symbols)
Ejemplo n.º 3
0
def _RunNmOnIntermediates(target, tool_prefix, output_directory):
    """Runs nm --demangle on intermediate files and encodes the parsed output.

    Args:
        target: Either a single path to a .a (as a string), or a list of .o
            paths.
        tool_prefix: Passed to path_util.GetNmPath() to locate the nm binary.
        output_directory: Working directory that nm is run from.

    Returns:
        A pair (encoded_symbol_names_by_path,
        encoded_string_addresses_by_path).
    """
    target_is_archive = isinstance(target, basestring)
    nm_cmd = [
        path_util.GetNmPath(tool_prefix), '--no-sort', '--defined-only',
        '--demangle'
    ]
    # An archive is a single argument; object files are passed individually.
    nm_cmd += [target] if target_is_archive else target

    nm_lines = subprocess.check_output(nm_cmd,
                                       cwd=output_directory).splitlines()
    if not nm_lines:
        # Empty .a file has no output.
        return concurrent.EMPTY_ENCODED_DICT, concurrent.EMPTY_ENCODED_DICT

    line_iter = iter(nm_lines)
    if nm_lines[0]:
        # Single-file output has no "<path>:" header, so the path is taken
        # from the (sole) entry of |target|.
        assert not target_is_archive
        current_path = target[0]
    else:
        # Multi-file output: skip the leading blank line, then read the first
        # "<path>:" header and strip its trailing colon.
        next(line_iter)
        current_path = next(line_iter)[:-1]

    names_by_path = {}
    addresses_by_path = {}
    while current_path:
        # Archive members are keyed as e.g. foo/bar.a(baz.o).
        if target_is_archive:
            key = '%s(%s)' % (target, current_path)
        else:
            key = current_path

        parsed = _ParseOneObjectFileNmOutput(line_iter)
        string_addresses, symbol_names = parsed
        names_by_path[key] = symbol_names
        if string_addresses:
            addresses_by_path[key] = string_addresses

        # Read the next header; on exhaustion the ':' default yields '' and
        # terminates the loop.
        current_path = next(line_iter, ':')[:-1]

    # The multiprocess API uses pickle, which is ridiculously slow. More than 2x
    # faster to use join & split.
    # TODO(agrieve): We could use path indices as keys rather than paths to cut
    #     down on marshalling overhead.
    encoded_names = concurrent.EncodeDictOfLists(names_by_path)
    encoded_addresses = concurrent.EncodeDictOfLists(addresses_by_path)
    return (encoded_names, encoded_addresses)
Ejemplo n.º 4
0
def CollectAliasesByAddress(elf_path, tool_prefix):
  """Runs nm on |elf_path| and returns a dict of address->[names].

  Only symbols whose nm type character is 't', 'T' or 'W', whose name passes
  _IsRelevantNmName(), and whose address is non-zero are considered. All
  OUTLINED_FUNCTION_* entries at one address are collapsed into a single
  '** outlined function' name (suffixed with ' * N' when N > 1).

  Args:
    elf_path: Path of the ELF file that is passed to nm.
    tool_prefix: Passed to path_util.GetNmPath() to locate the nm binary.

  Returns:
    A dict from address to a sorted list of names (names starting with '**'
    are ordered last). An address is included only if it has more than one
    name, or more than one outlined function.
  """

  def _OutlinedLast(name):
    # False sorts before True, so regular names precede '**' names.
    return (name.startswith('**'), name)

  # About 60mb of output, but piping takes ~30s, and loading it into RAM
  # directly takes 3s.
  nm_cmd = [path_util.GetNmPath(tool_prefix), '--no-sort', '--defined-only',
            elf_path]
  # pylint: disable=unexpected-keyword-arg
  nm_proc = subprocess.Popen(
      nm_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8')
  # llvm-nm may write to stderr. Discard to denoise.
  nm_stdout, _ = nm_proc.communicate()
  assert nm_proc.returncode == 0

  # Constructors often show up twice, so use sets to ensure no duplicates.
  aliases = collections.defaultdict(set)

  # Many OUTLINED_FUNCTION_* entries can share one address, possibly alongside
  # regular symbols. Keeping each of them is undesirable because:
  # * They can have many duplicates, which would cause false associations
  #   downstream when looking up object_paths from names.
  # * For addresses with several such entries the associated object_path
  #   cannot be recovered (exception: the one entry in the .map file, for LLD
  #   without ThinLTO), so the copies carry no useful information.
  # Instead, they are merged into one symbol per address, and the number of
  # copies is recorded in the name, e.g., '** outlined function * 5' (the
  # count is not used when computing PSS).
  outlined_counts = collections.Counter()

  for entry in nm_stdout.splitlines():
    sep = entry.find(' ')
    symbol_type = entry[sep + 1]
    symbol_name = entry[sep + 3:]

    # To verify that rodata does not have aliases:
    #   nm --no-sort --defined-only libchrome.so > nm.out
    #   grep -v '\$' nm.out | grep ' r ' | sort | cut -d' ' -f1 > addrs
    #   wc -l < addrs; uniq < addrs | wc -l
    if symbol_type not in 'tTW':
      continue
    if not _IsRelevantNmName(symbol_name):
      continue

    address = int(entry[:sep], 16)
    if address == 0:
      continue

    if symbol_name.startswith('OUTLINED_FUNCTION_'):
      outlined_counts[address] += 1
      continue
    aliases[address].add(symbol_name)

  # The merged names must be added before demangling, because demangling
  # replaces |aliases| with a structure of a different type.
  for address, copies in outlined_counts.items():
    merged_name = '** outlined function'
    if copies > 1:
      merged_name += ' * %d' % copies
    aliases[address].add(merged_name)

  # Demangle all names.
  demangled = demangle.DemangleSetsInDicts(aliases, tool_prefix)

  # Since this is run in a separate process, minimize data passing by returning
  # only aliased symbols. Sorting ensures stable ordering.
  result = {}
  for address, names in demangled.items():
    if len(names) > 1 or outlined_counts.get(address, 0) > 1:
      result[address] = sorted(names, key=_OutlinedLast)
  return result
Ejemplo n.º 5
0
def CreateUniqueSymbols(elf_path, tool_prefix, section_ranges):
    """Creates symbols from nm --print-size output.

    Creates only one symbol for each address (does not create symbol aliases).
    When several names share an address, an entry that defines a non-zero size
    is kept in preference to one that does not.

    Args:
        elf_path: Path of the ELF file that is passed to nm.
        tool_prefix: Passed to path_util.GetNmPath() to locate the nm binary.
        section_ranges: Mapping of section name -> (start address, size). Only
            entries whose name is in models.NATIVE_SECTIONS are used.

    Returns:
        A list of models.Symbol ordered by address, each with section_name
        assigned from the section range containing its address, and with a
        size filled in where nm did not report one.

    Raises:
        IndexError: If no entry of |section_ranges| is in
            models.NATIVE_SECTIONS.
        subprocess.CalledProcessError: If nm exits with a non-zero status.
    """
    # Filter to sections we care about and sort by (address, size).
    section_ranges = [
        x for x in section_ranges.items() if x[0] in models.NATIVE_SECTIONS
    ]
    section_ranges.sort(key=lambda x: x[1])
    # Lowest start address, and end (start + size) of the last sorted section.
    min_address = section_ranges[0][1][0]
    max_address = sum(section_ranges[-1][1])

    args = [
        path_util.GetNmPath(tool_prefix), '--no-sort', '--defined-only',
        '--print-size', elf_path
    ]
    # pylint: disable=unexpected-keyword-arg
    stdout = subprocess.check_output(args,
                                     stderr=subprocess.DEVNULL,
                                     encoding='utf-8')
    lines = stdout.splitlines()
    logging.debug('Parsing %d lines of output', len(lines))
    symbols_by_address = {}
    # Example 32-bit output:
    # 00857f94 00000004 t __on_dlclose_late
    # 000001ec r ndk_build_number
    for line in lines:
        # At most 4 fields: address, [size], type character, name.
        tokens = line.split(' ', 3)
        num_tokens = len(tokens)
        if num_tokens < 3:
            # Address with no size and no name.
            continue
        address_str = tokens[0]
        # Check if size is omitted (can happen with binutils but not llvm).
        if num_tokens == 3:
            size_str = '0'
            section = tokens[1]
            mangled_name = tokens[2]
        else:
            size_str = tokens[1]
            section = tokens[2]
            mangled_name = tokens[3]

        if section not in 'BbDdTtRrWw' or not _IsRelevantNmName(mangled_name):
            continue

        address = int(address_str, 16)

        # Ignore symbols outside of sections that we care about.
        # Symbols can still exist in sections that we do not care about if those
        # sections are interleaved. We discard such symbols in the next loop.
        if not min_address <= address < max_address:
            continue

        # Pick the alias that defines a size.
        # (An existing entry with size 0 is overwritten by the current one.)
        existing_alias = symbols_by_address.get(address)
        if existing_alias and existing_alias.size > 0:
            continue

        size = int(size_str, 16)

        # E.g.: .str.2.llvm.12282370934750212
        if mangled_name.startswith('.str.'):
            mangled_name = models.STRING_LITERAL_NAME
        elif mangled_name.startswith('__ARMV7PILongThunk_'):
            # Convert thunks from prefix to suffix so that name is demangleable.
            mangled_name = mangled_name[len('__ARMV7PILongThunk_'
                                            ):] + '.LongThunk'
        elif mangled_name.startswith('__ThumbV7PILongThunk_'):
            mangled_name = mangled_name[len('__ThumbV7PILongThunk_'
                                            ):] + '.LongThunk'

        # Use address (next loop) to determine between .data and .data.rel.ro.
        # Type characters 'Dd' and 'Ww' therefore leave section_name as None
        # here.
        section_name = None
        if section in 'Tt':
            section_name = models.SECTION_TEXT
        elif section in 'Rr':
            section_name = models.SECTION_RODATA
        elif section in 'Bb':
            section_name = models.SECTION_BSS

        # No need to demangle names since they will be demangled by
        # DemangleRemainingSymbols().
        symbols_by_address[address] = models.Symbol(section_name,
                                                    size,
                                                    address=address,
                                                    full_name=mangled_name)

    logging.debug('Sorting %d NM symbols', len(symbols_by_address))
    # Sort symbols by address.
    sorted_symbols = sorted(symbols_by_address.values(),
                            key=lambda s: s.address)

    # Assign section to symbols based on address, and size where unspecified.
    # Use address rather than nm's section character to distinguish between
    # .data.rel.ro and .data.
    logging.debug('Assigning section_name and filling in missing sizes')
    section_range_iter = iter(section_ranges)
    # -1 forces the first symbol to pull the first section from the iterator.
    section_end = -1
    raw_symbols = []
    # Most recently kept symbol that had no size from nm; None once a symbol
    # with a size is kept.
    active_assembly_sym = None
    for i, sym in enumerate(sorted_symbols):
        # Move to next section if applicable.
        # Both the symbols and the sections are sorted by address, so the
        # section iterator only ever advances.
        while sym.address >= section_end:
            section_range = next(section_range_iter)
            section_name, (section_start, section_size) = section_range
            section_end = section_start + section_size

        # Skip symbols that don't fall into a section that we care about
        # (e.g. GCC_except_table533 from .eh_frame).
        if sym.address < section_start:
            continue

        if sym.section_name and sym.section_name != section_name:
            logging.warning('Re-assigning section for %r to %s', sym,
                            section_name)
        sym.section_name = section_name

        # Upper bound for this symbol's extent: the next symbol's address, or
        # the end of the current section for the last symbol.
        if i + 1 < len(sorted_symbols):
            next_addr = sorted_symbols[i + 1].address
        else:
            next_addr = section_end

        # Heuristic: Discard subsequent assembly symbols (no size) that are ALL_CAPS
        # or .-prefixed, since they are likely labels within a function.
        if (active_assembly_sym and sym.size == 0
                and sym.section_name == models.SECTION_TEXT):
            if sym.full_name.startswith('.') or sym.full_name.isupper():
                # Fold the label's span into the preceding size-less symbol.
                active_assembly_sym.size += next_addr - sym.address
                # Triggers ~30 times for all of libchrome.so.
                logging.debug('Discarding assembly label: %s', sym.full_name)
                continue

        active_assembly_sym = sym if sym.size == 0 else None

        # For assembly symbols:
        # Add in a size when absent and guard against size overlapping next symbol.
        if active_assembly_sym or sym.end_address > next_addr:
            sym.size = next_addr - sym.address

        raw_symbols.append(sym)

    return raw_symbols