def walk_binary(binr):
    """Scan a binary blob for embedded serialized FileDescriptorProto
    messages and yield each one converted back to .proto source text.

    Compiled protobuf bindings embed the original descriptor; the ".proto"
    (or ".protodevel") suffix of its "name" field (field 1) is used as an
    anchor to locate candidate descriptors heuristically.

    Args:
        binr: Raw bytes to scan, or a str path to a file whose contents
            will be read and scanned.

    Yields:
        The result of descpb_to_proto() for each descriptor successfully
        parsed out of the blob.
    """
    if isinstance(binr, str):
        with open(binr, 'rb') as fd:
            binr = fd.read()

    # Search for:
    # ".proto" or ".protodevel", as part of the "name" (1) field
    cursor = 0
    while cursor < len(binr):
        cursor = binr.find(b'.proto', cursor)
        if cursor == -1:
            break
        cursor += len('.proto')
        # Also swallow the "devel" suffix of ".protodevel" when present.
        cursor += (binr[cursor:cursor + 5] == b'devel') * 5

        # Search back for the (1, length-delimited) marker byte 0x0a,
        # at most 1024 bytes before the end of the name.
        start = binr.rfind(b'\x0a', max(cursor - 1024, 0), cursor)

        # When the name length itself is 0x0a (10 bytes), the byte found
        # above may actually be the length; the real tag then sits one
        # byte earlier. Check whether length byte is coherent.
        if start > 0 and binr[start - 1] == 0x0a == (cursor - start - 1):
            start -= 1
        if start == -1:
            continue

        # The varint following the tag must equal the length of the name
        # we just matched, otherwise this 0x0a was a false positive.
        varint, end = _DecodeVarint(binr, start + 1)
        if cursor - end != varint:
            continue

        # Look just after for subsequent markers: fields 2..12 of
        # FileDescriptorProto, which appear in ascending tag order — so
        # the allowed tag set is narrowed as we advance.
        tags = b'\x12\x1a\x22\x2a\x32\x3a\x42\x4a\x50\x58\x62'
        # Bug fix: guard cursor before indexing — a ".proto" match ending
        # exactly at the end of the buffer used to raise IndexError here.
        if cursor >= len(binr) or binr[cursor] not in tags:
            continue
        while cursor < len(binr) and binr[cursor] in tags:
            tags = tags[tags.index(binr[cursor]):]
            varint, end = _DecodeVarint(binr, cursor + 1)
            # Length-delimited fields (wire type 2) skip their payload;
            # varint fields (wire type 0) only skip the varint itself.
            cursor = end + varint * (binr[cursor] & 0b111 == 2)

        # Parse descriptor
        proto = FileDescriptorProto()
        proto.ParseFromString(binr[start:cursor])

        # Convert to ascii
        yield descpb_to_proto(proto)
def nest_and_print_to_files(msg_path_to_obj, msg_to_referrers):
    """Regroup extracted protobuf messages/enums into files and stringify.

    Nests each message/enum under its single same-package referrer when
    possible, resolves mutual imports and duplicate enum field names,
    renames remaining top-level messages, then emits one .proto file per
    top-level message.

    Args:
        msg_path_to_obj: dict mapping a message/enum path (with '$' as the
            nesting separator, e.g. "pkg.Outer$Inner") to its descriptor
            object (DescriptorProto or enum descriptor — TODO confirm).
        msg_to_referrers: dict mapping a message path to a list of
            (field, referrer_path, is_group) tuples for each use site.

    Yields:
        (file_name, file_text) tuples, one per generated .proto file.
    """
    msg_to_topmost = {}   # msg path -> path of its topmost enclosing message
    msg_to_newloc = {}    # msg path -> new (possibly renamed) location
    newloc_to_msg = {}    # reverse mapping of msg_to_newloc

    # Build the import graph: referrer -> list of messages it refers to.
    msg_to_imports = defaultdict(list)
    for msg, referrers in msg_to_referrers.items():
        for _, referrer, _ in referrers:
            msg_to_imports[referrer].append(msg)

    # Iterate over referred to messages/groups/enums.
    # Merge groups first:
    msg_to_referrers = OrderedDict(
        sorted(msg_to_referrers.items(), key=lambda x: -x[1][0][2]))

    mergeable = {}
    enumfield_to_enums = defaultdict(set)  # qualified field name -> enums using it
    enum_to_dupfields = defaultdict(set)   # enum path -> its duplicated field names

    for msg, referrers in dict(msg_to_referrers).items():
        msg_pkg = get_pkg(msg)
        msg_obj = msg_path_to_obj[msg]

        # Check for duplicate enum fields in the same package:
        if not isinstance(msg_obj, DescriptorProto):
            for enum_field in msg_obj.value:
                name = msg_pkg + '.' + enum_field.name
                enumfield_to_enums[name].add(msg)
                if len(enumfield_to_enums[name]) > 1:
                    for other_enum in enumfield_to_enums[name]:
                        enum_to_dupfields[other_enum].add(name)

        first_field = referrers[0]
        field, referrer, is_group = first_field

        # Check whether message/enum has exactly one reference in this
        # package:
        if not is_group:
            in_pkg = [(field, referrer) for field, referrer, _ in referrers \
                      if (get_pkg(referrer) == msg_pkg or not msg_pkg) \
                      and msg_to_topmost.get(referrer, referrer) != msg \
                      and not msg_path_to_obj[referrer].options.map_entry \
                      and ('$' not in msg or msg.split('.')[-1].split('$')[0] == \
                           referrer.split('.')[-1].split('$')[0])]
            if len({i for _, i in in_pkg}) != 1:
                # It doesn't. Keep for the next step
                if in_pkg:
                    mergeable[msg] = in_pkg
                continue
            else:
                field, referrer = in_pkg[0]
        else:
            assert len(referrers) == 1

        merge_and_rename(msg, referrer, msg_pkg, is_group,
                         msg_to_referrers, msg_to_topmost, msg_to_newloc,
                         msg_to_imports, msg_path_to_obj, newloc_to_msg)

    # Try to fix recursive (mutual) imports, and conflicting enum field
    # names.
    for msg, in_pkg in mergeable.items():
        duplicate_enumfields = enum_to_dupfields.get(msg, set())
        # Prefer nesting under the least deeply nested referrer.
        for field, referrer in sorted(
                in_pkg, key=lambda x: msg_to_newloc.get(x[1], x[1]).count('.')):
            top_referrer = msg_to_topmost.get(referrer, referrer)
            # Merge when the two files would import each other, or when
            # this enum carries a conflicting field name.
            if (msg in msg_to_imports[top_referrer] and \
                top_referrer in msg_to_imports[msg] and \
                msg_to_topmost.get(referrer, referrer) != msg) or \
               duplicate_enumfields:
                merge_and_rename(msg, referrer, get_pkg(msg), False,
                                 msg_to_referrers, msg_to_topmost, msg_to_newloc,
                                 msg_to_imports, msg_path_to_obj, newloc_to_msg)
                break
        # Nesting this enum resolves its conflicts; if only one sibling is
        # left for a field name, that sibling no longer conflicts either.
        for dupfield in duplicate_enumfields:
            siblings = enumfield_to_enums[dupfield]
            siblings.remove(msg)
            if len(siblings) == 1:
                enum_to_dupfields[siblings.pop()].remove(dupfield)

    for msg, msg_obj in msg_path_to_obj.items():
        # If we're a top-level message, enforce name transforms anyway
        if msg not in msg_to_topmost:
            new_name = msg_obj.name.split('$')[-1]
            new_name = new_name[0].upper() + new_name[1:]
            msg_pkg = get_pkg(msg)
            if msg_pkg:
                msg_pkg += '.'
            if new_name != msg_obj.name:
                # Append underscores until the new name no longer collides
                # with another top-level definition.
                while newloc_to_msg.get(msg_pkg + new_name, msg_pkg + new_name) in msg_path_to_obj and \
                      newloc_to_msg.get(msg_pkg + new_name, msg_pkg + new_name) not in msg_to_topmost:
                    new_name += '_'
                msg_obj.name = new_name
                fix_naming(msg_obj, msg_pkg + new_name, msg, msg,
                           msg_to_referrers, msg_to_topmost, msg_to_newloc,
                           msg_to_imports, msg_path_to_obj, newloc_to_msg)

    # Turn messages into individual files and stringify.
    path_to_file = OrderedDict()
    path_to_defines = defaultdict(list)

    for msg, msg_obj in msg_path_to_obj.items():
        if msg not in msg_to_topmost:
            # One output file per top-level message, named after its path.
            path = msg.split('$')[0].replace('.', '/') + '.proto'
            if path not in path_to_file:
                path_to_file[path] = FileDescriptorProto()
                path_to_file[path].syntax = 'proto2'
                path_to_file[path].package = get_pkg(msg)
                path_to_file[path].name = path
            file_obj = path_to_file[path]

            # Record cross-file dependencies for "import" statements,
            # skipping self-imports and messages nested elsewhere.
            for imported in msg_to_imports[msg]:
                import_path = imported.split('$')[0].replace('.', '/') + '.proto'
                if import_path != path and imported not in msg_to_topmost:
                    if import_path not in file_obj.dependency:
                        file_obj.dependency.append(import_path)

            if isinstance(msg_obj, DescriptorProto):
                nested = file_obj.message_type.add()
            else:
                nested = file_obj.enum_type.add()
            nested.MergeFrom(msg_obj)

            # Track every definition landing in this file (excluding
            # synthetic map entries) for the header comment below.
            path_to_defines[path].append(msg)
            path_to_defines[path] += [
                k for k, v in msg_to_topmost.items()
                if v == msg and '$map' not in k
            ]

    for path, file_obj in path_to_file.items():
        name, proto = descpb_to_proto(file_obj)
        # Prepend a block comment listing every message defined in the file.
        header_lines = ['/**', 'Messages defined in this file:\n']
        header_lines += path_to_defines[path]
        yield name, '\n * '.join(header_lines) + '\n */\n\n' + proto