def extract(self, member_name, dest_dir):
    if '..' in member_name.split('/'):
        raise ValueError('relative path in squashfs')
    cmd = ['unsquashfs', '-n', '-f', '-d', dest_dir,
           self.source.path, member_name]
    logger.debug('unsquashfs %s into %s', member_name, dest_dir)
    subprocess.check_call(cmd, shell=False, stdout=subprocess.PIPE)
    return '%s%s' % (dest_dir, member_name)
def perform_fuzzy_matching(members1, members2):
    if tlsh is None or Config.general.fuzzy_threshold == 0:
        return
    already_compared = set()
    # Take local copies, because they will be modified by the consumer
    members1 = dict(members1)
    members2 = dict(members2)
    for name1, file1 in members1.items():
        if file1.is_directory() or not file1.fuzzy_hash:
            continue
        comparisons = []
        for name2, file2 in members2.items():
            if name2 in already_compared or file2.is_directory() \
               or not file2.fuzzy_hash:
                continue
            comparisons.append(
                (tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2))
        if comparisons:
            comparisons.sort(key=operator.itemgetter(0))
            score, name2 = comparisons[0]
            logger.debug('fuzzy top match %s %s: %d difference score',
                         name1, name2, score)
            if score < Config.general.fuzzy_threshold:
                yield name1, name2, score
                already_compared.add(name2)
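# A minimal, self-contained sketch of the TLSH primitives used by
# perform_fuzzy_matching(), assuming the python-tlsh bindings: tlsh.hash()
# computes a fuzzy hash over a byte string, and tlsh.diff() returns a
# distance score where 0 means identical and larger means more different.
# The sample data below is made up purely for illustration.
def _tlsh_demo():
    import tlsh
    data1 = bytes(range(256)) * 4
    data2 = bytes(range(256)) * 4 + b'some trailing difference'
    h1 = tlsh.hash(data1)
    h2 = tlsh.hash(data2)
    # perform_fuzzy_matching() keeps a pair only if this score falls
    # below Config.general.fuzzy_threshold.
    return tlsh.diff(h1, h2)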
def output_difference(difference, print_func, css_url, directory, parents):
    logger.debug('html output for %s', difference.source1)
    sources = parents + [difference.source1]
    print_func(u"<div class='difference'>")
    try:
        print_func(u"<div class='diffheader'>")
        if difference.source1 == difference.source2:
            print_func(u"<div><span class='source'>%s</span>"
                       % escape(difference.source1))
        else:
            print_func(u"<div><span class='source'>%s</span> vs.</div>"
                       % escape(difference.source1))
            print_func(u"<div><span class='source'>%s</span>"
                       % escape(difference.source2))
        anchor = '/'.join(sources[1:])
        print_func(u" <a class='anchor' href='#%s' name='%s'>\xb6</a>"
                   % (anchor, anchor))
        print_func(u"</div>")
        if difference.comments:
            print_func(u"<div class='comment'>%s</div>"
                       % u'<br />'.join(map(escape, difference.comments)))
        print_func(u"</div>")
        if difference.unified_diff:
            output_unified_diff(print_func, css_url, directory,
                                difference.unified_diff)
        for detail in difference.details:
            output_difference(detail, print_func, css_url, directory, sources)
    except PrintLimitReached:
        logger.debug('print limit reached')
        raise
    finally:
        print_func(u"</div>", force=True)
def path(self):
    if self._path is None:
        logger.debug('unpacking %s', self._name)
        assert self._temp_dir is None
        self._temp_dir = get_temporary_directory()
        self._path = self.container.extract(self._name, self._temp_dir.name)
    return self._path
def wait(self):
    if self._stdin_feeder:
        self._stdin_feeder.join()
    self._stderr_reader.join()
    returncode = self._process.wait()
    logger.debug('done with %s. exit code %d', self.cmdline()[0], returncode)
    return returncode
def run_diff(fd1, fd2, end_nl_q1, end_nl_q2):
    cmd = ['diff', '-aU7', '/dev/fd/%d' % fd1, '/dev/fd/%d' % fd2]
    logger.debug('running %s', cmd)
    if hasattr(os, 'set_inheritable'):
        # new in Python 3.4
        os.set_inheritable(fd1, True)
        os.set_inheritable(fd2, True)
    p = subprocess.Popen(cmd, shell=False, bufsize=1,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT,
                         pass_fds=(fd1, fd2))
    p.stdin.close()
    os.close(fd1)
    os.close(fd2)
    parser = DiffParser(p.stdout, end_nl_q1, end_nl_q2)
    t_read = Thread(target=parser.parse)
    t_read.daemon = True
    t_read.start()
    t_read.join()
    p.wait()
    logger.debug('done with diff, returncode %d, parsed %s',
                 p.returncode, parser.success)
    if not parser.success and p.returncode not in (0, 1):
        raise subprocess.CalledProcessError(p.returncode, cmd,
                                            output=parser.diff)
    if p.returncode == 0:
        return None
    return parser.diff
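# Note on the /dev/fd trick used by run_diff(): diff(1) reads both inputs
# through /dev/fd/N, so the descriptors must survive the fork into the
# child. pass_fds keeps them open in the child process, and
# os.set_inheritable() matters because Python 3.4+ creates file
# descriptors non-inheritable by default (PEP 446).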
def compare_meta(path1, path2):
    logger.debug('compare_meta(%s, %s)', path1, path2)
    differences = []
    try:
        differences.append(Difference.from_command(Stat, path1, path2))
    except RequiredToolNotFound:
        logger.warning("'stat' not found! Is PATH wrong?")
    if os.path.islink(path1) or os.path.islink(path2):
        return [d for d in differences if d is not None]
    try:
        lsattr1 = lsattr(path1)
        lsattr2 = lsattr(path2)
        differences.append(Difference.from_text(lsattr1, lsattr2,
                                                path1, path2,
                                                source='lsattr'))
    except RequiredToolNotFound:
        logger.info("Unable to find 'lsattr'.")
    try:
        differences.append(Difference.from_command(Getfacl, path1, path2))
    except RequiredToolNotFound:
        logger.info("Unable to find 'getfacl'.")
    return [d for d in differences if d is not None]
def recognizes(file):
    size = os.stat(file.path).st_size
    if size < CBFS_HEADER_SIZE or size > CBFS_MAXIMUM_FILE_SIZE:
        return False
    with open(file.path, 'rb') as f:
        # peek at the last 4 bytes, as they should contain the relative
        # offset of the header
        f.seek(-4, io.SEEK_END)
        # <pgeorgi> given the hardware we support so far, it looks like
        #           that field is now bound to be little endian
        #           -- #coreboot, 2015-10-14
        rel_offset = struct.unpack('<i', f.read(4))[0]
        if rel_offset < 0 and -rel_offset > CBFS_HEADER_SIZE \
           and -rel_offset < size:
            f.seek(rel_offset, io.SEEK_END)
            logger.debug('looking for header at offset: %x', f.tell())
            if is_header_valid(f.read(CBFS_HEADER_SIZE), size):
                return True
            elif not file.name.endswith('.rom'):
                return False
            else:
                logger.debug('CBFS relative offset seems wrong, '
                             'scanning whole image')
        f.seek(0, io.SEEK_SET)
        offset = 0
        buf = f.read(CBFS_HEADER_SIZE)
        while len(buf) >= CBFS_HEADER_SIZE:
            if is_header_valid(buf, size, offset):
                return True
            if len(buf) - offset <= CBFS_HEADER_SIZE:
                buf = f.read(32768)
                offset = 0
            else:
                offset += 1
        return False
def extract(self, member_name, dest_dir):
    dest_path = os.path.join(dest_dir, member_name)
    logger.debug('gzip extracting to %s', dest_path)
    with open(dest_path, 'wb') as fp:
        subprocess.check_call(
            ['gzip', '--decompress', '--stdout', self.source.path],
            shell=False, stdout=fp, stderr=None)
    return dest_path
def extract(self, member_name, dest_dir):
    dest_path = os.path.join(dest_dir, member_name)
    logger.debug('dex extracting to %s', dest_path)
    subprocess.check_call(['enjarify', '-o', dest_path, self.source.path],
                          shell=False, stderr=None, stdout=subprocess.PIPE)
    return dest_path
def extract(self, member_name, dest_dir):
    dest_path = os.path.join(dest_dir, member_name)
    logger.debug('xz extracting to %s', dest_path)
    with open(dest_path, 'wb') as fp:
        subprocess.check_call(
            ['xz', '--decompress', '--stdout', self.source.path],
            shell=False, stdout=fp, stderr=None)
    return dest_path
def md5sums(self):
    if not hasattr(self, '_md5sums'):
        md5sums_file = (self.as_container.control_tar
                        .as_container.lookup_file('./md5sums'))
        if md5sums_file:
            self._md5sums = md5sums_file.parse()
        else:
            logger.debug('Unable to find an md5sums file')
            self._md5sums = {}
    return self._md5sums
def get_reverse(self):
    if self._unified_diff is None:
        unified_diff = None
    else:
        unified_diff = reverse_unified_diff(self._unified_diff)
    logger.debug('reverse orig %s %s', self._source1, self._source2)
    difference = Difference(unified_diff, None, None,
                            source=[self._source2, self._source1],
                            comment=self._comments)
    difference.add_details([d.get_reverse() for d in self._details])
    return difference
def as_container(self):
    if not hasattr(self.__class__, 'CONTAINER_CLASS'):
        if hasattr(self, '_other_file'):
            return self._other_file.__class__.CONTAINER_CLASS(self)
        return None
    if not hasattr(self, '_as_container'):
        logger.debug('instantiating %s for %s',
                     self.__class__.CONTAINER_CLASS, self)
        self._as_container = self.__class__.CONTAINER_CLASS(self)
    logger.debug('returning a %s for %s', self._as_container.__class__, self)
    return self._as_container
def _should_skip_section(name, type):
    for cmd in READELF_COMMANDS:
        if cmd.should_skip_section(name, type):
            logger.debug('skipping section %s, covered by %s', name, cmd)
            return True
    if name.startswith('.debug') or name.startswith('.zdebug'):
        # section .debug_str looks much nicer with `readelf --string-dump`;
        # the rest is handled by READELF_DEBUG_DUMP_COMMANDS
        return not name.endswith('_str')
    return False
def extract(self, member_name, dest_dir):
    dest_path = os.path.join(dest_dir, member_name)
    logger.debug('rust-object extracting to %s', dest_path)
    # See librustc_trans/back/link.rs for details of this format
    with open(dest_path, 'wb') as fpw, open(self.source.path, 'rb') as fpr:
        raw_deflate = fpr.read()[RLIB_BYTECODE_OBJECT_V1_DATA_OFFSET:]
        # decompressobj() tolerates the (non-existent) checksum; a plain
        # zlib.decompress() would raise an error
        raw_inflate = zlib.decompressobj().decompress(
            ZLIB_DEFAULT_COMPRESSION + raw_deflate)
        fpw.write(raw_inflate)
    return dest_path
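# A minimal sketch of why decompressobj() is used above: it returns
# whatever it managed to inflate from a zlib stream whose adler32 trailer
# is missing, whereas plain zlib.decompress() insists on a complete
# stream. Illustrative only.
def _inflate_without_checksum_demo():
    import zlib
    complete = zlib.compress(b'hello, world')
    truncated = complete[:-4]  # drop the 4-byte adler32 checksum
    # zlib.decompress(truncated) would raise zlib.error ("incomplete or
    # truncated stream"); the streaming decompressor still yields the data.
    return zlib.decompressobj().decompress(truncated)  # b'hello, world'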
def get_build_id(path):
    try:
        output = subprocess.check_output(['readelf', '--notes', path])
    except subprocess.CalledProcessError as e:
        logger.debug('Unable to get Build ID for %s: %s', path, e)
        return None
    m = re.search(r'^\s+Build ID: ([0-9a-f]+)$',
                  output.decode('utf-8'),
                  flags=re.MULTILINE)
    if not m:
        return None
    return m.group(1)
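# A quick self-contained check of the Build ID regex against the rough
# shape of `readelf --notes` output (the hex digest here is made up):
def _build_id_regex_demo():
    import re
    sample = (
        'Displaying notes found in: .note.gnu.build-id\n'
        '  Owner                 Data size       Description\n'
        '  GNU                  0x00000014       NT_GNU_BUILD_ID\n'
        '    Build ID: 0123456789abcdef0123456789abcdef01234567\n'
    )
    m = re.search(r'^\s+Build ID: ([0-9a-f]+)$', sample, flags=re.MULTILINE)
    return m.group(1) if m else None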
def md5sums(self):
    if not hasattr(self, '_md5sums'):
        md5sums_file = self.as_container.lookup_file(
            'control.tar.gz', 'control.tar', './md5sums')
        if md5sums_file:
            self._md5sums = md5sums_file.parse()
        else:
            logger.debug('Unable to find an md5sums file')
            self._md5sums = {}
    return self._md5sums
def get_content(self):
    logger.debug('%s get_content; path %s', self, self._path)
    if self._path is not None:
        yield
    else:
        with make_temp_directory() as temp_dir, \
             self._container.open() as container:
            self._path = container.extract(self._name, temp_dir)
            yield
            self._path = None
def get_debug_link(path):
    try:
        output = subprocess.check_output(
            ['readelf', '--string-dump=.gnu_debuglink', path])
    except subprocess.CalledProcessError as e:
        logger.debug('Unable to get debug link for %s: %s', path, e)
        return None
    m = re.search(r'^\s+\[\s+0\]\s+(\S+)$',
                  output.decode('utf-8', errors='replace'),
                  flags=re.MULTILINE)
    if not m:
        return None
    return m.group(1)
def compare(self, other, source=None):
    # Now that comparators are all object-oriented, we don't have any clue
    # how to perform a meaningful comparison right here, so we do the
    # comparison backward (where knowledge of the file format lies) and
    # then reverse it.
    if isinstance(other, NonExistingFile):
        return Difference(None, self.name, other.name,
                          comment='Trying to compare two non-existing files.')
    logger.debug('Performing backward comparison')
    backward_diff = other.compare(self, source)
    if not backward_diff:
        return None
    return backward_diff.get_reverse()
def validate_checksums(self, check_hash="sha1"):
    """
    Validate checksums for a package, using ``check_hash``'s type
    to validate the package.

    Valid ``check_hash`` types:
        * sha1
        * sha256
        * md5
        * md5sum
    """
    logger.debug('validating %s checksums', check_hash)
    for filename in self.get_files():
        if check_hash == "sha1":
            hash_type = hashlib.sha1()
            checksums = self.get("Checksums-Sha1")
            field_name = "sha1"
        elif check_hash == "sha256":
            hash_type = hashlib.sha256()
            checksums = self.get("Checksums-Sha256")
            field_name = "sha256"
        elif check_hash == "md5":
            hash_type = hashlib.md5()
            checksums = self.get("Files")
            field_name = "md5sum"
        changed_files = None  # appease pylint
        for changed_files in checksums:
            if changed_files['name'] == os.path.basename(filename):
                break
        else:
            raise AssertionError(
                'get_files() returned a file not listed in Files')
        with open(os.path.join(self._directory, filename), "rb") as fc:
            while True:
                chunk = fc.read(131072)
                if not chunk:
                    break
                hash_type.update(chunk)
        if hash_type.hexdigest() != changed_files[field_name]:
            raise ChangesFileException(
                "Checksum mismatch for file %s: %s != %s" % (
                    filename,
                    hash_type.hexdigest(),
                    changed_files[field_name]
                ))
        else:
            logger.debug("%s checksum for file %s matches",
                         field_name, filename)
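# The chunked-read hashing pattern from validate_checksums(), in
# isolation: hashing in 128 KiB pieces keeps memory usage flat regardless
# of file size. The path argument is illustrative.
def _chunked_sha1(path):
    import hashlib
    h = hashlib.sha1()
    with open(path, 'rb') as fc:
        while True:
            chunk = fc.read(131072)
            if not chunk:
                break
            h.update(chunk)
    return h.hexdigest()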
def parse(self):
    try:
        md5sums = {}
        with open(self.path, 'r', encoding='utf-8') as f:
            for line in f:
                md5sum, path = re.split(r'\s+', line.strip(), maxsplit=1)
                md5sums['./%s' % path] = md5sum
        return md5sums
    except (UnicodeDecodeError, ValueError):
        logger.debug('Malformed md5sums, ignoring.')
        return {}
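# A sketch of the md5sums line format handled by parse() above: Debian
# md5sums files contain "<hex digest><whitespace><relative path>" per
# line. The digests and paths here are made up for illustration.
def _md5sums_parse_demo():
    import re
    sample = (
        'd41d8cd98f00b204e9800998ecf8427e  usr/bin/example\n'
        '900150983cd24fb0d6963f7d28e17f72  usr/share/doc/example/copyright\n'
    )
    md5sums = {}
    for line in sample.splitlines():
        md5sum, path = re.split(r'\s+', line.strip(), maxsplit=1)
        md5sums['./%s' % path] = md5sum
    return md5sums  # {'./usr/bin/example': 'd41d8cd9...', ...}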
def compare(self, other, source=None):
    differences = super().compare(other, source)
    details = None
    try:
        details = Difference.from_command(Pstotext, self.path, other.path)
    except RequiredToolNotFound:
        logger.debug('ps2ascii not found')
    if details:
        differences.add_details([details])
    return differences
def has_same_content_as(self, other):
    logger.debug('%s has_same_content %s', self, other)
    # try comparing small files directly first
    my_size = os.path.getsize(self.path)
    other_size = os.path.getsize(other.path)
    if my_size == other_size and my_size <= SMALL_FILE_THRESHOLD:
        if open(self.path, 'rb').read() == open(other.path, 'rb').read():
            return True
    return 0 == subprocess.call(['cmp', '-s', self.path, other.path],
                                shell=False, close_fds=True)
def output_html(difference, css_url=None, print_func=None):
    if print_func is None:
        print_func = print
    print_func = create_limited_print_func(print_func,
                                           Config.general.max_report_size)
    try:
        output_header(css_url, print_func)
        output_difference(difference, print_func, [])
    except PrintLimitReached:
        logger.debug('print limit reached')
        print_func(u"<div class='error'>Max output size reached.</div>",
                   force=True)
    print_func(FOOTER % {'version': VERSION}, force=True)
def extract(self, member_name, dest_dir):
    dest_path = os.path.join(dest_dir, os.path.basename(member_name))
    cmd = ['cbfstool', self.source.path, 'extract',
           '-n', member_name, '-f', dest_path]
    logger.debug('cbfstool extract %s to %s', member_name, dest_path)
    subprocess.check_call(cmd, shell=False, stdout=subprocess.PIPE,
                          stderr=subprocess.DEVNULL)
    return dest_path
def extract(self, member_name, dest_dir):
    dest_path = os.path.join(dest_dir, os.path.basename(member_name))
    logger.debug('libarchive extracting %s to %s', member_name, dest_path)
    with libarchive.file_reader(self.source.path) as archive:
        for entry in archive:
            if entry.pathname == member_name:
                logger.debug('entry found, writing %s', dest_path)
                with open(dest_path, 'wb') as f:
                    for buf in entry.get_blocks():
                        f.write(buf)
                return dest_path
    raise KeyError('%s not found in archive' % member_name)
def output_html_directory(directory, difference, css_url=None, jquery_url=None):
    """
    Multi-file presenter. Writes to a directory, and puts large diff
    tables into files of their own.

    This uses jQuery. By default it uses
    /usr/share/javascript/jquery/jquery.js (symlinked, so that you can
    still share the result over HTTP). You can also pass --jquery URL to
    diffoscope to use a central jQuery copy.
    """
    if not os.path.exists(directory):
        os.makedirs(directory)
    if not jquery_url:
        jquery_symlink = os.path.join(directory, "jquery.js")
        if os.path.exists(jquery_symlink):
            jquery_url = "./jquery.js"
        else:
            if os.path.lexists(jquery_symlink):
                os.unlink(jquery_symlink)
            for path in JQUERY_SYSTEM_LOCATIONS:
                if os.path.exists(path):
                    os.symlink(path, jquery_symlink)
                    jquery_url = "./jquery.js"
                    break
            if not jquery_url:
                logger.warning('--jquery was not specified and jQuery was '
                               'not found in any known location. Disabling '
                               'on-demand inline loading.')
                logger.debug('Locations searched: %s',
                             ', '.join(JQUERY_SYSTEM_LOCATIONS))
    if jquery_url == 'disable':
        jquery_url = None
    with file_printer(directory, "index.html") as print_func:
        print_func = create_limited_print_func(print_func,
                                               Config.general.max_report_size)
        try:
            output_header(css_url, print_func)
            output_difference(difference, print_func, css_url, directory, [])
        except PrintLimitReached:
            logger.debug('print limit reached')
            print_func(u"<div class='error'>Max output size reached.</div>",
                       force=True)
        if jquery_url:
            print_func(SCRIPTS % {'jquery_url': escape(jquery_url)},
                       force=True)
        output_footer(print_func)
def compare_files(file1, file2, source=None):
    logger.debug('compare files %s and %s', file1, file2)
    if file1.has_same_content_as(file2):
        logger.debug('same content, skipping')
        return None
    specialize(file1)
    specialize(file2)
    if isinstance(file1, NonExistingFile):
        file1.other_file = file2
    elif isinstance(file2, NonExistingFile):
        file2.other_file = file1
    elif file1.__class__.__name__ != file2.__class__.__name__:
        return file1.compare_bytes(file2, source)
    return file1.compare(file2, source)
def comparisons(self, other):
    if self.source:
        my_md5sums = self.source.container.source.container.source.md5sums
    else:
        my_md5sums = {}
    if other.source:
        other_md5sums = other.source.container.source.container.source.md5sums
    else:
        other_md5sums = {}
    for my_member, other_member, comment in super().comparisons(other):
        # distinct fallback values ('my' vs. 'other') ensure that a member
        # missing from either md5sums mapping never compares as equal
        if (my_member.name == other_member.name and
                my_md5sums.get(my_member.name, 'my') ==
                other_md5sums.get(other_member.name, 'other')):
            logger.debug('Skip %s: identical md5sum', my_member.name)
            continue
        yield my_member, other_member, comment
def output_html(difference, css_url=None, print_func=None):
    """
    Default presenter, all in one HTML file
    """
    if print_func is None:
        print_func = print
    print_func = create_limited_print_func(print_func,
                                           Config.general.max_report_size)
    try:
        output_header(css_url, print_func)
        output_difference(difference, print_func, css_url, None, [])
    except PrintLimitReached:
        logger.debug('print limit reached')
        print_func(u"<div class='error'>Max output size reached.</div>",
                   force=True)
    output_footer(print_func)
def lookup_file(self, *names):
    """Try to fetch a specific file by digging in containers."""
    name, remaining = names[0], names[1:]
    try:
        file = self.get_member(name)
    except KeyError:
        return None
    logger.debug('lookup_file(%s) -> %s', names, file)
    diffoscope.comparators.specialize(file)
    if not remaining:
        return file
    container = file.as_container
    if not container:
        return None
    return container.lookup_file(*remaining)
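# lookup_file() digs through nested containers one member name at a time;
# for example, the .deb md5sums accessor above reaches the md5sums member
# inside control.tar inside control.tar.gz with:
#     self.as_container.lookup_file('control.tar.gz', 'control.tar',
#                                   './md5sums')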
def extract(self, member_name, dest_dir):
    dest_name = os.path.basename(member_name)
    if not dest_name:
        raise ValueError('member_name should not be a directory')
    dest_path = os.path.join(dest_dir, dest_name)
    logger.debug('libarchive extracting %s to %s', member_name, dest_path)
    with libarchive.file_reader(self.source.path) as archive:
        for entry in archive:
            if entry.pathname == member_name:
                logger.debug('entry found, writing %s', dest_path)
                with open(dest_path, 'wb') as f:
                    for buf in entry.get_blocks():
                        f.write(buf)
                return dest_path
    raise KeyError('%s not found in archive' % member_name)
def output_unified_diff(print_func, css_url, directory, unified_diff):
    if directory and len(unified_diff) > Config.general.separate_file_diff_size:
        # open a new file for this table
        filename = '%s.html' % hashlib.md5(
            unified_diff.encode('utf-8')).hexdigest()
        logger.debug('separate html output for diff of size %d',
                     len(unified_diff))
        with file_printer(directory, filename) as new_print_func:
            output_header(css_url, new_print_func)
            output_unified_diff_table(new_print_func, unified_diff)
            output_footer(new_print_func)
        print_func("<div class='ondemand'>\n")
        print_func("... <a href='%s'>load diff</a> ...\n" % escape(filename))
        print_func("</div>\n")
    else:
        output_unified_diff_table(print_func, unified_diff)
def has_same_content_as(self, other):
    logger.debug('%s has_same_content %s', self, other)
    # try comparing small files directly first
    try:
        my_size = os.path.getsize(self.path)
        other_size = os.path.getsize(other.path)
    except OSError:
        # files not readable (e.g. broken symlinks) or something else,
        # just assume they are different
        return False
    if my_size == other_size and my_size <= SMALL_FILE_THRESHOLD:
        if open(self.path, 'rb').read() == open(other.path, 'rb').read():
            return True
    return 0 == subprocess.call(['cmp', '-s', self.path, other.path],
                                shell=False, close_fds=True)
def filter(self, line):
    if not self._encoding:
        self._header.write(line)
        if line == b'\n':
            logger.debug("unable to determine PO encoding, "
                         "let's hope it's utf-8")
            self._encoding = 'utf-8'
            return self._header.getvalue()
        found = Msgunfmt.CHARSET_RE.match(line)
        if found:
            self._encoding = found.group(1).decode('us-ascii').lower()
            return self._header.getvalue().decode(
                self._encoding).encode('utf-8')
        return b''
    if self._encoding != 'utf-8':
        return line.decode(self._encoding).encode('utf-8')
    return line
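# Context for the filter above: msgunfmt emits the PO header first, and
# its Content-Type line carries the catalog encoding (e.g.
# "charset=ISO-8859-1"). CHARSET_RE, defined elsewhere on Msgunfmt, is
# presumably what extracts that charset so the remainder of the stream
# can be transcoded to UTF-8; if the header ends without declaring one,
# UTF-8 is assumed.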
def compare(self, other, source=None):
    if other.path is None:
        return None
    try:
        my_md5sums = Md5sumsFile.parse_md5sums(self.path)
        other_md5sums = Md5sumsFile.parse_md5sums(other.path)
        same = set()
        for path in my_md5sums.keys() & other_md5sums.keys():
            if my_md5sums[path] == other_md5sums[path]:
                same.add('./%s' % path)
        package = self.container.source.container.source.container.source
        package.set_files_with_same_content_in_data(same)
        logger.debug('Identified %d files as identical in data archive',
                     len(same))
        return Difference(None, self.path, other.path, source='md5sums',
                          comment='Files in package differ')
    except ValueError as e:
        difference = self.compare_bytes(other)
        difference.add_comment('Malformed md5sums file: %s' % e)
        return difference