def _claim_go_executables(self):
    """Claim executables identified by goversion."""
    not_container_msg = 'Skipping archive {0} since it\'s not a container image'
    archives = self.read_metadata_file(self.ARCHIVE_FILE)
    for index, archive in enumerate(archives):
        if not self.is_container_archive(archive):
            log.debug(not_container_msg.format(archive['id']))
            continue

        layer_dir = os.path.join(self.input_dir,
                                 self.UNPACKED_CONTAINER_LAYER_DIR,
                                 archive['filename'])
        cmd = [self.GOVERSION, '.']
        log.info(f'Running {cmd}')
        gv = subprocess.Popen(cmd,
                              cwd=layer_dir,
                              universal_newlines=True,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        (stdout, stderr) = gv.communicate()
        returncode = gv.wait()
        if returncode:
            raise RuntimeError(
                f'The command "{" ".join(cmd)}" failed with: {stderr}')

        for line in stdout.splitlines():
            path, _ = line.split(' ', 1)
            log.info(
                f'(archive {index + 1}/{len(archives)}) Claiming {path}')
            self.claim_container_file(archive, path)

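# For reference, the parsing above assumes goversion emits one line per Go
# executable of the form "<path> <details>", with paths relative to the layer
# directory since the command runs with cwd=layer_dir. Illustrative, made-up
# output:
#
#   usr/bin/example go1.12.5
#   usr/libexec/other go1.11
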
def local_lookup(self, loose_artifact):
    """
    Look up the given file locally to see if we already know about it.

    Uses the sha256 checksum to make that determination.

    :param str loose_artifact: The full path to the file in question.
    :raises FileNotFoundError: if the file could not be found to checksum.
    :return: The Artifact that we discovered with a local lookup, or None.
    :rtype: Artifact or None
    """
    sha256_checksum = self.checksum(loose_artifact)
    try:
        checksum_node = content.Checksum.nodes.first(
            checksum=sha256_checksum)
    except content.Checksum.DoesNotExist:
        return None

    # According to the schema a checksum can be associated with multiple
    # Artifacts, but according to reality that doesn't make much sense.
    # Just return the first one.
    artifacts = checksum_node.artifacts.all()
    if artifacts:
        log.info(f'Artifact already in database: {loose_artifact}')
        return artifacts[0]
    return None

def run(self):
    """
    Start the container RPM analyzer.

    :raises AnalysisFailure: if the analyzer completed with errors
    """
    build_info = self.read_metadata_file(self.BUILD_FILE)
    build_id = build_info['id']
    if build_info['type'] != self.CONTAINER_BUILD_TYPE:
        log.info(
            f'Skipping build {build_id} because the build is not a container')
        return

    # Create a mapping of arch to archive (container image) so we can easily
    # map to the parent container archives in a future loop
    arch_to_archive = {}
    not_container_msg = 'Skipping archive {0} since it\'s not a container image'
    for archive in self.read_metadata_file(self.ARCHIVE_FILE):
        if not self.is_container_archive(archive):
            log.debug(not_container_msg.format(archive['id']))
            continue

        arch = archive['extra']['image']['arch']
        if arch in arch_to_archive:
            log.error(
                f'Build {build_id} has more than one container image with '
                f'the arch {arch}')
            continue

        arch_to_archive[arch] = archive

    parent_build_id = build_info['extra']['image'].get('parent_build_id')
    # If there is a parent to this image, then only get the RPMs installed in
    # this layer and mark them as embedded artifacts on this container image
    if parent_build_id is not None:
        # Find the RPMs installed in this layer versus the parent image
        for archive in self.koji_session.listArchives(parent_build_id):
            if not self.is_container_archive(archive):
                log.debug(not_container_msg.format(archive['id']))
                continue

            arch = archive['extra']['image']['arch']
            if arch not in arch_to_archive:
                log.debug(
                    f'The parent build {parent_build_id} contains an extra '
                    f'arch of {arch}')
                continue

            rpms = self._get_rpms_diff(archive['id'],
                                       arch_to_archive[arch]['id'])
            self._process_embedded_rpms(arch_to_archive[arch], rpms)
    # If there is no parent, then this is a base image. Just get all the RPMs
    # installed in the image and mark them as embedded artifacts in this
    # container image.
    else:
        image_rpm_file = self.read_metadata_file(self.IMAGE_RPM_FILE)
        for archive in arch_to_archive.values():
            rpms = image_rpm_file.get(str(archive['id']))
            self._process_embedded_rpms(archive, rpms)

    # Claim all files from installed RPMs.
    self._claim_rpm_files(arch_to_archive.values())

def unpack_zip(zip_file, output_dir):  # pragma: no cover
    """
    Unpack a ZIP-like archive file to the specified directory.

    :param str zip_file: the path to the archive file to unpack
    :param str output_dir: the path to unpack the archive to
    """
    with zipfile.ZipFile(zip_file) as zip_:
        zip_.extractall(output_dir)
    log.info(f'Successfully unpacked {zip_file} to {output_dir}')

def unpack_tar(tar_file, output_dir):  # pragma: no cover
    """
    Unpack a TAR-like archive file to the specified directory.

    :param str tar_file: the path to the archive file to unpack
    :param str output_dir: the path to unpack the archive to
    """
    with tarfile.open(tar_file) as tar:
        tar.extractall(output_dir)
    log.info(f'Successfully unpacked {tar_file} to {output_dir}')

def unpack_artifacts(artifacts, output_dir):
    """
    Unpack a list of artifacts to the specified directory.

    :param list artifacts: a list of paths to artifacts to unpack
    :param str output_dir: a path to a directory to unpack the artifacts
    """
    if output_dir and not os.path.isdir(output_dir):
        raise RuntimeError(
            f'The passed in directory of "{output_dir}" does not exist')

    for artifact in artifacts:
        if not os.path.isfile(artifact):
            raise RuntimeError(f'The artifact "{artifact}" could not be found')

        artifact_filename = os.path.split(artifact)[-1]
        log.info(f'Unpacking {artifact_filename}')
        if (artifact_filename.startswith('docker-image')
                and artifact_filename.endswith('.tar.gz')):
            output_subdir = os.path.join(output_dir, 'container_layer',
                                         artifact_filename)
            os.makedirs(output_subdir)
            unpack_container_image(artifact, output_subdir)
        elif artifact_filename.endswith('.rpm'):
            output_subdir = os.path.join(output_dir, 'rpm', artifact_filename)
            os.makedirs(output_subdir)
            unpack_rpm(artifact, output_subdir)
        elif zipfile.is_zipfile(artifact):
            output_subdir = os.path.join(output_dir, 'non-rpm',
                                         artifact_filename)
            os.makedirs(output_subdir)
            unpack_zip(artifact, output_subdir)
        elif tarfile.is_tarfile(artifact):
            output_subdir = os.path.join(output_dir, 'non-rpm',
                                         artifact_filename)
            os.makedirs(output_subdir)
            unpack_tar(artifact, output_subdir)
        else:
            # Files such as .pom do not need to be unpacked; others such as
            # .gem are not yet supported.
            log.info(
                f'Skipping unpacking (unsupported archive type or not an '
                f'archive): {artifact}')
            continue

def download_source(build_info, output_dir, sources_cmd=None):
    """
    Download the source (from dist-git) that was used in the specified build.

    :param dict build_info: build information from koji.getBuild()
    :param str output_dir: the path to download the source to
    :param list sources_cmd: command to run to download source artifacts, or
        None for the default (['rhpkg', '--user=1001', 'sources'])
    """
    if sources_cmd is None:
        sources_cmd = ['rhpkg', '--user=1001', 'sources']

    # Make sure the commands we'll run are installed
    assert_command('git')
    assert_command(sources_cmd[0])

    url, commit_id = parse_source_url(build_info['source'])

    log.info(f'Cloning source for {build_info["id"]}')
    cmd = ['git', 'clone', url, output_dir]
    process = subprocess.Popen(cmd,
                               stdout=subprocess.DEVNULL,
                               stderr=subprocess.PIPE)
    _, error_output = process.communicate()
    error_output = error_output.decode('utf-8')
    if process.returncode != 0:
        raise RuntimeError(
            f'The command "{" ".join(cmd)}" failed with: {error_output}')

    cmd = ['git', 'reset', '--hard', commit_id]
    process = subprocess.Popen(cmd,
                               cwd=output_dir,
                               stdout=subprocess.DEVNULL,
                               stderr=subprocess.PIPE)
    _, error_output = process.communicate()
    error_output = error_output.decode('utf-8')
    if process.returncode != 0:
        if 'Could not parse object' in error_output:
            raise BuildSourceNotFound(
                f'Commit {commit_id} was not found in {url} in build '
                f'{build_info["id"]}')
        raise RuntimeError(
            f'The command "{" ".join(cmd)}" failed with: {error_output}')

    log.info(f'Downloading sources for {build_info["id"]}')
    process = subprocess.Popen(sources_cmd,
                               cwd=output_dir,
                               stdout=subprocess.DEVNULL,
                               stderr=subprocess.PIPE)
    _, error_output = process.communicate()
    error_output = error_output.decode('utf-8')
    if process.returncode != 0:
        raise RuntimeError(
            f'The command "{" ".join(sources_cmd)}" failed with: '
            f'{error_output}')

def download_build(build_info, output_dir):
    """
    Download the artifacts associated with a Koji build.

    :param dict build_info: the build information from koji
    :param str output_dir: the path to download the archives to
    :return: a list of downloaded artifacts
    :rtype: list
    """
    # Make sure the Koji command is installed
    assert_command('koji')
    if not os.path.isdir(output_dir):
        raise RuntimeError(
            f'The passed in directory of "{output_dir}" does not exist')
    if not build_info:
        raise RuntimeError('The Koji build cannot be None')

    # There's no API for this, so it's better to just call the CLI directly
    cmd = ['koji', '--profile', config.koji_profile, 'download-build',
           str(build_info['id'])]
    # Because builds may contain artifacts of different types (e.g. RPMs as
    # well as JARs), cycle through all types of artifacts: RPMs (default),
    # Maven archives (--type maven), and container images (--type image);
    # purposefully ignoring Windows builds for now (--type win).
    build_type_opts = ([], ['--type', 'maven'], ['--type', 'image'])
    log.info(f'Downloading build {build_info["id"]} from Koji')
    download_prefix = 'Downloading: '
    artifacts = []
    for build_type in build_type_opts:
        download_cmd = cmd + build_type
        p = subprocess.Popen(download_cmd, cwd=output_dir,
                             stdout=subprocess.PIPE)
        # For some reason, any errors are streamed to stdout and not stderr
        output, _ = p.communicate()
        output = output.decode('utf-8')
        if p.returncode != 0:
            if 'No' in output and 'available' in output:
                continue
            raise RuntimeError(
                f'The command "{" ".join(download_cmd)}" failed with: '
                f'{output}')

        for line in output.strip().split('\n'):
            if line.startswith(download_prefix):
                file_path = os.path.join(
                    output_dir, line.split(download_prefix)[-1].lstrip('/'))
                artifacts.append(file_path)
                log.info(f'Downloaded {os.path.split(file_path)[-1]}')

    return artifacts

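# A minimal usage sketch chaining download and unpack. The build ID and paths
# are hypothetical, and a configured Koji session and profile are assumed:
#
#   build_info = koji_session.getBuild(123456)
#   artifacts = download_build(build_info, '/tmp/analysis/output')
#   unpack_artifacts(artifacts, '/tmp/analysis/unpacked')
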
def unpack_rpm(rpm_file, output_dir):
    """
    Unpack the RPM file to the specified directory.

    :param str rpm_file: the path to the RPM to unpack
    :param str output_dir: the path to unpack the RPM to
    """
    assert_command('rpm2cpio')
    assert_command('cpio')
    # Get the CPIO file
    cpio_file = _rpm_to_cpio(rpm_file)
    # Unpack the CPIO file
    _unpack_cpio(cpio_file, output_dir)
    log.info(
        f'Successfully unpacked {os.path.split(rpm_file)[-1]} to {output_dir}')

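# The `_rpm_to_cpio` and `_unpack_cpio` helpers used above are not shown in
# this excerpt. Below is a minimal sketch of what they could look like, for
# illustration only: it assumes rpm2cpio streams a CPIO archive to stdout and
# that `cpio -idm` unpacks one from stdin. The signatures are inferred from
# the call sites above, not taken from the real implementation.
def _rpm_to_cpio(rpm_file):
    """Convert the RPM to a CPIO archive and return its path (sketch)."""
    cpio_file = rpm_file + '.cpio'
    with open(cpio_file, 'wb') as cpio:
        # rpm2cpio writes the RPM's CPIO payload to stdout
        subprocess.run(['rpm2cpio', rpm_file], stdout=cpio, check=True)
    return cpio_file


def _unpack_cpio(cpio_file, output_dir):
    """Extract the contents of a CPIO archive into output_dir (sketch)."""
    with open(cpio_file, 'rb') as cpio:
        # -i extract, -d create directories, -m preserve modification times
        subprocess.run(['cpio', '-idm', '--quiet'], stdin=cpio,
                       cwd=output_dir, check=True)
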
def run(self):
    """
    Start the post analyzer.

    :raises AnalysisFailure: if the analyzer completed with errors
    """
    build_info = self.read_metadata_file(self.BUILD_FILE)
    build_id = build_info['id']
    if build_info['type'] != self.CONTAINER_BUILD_TYPE:
        # Post analysis consists of recording unknown files, which only makes
        # sense for container builds. RPM or maven builds will not include
        # any unidentified files.
        log.info(
            f'Skipping build {build_id} because the build is not a container')
        return

    # Dir of all unpacked container content
    unpacked_container_layer = os.path.join(
        self.input_dir, self.UNPACKED_CONTAINER_LAYER_DIR)
    for archive in os.listdir(unpacked_container_layer):
        path_to_archive = os.path.join(unpacked_container_layer, archive)
        # Assume that the artifact being analyzed was created by the main
        # analyzer.
        archive_obj = content.Artifact.nodes.get(filename=archive)
        for unknown_file in self.walk(path_to_archive):
            path, filename = os.path.split(
                os.path.relpath(unknown_file, path_to_archive))
            if path.startswith(IGNORED_DIRS):
                continue

            log.info(f'Found unknown file in {archive}: /{path}/{filename}')
            unknown_file = content.UnknownFile.get_or_create({
                'checksum': self.checksum(unknown_file),
                'filename': filename,
                'path': '/' + path,  # Add leading root dir
            })[0]
            self.conditional_connect(archive_obj.unknown_files, unknown_file)

def download_source(build_info, output_dir):
    """
    Download the source (from dist-git) that was used in the specified build.

    :param build_info: build information from koji.getBuild()
    :param output_dir: the path to download the source to
    """
    # Make sure the git command is installed
    _assert_command('git')
    source_url = build_info.get('source')
    if not source_url:
        raise RuntimeError(
            f'Build {build_info["id"]} has no associated source URL.')

    log.info(f'Downloading source for {build_info["id"]}')
    url, _, commit_id = source_url.partition('#')
    component = url.split('/')[-1]
    cmd = ['git', 'clone', url]
    process = subprocess.Popen(cmd,
                               cwd=output_dir,
                               stdout=subprocess.DEVNULL,
                               stderr=subprocess.PIPE)
    _, error_output = process.communicate()
    error_output = error_output.decode('utf-8')
    if process.returncode != 0:
        raise RuntimeError(
            f'The command "{" ".join(cmd)}" failed with: {error_output}')

    cmd = ['git', 'reset', '--hard', commit_id]
    process = subprocess.Popen(
        cmd,
        cwd=os.path.join(output_dir, component),
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
    )
    _, error_output = process.communicate()
    error_output = error_output.decode('utf-8')
    if process.returncode != 0:
        raise RuntimeError(
            f'The command "{" ".join(cmd)}" failed with: {error_output}')

def _run_retrodep(self, srcdir, import_path=None, excludes=None, opts=None):
    """
    Run retrodep and return its output.

    :param srcdir: path to source code to examine
    :param str/None import_path: import path for top-level module
    :param list/None excludes: list of globs to ignore
    :param list/None opts: any additional parameters
    :return: output from command
    :rtype: (str, str)
    """
    with tempfile.NamedTemporaryFile(mode='wt') as excludes_file:
        options = ['-debug', '-x', '-template', self.RETRODEP_TEMPLATE]
        if import_path:
            options += ['-importpath', import_path]

        if excludes:
            excludes_file.write(''.join('%s\n' % e for e in excludes))
            excludes_file.flush()
            options += ['-exclude-from', excludes_file.name]

        if opts:
            options += opts

        cmd = [self.RETRODEP] + options + [srcdir]
        log.info(f'Running {cmd}')
        bv = subprocess.Popen(cmd,
                              universal_newlines=True,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        (stdout, stderr) = bv.communicate()
        returncode = bv.wait()
        if returncode:
            raise RuntimeError(
                f'The command "{" ".join(cmd)}" failed with: {stderr}')

        return stdout, stderr

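# A hypothetical call, for illustration only (the import path and exclusion
# globs below are made up):
#
#   stdout, stderr = self._run_retrodep(
#       srcdir, import_path='example.com/foo/bar', excludes=['vendor/*'])
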
def add_to_and_maybe_execute_batch(self, loose_artifact, path_to_archive,
                                   claim=False):
    """
    Add the given file to the Koji multicall batch.

    If the batch is full, execute it and return the resulting Artifacts.
    Otherwise return an empty list.

    :param str loose_artifact: The absolute path to the file in question.
    :param str path_to_archive: The absolute path to the archive we are
        currently exploring.
    :param bool claim: If we should claim the file if we discover an
        artifact. Default False.
    :return: A list of Artifacts created, or an empty list.
    :rtype: list
    """
    if not self.batch:
        # We're at the beginning of a new batch; initialize the Koji
        # multicall session
        self.koji_session.multicall = True

    relative_filepath = os.path.relpath(loose_artifact, path_to_archive)
    # Queue up the Koji calls
    if loose_artifact.endswith('.rpm'):
        rpm = os.path.basename(loose_artifact)
        log.info(f'Looking up RPM in Koji: {loose_artifact}')
        self.koji_session.getRPM(rpm)
    else:
        md5_checksum = self.checksum(loose_artifact, md5)
        log.info(
            f'Looking up archive in Koji: {md5_checksum}, {loose_artifact}')
        self.koji_session.listArchives(checksum=md5_checksum)

    self.batch.append((path_to_archive, relative_filepath))

    if len(self.batch) >= self.KOJI_BATCH_SIZE:
        return self.execute_batch_and_return_artifacts(claim)
    return []

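# The intended calling pattern, sketched from the method above: queue each
# file, collect Artifacts whenever a full batch executes, then flush any
# remainder explicitly (the loose artifact analyzer's run() below is the real
# driver):
#
#   artifacts = []
#   for loose_artifact in files:
#       artifacts += self.add_to_and_maybe_execute_batch(
#           loose_artifact, path_to_archive)
#   artifacts += self.execute_batch_and_return_artifacts()
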
def unpack_container_image(container_image_file, output_dir):
    """
    Unpack a container image to the specified directory.

    :param str container_image_file: the path to the container image file to
        unpack
    :param str output_dir: the path to unpack the container image to
    """
    # Unpack the manifest.json file from which we figure out the latest image
    # layer
    with tarfile.open(container_image_file) as tar:
        manifest_file = tar.extractfile('manifest.json')
        manifest_data = json.loads(manifest_file.read().decode('utf-8'))
        layer_to_unpack = manifest_data[0]['Layers'][-1]
        # Unpack the last layer, which itself is a .tar file
        tar.extract(layer_to_unpack)

    # Extract the file system contents from the last layer
    with tarfile.open(layer_to_unpack) as tar:
        tar.extractall(output_dir)

    # Remove the directory containing the extracted layer .tar file
    shutil.rmtree(os.path.split(layer_to_unpack)[0])
    log.info(f'Successfully unpacked {container_image_file} to {output_dir}')

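# For reference, manifest.json in a docker-save style tarball is a JSON list
# with one entry per image, and "Layers" is ordered base layer first, which is
# why the code above unpacks Layers[-1]. An illustrative (made-up) manifest:
#
#   [
#     {
#       "Config": "0123abcd.json",
#       "RepoTags": ["example/image:latest"],
#       "Layers": ["1111aaaa/layer.tar", "2222bbbb/layer.tar"]
#     }
#   ]
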
def run(self):
    """
    Start the loose RPM analyzer.

    :raises AnalysisFailure: if the analyzer completed with errors
    """
    build_info = self.read_metadata_file(self.BUILD_FILE)
    self.build_id = build_info['id']
    build_type = build_info['type']
    if build_type not in self.SUPPORTED_BUILD_TYPES:
        log.info(
            f'Skipping build {self.build_id} because the build type '
            f'"{build_type}" is not supported')
        return

    self.batch = []

    # Examine the source for embedded artifacts.
    source_path = os.path.join(self.input_dir, self.SOURCE_DIR)
    source_embedded_artifacts = []
    for loose_artifact in self.walk(source_path,
                                    extensions=self.FILE_EXTENSIONS):
        # If we find it locally, don't bother asking Koji about it again.
        artifact = self.local_lookup(loose_artifact)
        if artifact:
            source_embedded_artifacts.append(artifact)
            continue

        for artifact in self.add_to_and_maybe_execute_batch(
                loose_artifact, source_path):
            source_embedded_artifacts.append(artifact)

    # Wrap up any in-progress batch before moving on to the archives.
    for artifact in self.execute_batch_and_return_artifacts():
        source_embedded_artifacts.append(artifact)

    # Now examine the build artifacts.
    for archive, path_to_archive in self.unpacked_archives():
        # Assume that the artifact being analyzed was created by the main
        # analyzer
        original_artifact = content.Artifact.nodes.get(filename=archive)

        # Assume that every artifact found in the source is embedded in
        # every built artifact.
        for source_artifact in source_embedded_artifacts:
            original_artifact.embedded_artifacts.connect(source_artifact)

        for loose_artifact in self.walk(path_to_archive,
                                        extensions=self.FILE_EXTENSIONS):
            relative_filepath = os.path.relpath(loose_artifact,
                                                path_to_archive)
            try:
                artifact = self.local_lookup(loose_artifact)
            except FileNotFoundError:
                # There are two potential causes here, both with symlinks:
                # 1) There is a symlink that points to a file in a different
                #    layer of the container.
                # 2) It was a symlink to something we already analyzed and
                #    claimed.
                #
                # Either way we don't really care. If it's already claimed
                # then we've already established the link to this artifact.
                # If it's referencing something on a different layer of the
                # container then we'll find it when we analyze that build
                # (and that's the layer that needs to be respun anyway, since
                # that's what contains the actual thing). Let's just claim
                # the file and move on.
                log.warning(
                    f'Skipping already-claimed symlink in {archive}: '
                    f'{relative_filepath}')
                self.claim_file(path_to_archive, relative_filepath)
                continue

            # If we find it locally, don't bother asking Koji about it again.
            if artifact:
                self.conditional_connect(
                    original_artifact.embedded_artifacts, artifact)
                self.claim_file(path_to_archive, relative_filepath)
                continue

            # Add the file to the batch of things to process. If this happens
            # to trigger a batch execution, handle the resulting Artifacts.
            for artifact in self.add_to_and_maybe_execute_batch(
                    loose_artifact, path_to_archive, claim=True):
                self.conditional_connect(
                    original_artifact.embedded_artifacts, artifact)

        # Wrap up any in-progress batch before moving on to the next archive.
        for artifact in self.execute_batch_and_return_artifacts(claim=True):
            self.conditional_connect(original_artifact.embedded_artifacts,
                                     artifact)

def run(self):
    """
    Start the container Go analyzer.

    :raises AnalysisFailure: if the analyzer completed with errors
    """
    # Check we have access to the executables we need.
    assert_command(self.RETRODEP)
    assert_command(self.GOVERSION)

    build_info = self.read_metadata_file(self.BUILD_FILE)
    build_id = build_info['id']
    if build_info['type'] != self.CONTAINER_BUILD_TYPE:
        log.info(
            f'Skipping build {build_id} because the build is not a container')
        return

    # This container's build is assumed to exist since it is created by the
    # main analyzer.
    build = content.Build.nodes.get(id_=build_id)
    source_locations = build.source_location.all()
    try:
        source_location = source_locations[0]
    except IndexError:
        msg = f'Missing source location for container build {build_id}'
        log.error(msg)
        raise AnalysisFailure(msg)

    srcdir = os.path.join(self.input_dir, self.SOURCE_DIR)

    # Store the failure messages so they can be returned in an
    # AnalysisFailure exception
    failures = []
    failed_src_exc_msg = 'Failed while processing the source in "{}"'
    failed_src_msg = 'Failed while processing the source in "{}" with "{}"'

    # First process the source code that's directly available in the dist-git
    # repository.
    try:
        self._process_git_source(source_location, srcdir)
    except RuntimeError as error:
        log.exception(failed_src_exc_msg.format(srcdir))
        failures.append(failed_src_msg.format(srcdir, error))

    # Next process source code from archives (from 'rhpkg sources').
    # Look for tar archives and zip archives.
    tar_archives = glob(os.path.join(srcdir, '*.tar.*'))
    zip_archives = glob(os.path.join(srcdir, '*.zip'))
    archives = [(unpack_tar, archive) for archive in tar_archives]
    archives += [(unpack_zip, archive) for archive in zip_archives]

    for unpack, archive in archives:
        with tempfile.TemporaryDirectory() as subsrc:
            unpack(archive, subsrc)
            try:
                self._process_source_code(source_location, subsrc)
            except RuntimeError as error:
                log.exception(failed_src_exc_msg.format(subsrc))
                failures.append(failed_src_msg.format(subsrc, error))

    # Now claim all the Go executables.
    self._claim_go_executables()

    if failures:
        raise AnalysisFailure(
            'GoAnalyzer completed with the following error(s): \n {}'.format(
                '\n '.join(failures)))

def execute_batch_and_return_artifacts(self, claim=False):
    """
    Execute the stored Koji batch and return the Artifacts created.

    :param bool claim: If we should claim the file if we discover an
        artifact. Default False.
    :return: A list of Artifacts created.
    :rtype: list
    """
    ret = []
    if not self.batch:
        return ret  # Gracefully exit early if the batch is empty

    responses = self.koji_session.multiCall()
    # Process the individual responses. Responses are returned in the same
    # order the calls were added, so we can zip them up to pair each response
    # back with its file path.
    for (path_to_archive, relative_filepath), response in zip(self.batch,
                                                              responses):
        archive = os.path.basename(path_to_archive)
        is_rpm = relative_filepath.endswith('.rpm')
        # If Koji could not find it or there was some other error, log it and
        # continue. A response is a dict on error, or a single-element list
        # on success.
        if isinstance(response, dict):
            log.error(
                f'Error received from Koji looking up {relative_filepath} '
                f'embedded in {archive} in build {self.build_id}. Koji error '
                f'{response["faultString"]}')
            continue

        artifact_info = response[0]
        if not artifact_info:
            log.info(
                f'Cannot find build for {relative_filepath} embedded in '
                f'{archive} in build {self.build_id}.')
            continue

        if not is_rpm:
            # listArchives returns a list, whereas getRPM returns a hash
            # directly
            artifact_info = artifact_info[0]

        artifact_build_id = artifact_info.get('build_id')
        if not artifact_build_id:
            log.error(
                f'Empty build found in Koji for {relative_filepath} '
                f'embedded in {archive} in build {self.build_id}')
            continue

        log.info(
            f'Linking discovered embedded artifact {relative_filepath} '
            f'embedded in {archive} in build {self.build_id}')
        artifact_build = content.Build.get_or_create({
            'id_': artifact_build_id,
            'type_': 'build' if is_rpm else artifact_info['btype'],  # TODO bug!
        })[0]
        if is_rpm:
            artifact = self.create_or_update_rpm_artifact_from_rpm_info(
                artifact_info)
        else:
            artifact = self.create_or_update_archive_artifact_from_archive_info(
                artifact_info)

        self.conditional_connect(artifact.build, artifact_build)
        ret.append(artifact)
        if claim:
            self.claim_file(path_to_archive, relative_filepath)

    # Clear the processed batch.
    self.batch = []
    return ret

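# The multiCall() response shapes handled above, illustrated (the values are
# made up; the shapes follow Koji's multicall convention of wrapping each
# successful result in a single-element list):
#
#   fault:                 {'faultCode': 1, 'faultString': '...'}
#   getRPM result:         [{'build_id': 42, 'name': 'foo', ...}]
#   listArchives result:   [[{'build_id': 42, 'btype': 'maven', ...}]]
#   nothing found:         [None] (getRPM) or [[]] (listArchives)
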
def run(self):
    """
    Start the container analyzer.

    :raises AnalysisFailure: if the analyzer completed with errors
    """
    build_info = self.read_metadata_file(self.BUILD_FILE)
    build_id = build_info['id']
    if build_info['type'] != self.CONTAINER_BUILD_TYPE:
        log.info(
            f'Skipping build {build_id} because the build is not a container')
        return

    # If this build has no parent image build, there is nothing to do here.
    parent_build_id = build_info['extra']['image'].get('parent_build_id')
    if parent_build_id is None:
        return

    # This container's build is assumed to exist since it is created by the
    # main analyzer.
    build = content.Build.nodes.get(id_=build_id)

    # Process the parent build and embed all of its artifacts in this build's
    # artifacts.
    arch_to_artifact = self._create_or_update_parent(parent_build_id)

    for archive in build.artifacts.filter(type_='container').all():
        related_archive = arch_to_artifact.get(archive.architecture)
        if not related_archive:
            log.error(
                'no artifact to link to, architecture does not exist in '
                'parent build')
            continue

        archive.embedded_artifacts.connect(related_archive)

    image_info = build_info['extra']['image']
    try:
        parent_image_builds = image_info['parent_image_builds'].values()
        # Process parent builds used as buildroots (those specified in
        # `parent_image_builds` besides the `parent_build_id`). Embed all
        # artifacts of each such parent build as buildroot artifacts of this
        # build's artifacts.
        parent_image_builds_ids = {
            build['id']
            for build in parent_image_builds
            if build['id'] != parent_build_id
        }
    except KeyError:
        # Older builds had different metadata in the extra field.
        parent_image_builds_ids = [image_info['parent_build_id']]

    for buildroot_parent_build_id in parent_image_builds_ids:
        arch_to_artifact = self._create_or_update_parent(
            buildroot_parent_build_id)
        for archive in build.artifacts.filter(type_='container').all():
            related_archive = arch_to_artifact.get(archive.architecture)
            if not related_archive:
                log.error(
                    'no artifact to link to, architecture does not exist in '
                    'parent build')
                continue

            archive.buildroot_artifacts.connect(related_archive)