def _generate_pom_xml(to_solve):
    """
    Create pom.xml with dependencies from to_solve and run 'mvn versions:resolve-ranges',
    which resolves the version ranges (overwrites the pom.xml).

    :param to_solve: {"groupId:artifactId": "version-range"}
    """
    project = etree.Element('project')
    etree.SubElement(project, 'modelVersion').text = '4.0.0'
    etree.SubElement(project, 'groupId').text = 'foo.bar.baz'
    etree.SubElement(project, 'artifactId').text = 'testing'
    etree.SubElement(project, 'version').text = '1.0.0'
    dependencies = etree.SubElement(project, 'dependencies')
    for name, version_range in to_solve.items():
        group_id, artifact_id = name.rstrip(':').split(':')
        dependency = etree.SubElement(dependencies, 'dependency')
        etree.SubElement(dependency, 'groupId').text = group_id
        etree.SubElement(dependency, 'artifactId').text = artifact_id
        etree.SubElement(dependency, 'version').text = version_range
    with open('pom.xml', 'wb') as pom:
        pom.write(etree.tostring(project, xml_declaration=True, pretty_print=True))
    TimedCommand.get_command_output(['mvn', 'versions:resolve-ranges'], graceful=False)

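# Usage sketch (hypothetical coordinates): 'mvn versions:resolve-ranges' rewrites the
# generated pom.xml in place, replacing each range with a concrete version.
# _generate_pom_xml({'org.apache.commons:commons-lang3': '[3.0,3.5)'})
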
def zip_file(file, archive, junk_paths=False):
    command = ['zip', archive, file]
    if junk_paths:
        # Store just the name of a saved file (junk the path), not directory names.
        # By default, zip will store the full path (relative to the current directory).
        command.extend(['--junk-paths'])
    TimedCommand.get_command_output(command)

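# Usage sketch (hypothetical paths): with junk_paths=True the archive stores
# 'report.xml' rather than 'tmp/out/report.xml'.
# zip_file('tmp/out/report.xml', 'reports.zip', junk_paths=True)
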
def archive(self, basename):
    suffix = "tar.gz"
    filename = basename + "." + suffix
    TimedCommand.get_command_output(["git", "archive",
                                     "--format={}".format(suffix),
                                     "--output={}".format(filename),
                                     "HEAD"])
    return filename

def add(self, path):
    """
    add path to index

    :param path: str
    """
    with cwd(self.repo_path):
        TimedCommand.get_command_output(["git", "add", path], graceful=False)

def extract_zip(target, dest, mkdest=False):
    if mkdest:
        try:
            os.mkdir(dest, mode=0o775)
        except FileExistsError:
            pass
    # -o: overwrite existing files without prompting
    TimedCommand.get_command_output(['unzip', '-o', '-d', dest, target])
    # Fix possibly wrong permissions in zip files that would prevent us from deleting files.
    TimedCommand.get_command_output(['chmod', '-R', 'u+rwX,g+rwX', dest])

def commit(self, message='blank'):
    """
    commit git repository

    :param message: str, commit message
    """
    # --git-dir is #$%^&&
    # http://stackoverflow.com/questions/1386291/git-git-dir-not-working-as-expected
    with cwd(self.repo_path):
        TimedCommand.get_command_output(["git", "commit", "-m", message], graceful=False)

def create_git(cls, path):
    """
    initialize a new git repository at path

    :param path: str
    :return: instance of Git()
    """
    cls.config()
    TimedCommand.get_command_output(["git", "init", path], graceful=False)
    return cls(path=path)

def clone(cls, url, path):
    """
    clone repository provided as url to specific path

    :param url: str
    :param path: str
    :return: instance of Git()
    """
    cls.config()
    TimedCommand.get_command_output(["git", "clone", url, path], graceful=False)
    return cls(path=path)

def _use_maven_index_checker(self):
    maven_index_checker_dir = os.getenv('MAVEN_INDEX_CHECKER_PATH')
    target_dir = os.path.join(maven_index_checker_dir, 'target')

    s3 = StoragePool.get_connected_storage('S3MavenIndex')
    self.log.info('Fetching pre-built maven index from S3, if available.')
    s3.retrieve_index_if_exists(target_dir)

    index_range = '{}-{}'.format(self.count.min, self.count.max)
    command = ['java', '-Xmx768m', '-jar', 'maven-index-checker.jar', '-r', index_range]
    with cwd(maven_index_checker_dir):
        output = TimedCommand.get_command_output(command, is_json=True, graceful=False,
                                                 timeout=1200)
        for idx, release in enumerate(output):
            name = '{}:{}'.format(release['groupId'], release['artifactId'])
            version = release['version']
            self.log.info("Scheduling #%d.", self.count.min + idx)
            self.analyses_selinon_flow(name, version)

    # index checker should clean up these dirs in the temp dir after itself, but better be sure
    for mindexerdir in glob.glob(os.path.join(gettempdir(), 'mindexer-ctxcentral-context*')):
        rmtree(mindexerdir)

    self.log.info('Storing pre-built maven index to S3')
    s3.store_index(target_dir)
    central_index_dir = os.path.join(target_dir, 'central-index')
    rmtree(central_index_dir)

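# Shape of the maven-index-checker JSON output, as consumed by the loop above
# (values illustrative):
# [{"groupId": "org.example", "artifactId": "demo", "version": "1.0.0"}, ...]
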
def add_and_commit_everything(self, message="blank"): """ equiv of: git add . git commit -m everything :param message: str, commit message """ # first we need to remove any .git dirs/files from the archive, they could contain # directions that would break adding (e.g. Flask 0.10 contains .git with gitpath # pointing to Mitsuhiko's home dir) TimedCommand.get_command_output(['find', self.repo_path, '-mindepth', '2', '-name', '.git', '-exec', 'rm', '-rf', '{}', ';']) # add everything self.add(self.repo_path) self.commit(message=message)
def worker(path):
    mime = TimedCommand.get_command_output(['file', path, '-b', '-i']).pop()
    self.log.debug("%s mime = %s", path, mime)
    typ = TimedCommand.get_command_output(['file', path, '-b'])
    self.log.debug("%s filetype = %s", path, typ)

    linguist = None
    if 'charset=binary' not in mime:
        linguist = self._parse_linguist(
            TimedCommand.get_command_output(['linguist', path]))
        self.log.debug("%s linguist output = %s", path, linguist)

    results.append({
        "type": typ,
        "output": linguist,
        "path": os.path.relpath(path, cache_path),
    })

def clone(cls, url, path, depth=None, branch=None):
    """
    clone repository provided as url to specific path

    :param url: str
    :param path: str
    :param depth: str
    :param branch: str
    :return: instance of Git()
    """
    cls.config()
    cmd = ["git", "clone", url, path]
    if depth is not None:
        cmd.extend(["--depth", depth])
    if branch is not None:
        cmd.extend(["--branch", branch])
    TimedCommand.get_command_output(cmd, graceful=False)
    return cls(path=path)

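# Usage sketch (hypothetical URL): shallow-clone a single branch; note that depth
# is passed through as a string, per the docstring above.
# repo = Git.clone('https://github.com/user/project.git', '/tmp/project',
#                  depth='1', branch='master')
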
def compute_ssdeep(self, target):
    """ Compute SSdeep piece-wise linear hash of target """
    # 0 : ssdeep header
    # 1 : hash,filename
    data = TimedCommand.get_command_output(['ssdeep', '-c', '-s', target])
    try:
        return data[1].split(',')[0].strip()
    except IndexError:
        self.log.error("unable to compute ssdeep of %r", target)
        raise RuntimeError("can't compute digest of %r" % target)

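# The CSV emitted by `ssdeep -c -s` looks like this (hash value illustrative):
#   ssdeep,1.1--blocksize:hash:hash,filename
#   96:oX2vvh8uXGKRkBq2hK:oXi58uXGWkBqT,"/tmp/target"
# so data[1].split(',')[0] yields just the piece-wise hash.
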
def rev_parse(self, args=None):
    """
    :param args: arguments to pass to `git rev-parse`
    :return: [str], output from `git rev-parse`
    """
    cmd = ["git", "rev-parse"]
    if args:
        cmd.extend(args)
    with cwd(self.repo_path):
        return TimedCommand.get_command_output(cmd, graceful=False)

def _get_snyk_vulndb(self):
    """
    :return: retrieve Snyk CVE db
    """
    with tempdir() as vulndb_dir:
        # clone vulndb git repo
        self.log.debug("Cloning snyk/vulndb repo")
        Git.clone(self._VULNDB_GIT_REPO, vulndb_dir)
        with cwd(vulndb_dir):
            # install dependencies
            self.log.debug("Installing snyk/vulndb dependencies")
            TimedCommand.get_command_output(['npm', 'install'])
            # generate database (json in file)
            self.log.debug("Generating snyk/vulndb")
            TimedCommand.get_command_output([os.path.join('cli', 'shrink.js'),
                                             'data',
                                             self._VULNDB_FILENAME])
            # parse the JSON so we are sure that we have a valid JSON
            with open(self._VULNDB_FILENAME) as f:
                return json.load(f)

def config():
    """
    configure git
    """
    user_name = configuration.git_user_name
    user_email = configuration.git_user_email
    if not TimedCommand.get_command_output(["git", "config", "--get", "user.name"]):
        TimedCommand.get_command_output(["git", "config", "--global", "user.name", user_name])
    if not TimedCommand.get_command_output(["git", "config", "--get", "user.email"]):
        TimedCommand.get_command_output(["git", "config", "--global", "user.email", user_email])

def extract_gem(target, dest):
    """
    extract target gem into $dest/sources and
    gemspec (renamed to rubygems-metadata.yaml) into $dest/metadata/
    """
    sources = os.path.join(dest, 'sources')
    metadata = os.path.join(dest, 'metadata')
    TimedCommand.get_command_output(['mkdir', '-p', sources, metadata])
    TimedCommand.get_command_output(['gem', 'unpack', target, '--target', sources])
    with cwd(metadata):
        # --spec ignores --target, so we need to cwd
        TimedCommand.get_command_output(['gem', 'unpack', target, '--spec'])
        metadatayaml = glob.glob('*.gemspec').pop()
        os.rename(metadatayaml, 'rubygems-metadata.yaml')

def execute(self, arguments): """ task code :param arguments: dictionary with arguments :return: {}, results """ self._strict_assert(arguments.get('ecosystem')) self._strict_assert(arguments.get('name')) self._strict_assert(arguments.get('version')) try: cache_path = ObjectCache.get_from_dict(arguments).get_sources() except Exception as e: eco = arguments.get('ecosystem') pkg = arguments.get('name') ver = arguments.get('version') if arguments['ecosystem'] != 'maven': self.log.error( 'Could not get sources for package {e}/{p}/{v}'.format( e=eco, p=pkg, v=ver)) raise self.log.info('Could not get sources for maven package {p}/{v},' 'will try to run on binary jar'.format(p=pkg, v=ver)) cache_path = ObjectCache.get_from_dict( arguments).get_extracted_source_tarball() result_data = {'status': 'unknown', 'summary': {}, 'details': {}} try: result_data['details'] = TimedCommand.get_command_output( ['license_check.py', cache_path], graceful=False, is_json=True) result_data['status'] = result_data['details'].pop('status') result_data['summary'] = result_data['details'].pop('summary') except: self.log.exception("License scan failed") result_data['status'] = 'error' return result_data
def execute(self, arguments):
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()
    results = {'status': 'unknown', 'summary': {}, 'details': []}
    try:
        oscc = TimedCommand.get_command_output(['oscryptocatcher', '--subdir-in-result',
                                                cache_path],
                                               graceful=False, is_json=True)
        self.log.debug("oscryptocatcher %s output: %s", cache_path, oscc)
        results['details'] = oscc['details']
        results['summary'] = oscc['summary']
        results['status'] = 'success'
    except Exception:
        results['status'] = 'error'

    return results

def execute(self, arguments):
    self._strict_assert(arguments.get('ecosystem'))
    self._strict_assert(arguments.get('name'))
    self._strict_assert(arguments.get('version'))

    try:
        cache_path = ObjectCache.get_from_dict(arguments).get_sources()
    except Exception:
        eco = arguments.get('ecosystem')
        pkg = arguments.get('name')
        ver = arguments.get('version')
        if arguments['ecosystem'] != 'maven':
            self.log.error('Could not get sources for package {e}/{p}/{v}'.format(
                e=eco, p=pkg, v=ver))
            raise
        self.log.info('Could not get sources for maven package {p}/{v}, '
                      'will try to run on binary jar'.format(p=pkg, v=ver))
        cache_path = ObjectCache.get_from_dict(arguments).get_extracted_source_tarball()

    result_data = {'status': 'unknown', 'summary': {}, 'details': {}}
    try:
        command = [os.path.join(os.getenv('SCANCODE_PATH', '/opt/scancode-toolkit/'),
                                'scancode'),
                   # Scan for licenses
                   '--license',
                   # Do not return license matches with scores lower than this score
                   '--license-score', SCANCODE_LICENSE_SCORE,
                   # Files without findings are omitted
                   '--only-findings',
                   # Use n parallel processes
                   '--processes', SCANCODE_PROCESSES,
                   # Do not print summary or progress messages
                   '--quiet',
                   # Strip the root directory segment of all paths
                   '--strip-root',
                   # Stop scanning a file if scanning takes longer than a timeout in seconds
                   '--timeout', SCANCODE_TIMEOUT,
                   cache_path]
        output = TimedCommand.get_command_output(command, graceful=False, is_json=True,
                                                 timeout=600)
        details = self.process_output(output)
        result_data['details'] = details
        result_data['status'] = 'success'
        result_data['summary'] = {'sure_licenses': list(details['licenses'].keys())}
    except Exception:
        self.log.exception("License scan failed")
        result_data['status'] = 'error'

    return result_data

def _run_owasp_dep_check(self, scan_path, experimental=False):
    def _clean_dep_check_tmp():
        for dcdir in glob.glob(os.path.join(gettempdir(), 'dctemp*')):
            rmtree(dcdir)

    s3 = StoragePool.get_connected_storage('S3OWASPDepCheck')
    depcheck = os.path.join(os.environ['OWASP_DEP_CHECK_PATH'], 'bin', 'dependency-check.sh')
    with tempdir() as temp_data_dir:
        retrieved = s3.retrieve_depcheck_db_if_exists(temp_data_dir)
        if not retrieved:
            self.log.debug('No cached OWASP Dependency-Check DB, generating fresh now ...')
            command = [depcheck, '--updateonly', '--data', temp_data_dir]
            # give DependencyCheck 30 minutes to download the DB
            TimedCommand.get_command_output(command, graceful=False, timeout=1800)
        report_path = os.path.join(temp_data_dir, 'report.xml')
        command = [depcheck,
                   '--noupdate',
                   '--format', 'XML',
                   '--project', 'test',
                   '--data', temp_data_dir,
                   '--scan', scan_path,
                   '--out', report_path]
        if experimental:
            command.extend(['--enableExperimental'])
        output = []
        try:
            self.log.debug('Running OWASP Dependency-Check to scan %s for vulnerabilities',
                           scan_path)
            output = TimedCommand.get_command_output(command, graceful=False,
                                                     timeout=600)  # 10 minutes
            with open(report_path) as r:
                report_dict = anymarkup.parse(r.read())
        except (TaskError, FileNotFoundError) as e:
            _clean_dep_check_tmp()
            for line in output:
                self.log.warning(line)
            self.log.exception(str(e))
            return {'summary': ['OWASP Dependency-Check scan failed'],
                    'status': 'error',
                    'details': []}
        # If the CVEDBSyncTask has never been run before, we just had to create the DB
        # ourselves. Make life easier for other workers and store it to S3.
        s3.store_depcheck_db_if_not_exists(temp_data_dir)
        _clean_dep_check_tmp()

    results = []
    dependencies = report_dict.get('analysis', {}).get('dependencies', {}).get('dependency', [])
    if not isinstance(dependencies, list):
        dependencies = [dependencies]
    for dependency in dependencies:
        vulnerabilities = dependency.get('vulnerabilities', {}).get('vulnerability', [])
        if not isinstance(vulnerabilities, list):
            vulnerabilities = [vulnerabilities]
        for vulnerability in vulnerabilities:
            av = vulnerability.get('cvssAccessVector')
            av = av[0] if av else '?'
            ac = vulnerability.get('cvssAccessComplexity')
            ac = ac[0] if ac else '?'
            # the original looked up 'cvssAuthenticationr', an apparent typo
            au = vulnerability.get('cvssAuthentication')
            au = au[0] if au else '?'
            c = vulnerability.get('cvssConfidentialImpact')
            c = c[0] if c else '?'
            i = vulnerability.get('cvssIntegrityImpact')
            i = i[0] if i else '?'
            a = vulnerability.get('cvssAvailabilityImpact')
            a = a[0] if a else '?'
            vector = "AV:{AV}/AC:{AC}/Au:{Au}/C:{C}/I:{I}/A:{A}".format(
                AV=av, AC=ac, Au=au, C=c, I=i, A=a)
            result = {
                'cvss': {
                    'score': vulnerability.get('cvssScore'),
                    'vector': vector
                }
            }
            references = vulnerability.get('references', {}).get('reference', [])
            if not isinstance(references, list):
                references = [references]
            result['references'] = [r.get('url') for r in references]
            for field in ['severity', 'description']:
                result[field] = vulnerability.get(field)
            result['id'] = vulnerability.get('name')
            results.append(result)

    return {'summary': [r['id'] for r in results],
            'status': 'success',
            'details': results}

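# The vector assembled above follows CVSS v2 notation, e.g. "AV:N/AC:L/Au:N/C:P/I:P/A:P",
# built from the first letter of each report field ('?' when a field is missing).
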
def fetch_artifact(ecosystem=None, artifact=None, version=None, target_dir='.'):
    """
    download artifact from registry and process it

    :param ecosystem:
    :param artifact:
    :param version:
    :param target_dir:
    :return: tuple: (digest, artifact_path)
    """
    parsed = urlparse(artifact)
    digest = None
    artifact_path = None

    if ecosystem.is_backed_by(EcosystemBackend.pypi):
        git = Git.create_git(target_dir)
        # NOTE: we can't download Python packages via pip, because it runs setup.py
        # even with `pip download`. Therefore we could always get syntax errors
        # because of older/newer syntax.
        res = requests.get('https://pypi.python.org/pypi/{a}/json'.format(a=artifact))
        res.raise_for_status()
        if not version:
            version = res.json()['info']['version']
        release_files = res.json()['releases'][version]

        # sort releases by order in which we'd like to download:
        #  1) sdist
        #  2) wheels
        #  3) eggs
        #  4) anything else (creepy stuff)
        def release_key(rel):
            return {'sdist': 0, 'bdist_wheel': 1, 'bdist_egg': 2}.get(rel['packagetype'], 3)

        release_files = list(sorted(release_files, key=release_key))
        file_url = release_files[0]['url']
        local_filename = IndianaJones.download_file(file_url, target_dir)
        artifact_path = os.path.join(target_dir, local_filename)
        digest = compute_digest(artifact_path)
        Archive.extract(artifact_path, target_dir)
        git.add_and_commit_everything()
    elif ecosystem.is_backed_by(EcosystemBackend.npm):
        git = Git.create_git(target_dir)

        # $ npm config get cache
        # /root/.npm
        cache_path = TimedCommand.get_command_output(['npm', 'config', 'get', 'cache'],
                                                     graceful=False).pop()

        # add package to cache:
        # /root/.npm/express/
        # └── 4.13.4
        #     ├── package
        #     │   ├── History.md
        #     │   ├── index.js
        #     │   ├── lib
        #     │   ├── LICENSE
        #     │   ├── package.json
        #     │   └── Readme.md
        #     └── package.tgz
        # 3 directories, 6 files
        name_ver = artifact
        if version:
            name_ver = "{}@{}".format(artifact, version)
        # make sure the artifact is not in the cache yet
        TimedCommand.get_command_output(['npm', 'cache', 'clean', artifact], graceful=False)
        logger.info("downloading npm module %s", name_ver)
        npm_command = ['npm', 'cache', 'add', name_ver]
        TimedCommand.get_command_output(npm_command, graceful=False)

        # copy tarball to workpath
        tarball_name = "package.tgz"
        glob_path = os.path.join(cache_path, artifact, "*")
        cache_abs_path = os.path.abspath(glob.glob(glob_path).pop())
        artifact_path = os.path.join(cache_abs_path, tarball_name)
        logger.debug("[cache] tarball path = %s", artifact_path)
        artifact_path = shutil.copy(artifact_path, target_dir)
        logger.debug("[workdir] tarball path = %s", artifact_path)

        # Prior to npm-2.x.x (Fedora 24), the npm client was repackaging modules on
        # download. It modified file permissions inside package.tgz so they matched the
        # UID/GID of the user running the npm command. Therefore its digest was different
        # than that of a tarball downloaded directly from registry.npmjs.org.
        digest = compute_digest(artifact_path)
        Archive.extract(artifact_path, target_dir)

        # copy package/package.json over the extracted one,
        # because it contains (since npm >= 2.x.x) more information
        npm_package_json = os.path.join(cache_abs_path, 'package', 'package.json')
        shutil.copy(npm_package_json, target_dir)
        # copy package/npm-shrinkwrap.json to target_dir
        npm_shrinkwrap_json = os.path.join(target_dir, 'package', 'npm-shrinkwrap.json')
        if os.path.isfile(npm_shrinkwrap_json):
            shutil.copy(npm_shrinkwrap_json, target_dir)
        git.add_and_commit_everything()
    elif ecosystem.is_backed_by(EcosystemBackend.rubygems):
        git = Git.create_git(target_dir)
        logger.info("downloading rubygems package %s-%s", artifact, version)
        version_arg = []
        if version:
            version_arg = ['--version', version]
        gem_command = ['gem', 'fetch', artifact]
        gem_command.extend(version_arg)
        with cwd(target_dir):
            TimedCommand.get_command_output(gem_command, graceful=False)

        if not version:
            # if version is None we need to glob for the version that was downloaded
            artifact_path = os.path.abspath(glob.glob(os.path.join(target_dir,
                                                                   artifact + '*')).pop())
        else:
            artifact_path = os.path.join(target_dir,
                                         '{n}-{v}.gem'.format(n=artifact, v=version))
        digest = compute_digest(artifact_path)
        Archive.extract(artifact_path, target_dir)
        git.add_and_commit_everything()
    elif ecosystem.is_backed_by(EcosystemBackend.maven):
        git = Git.create_git(target_dir)
        artifact_coords = MavenCoordinates.from_str(artifact)
        # lxml can't handle HTTPS URLs
        maven_url = "http://repo1.maven.org/maven2/"
        if not version:
            version = mvn_find_latest_version(maven_url, artifact_coords)
        artifact_coords.version = version
        logger.info("downloading maven package %s", artifact_coords.to_str())

        if not artifact_coords.is_valid():
            raise ValueError("Invalid Maven coordinates: {a}".format(
                a=artifact_coords.to_str()))

        artifact_url = urljoin(maven_url, artifact_coords.to_repo_url())
        local_filename = IndianaJones.download_file(artifact_url, target_dir)
        if local_filename is None:
            raise RuntimeError("Unable to download: %s" % artifact_url)
        artifact_path = os.path.join(target_dir,
                                     os.path.split(artifact_coords.to_repo_url())[1])
        digest = compute_digest(artifact_path)
        if artifact_coords.packaging != 'pom':
            Archive.extract(artifact_path, target_dir)
        git.add_and_commit_everything()
    elif ecosystem.is_backed_by(EcosystemBackend.scm):
        git = Git.clone(artifact, target_dir)
        digest = IndianaJones.get_revision(target_dir)
        artifact_path = git.archive(artifact)
    elif parsed:
        if parsed[0] == 'git' or parsed[2].endswith('.git'):
            git = Git.clone(artifact, target_dir)
            digest = IndianaJones.get_revision(target_dir)
            artifact_path = git.archive(artifact)

    return digest, artifact_path

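# Usage sketch (hypothetical ecosystem object and package): fetch one npm release
# and receive its digest plus the path to the downloaded tarball.
# digest, path = fetch_artifact(ecosystem=npm_ecosystem, artifact='serve-static',
#                               version='1.7.1', target_dir='/tmp/work')
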
def get_revision(target_directory):
    """ Get digest of last commit """
    with cwd(target_directory):
        return TimedCommand.get_command_output(['git', 'rev-parse', 'HEAD'],
                                               graceful=False).pop()

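# Usage sketch: returns the full 40-character SHA-1 of HEAD, e.g.
# rev = IndianaJones.get_revision('/tmp/project')
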
def extract_tar(target, dest):
    TimedCommand.get_command_output(['tar', 'xf', target, '-C', dest])

def extract_zip(target, dest):
    # -o: overwrite existing files without prompting
    TimedCommand.get_command_output(['unzip', '-o', '-d', dest, target])
    # Fix possibly wrong permissions in zip files that would prevent us from deleting files.
    TimedCommand.get_command_output(['chmod', '-R', 'u+rwX,g+rwX', dest])

def _update_dep_check_db(self, data_dir):
    depcheck = os.path.join(os.environ['OWASP_DEP_CHECK_PATH'], 'bin', 'dependency-check.sh')
    self.log.debug('Updating OWASP Dependency-Check CVE DB')
    # give the update up to 30 minutes to download the DB
    TimedCommand.get_command_output([depcheck, '--updateonly', '--data', data_dir],
                                    timeout=1800)