def extract_dependencies(github_repo, github_sha):
    """Extract the dependencies information.

    Currently assumes the repository is a maven/npm/python/Go repository.

    :param github_repo: repository url
    :param github_sha: commit hash
    :return: set of direct (and indirect) dependencies
    """
    with TemporaryDirectory() as workdir:
        repo = Git.clone(url=github_repo, path=workdir, timeout=3600)
        repo.reset(revision=github_sha, hard=True)
        with cwd(repo.repo_path):
            # TODO: Make this task also work for files not present in root directory.

            # First change the package-lock.json to npm-shrinkwrap.json
            GithubDependencyTreeTask.change_package_lock_to_shrinkwrap()

            if peek(Path.cwd().glob("pom.xml")):
                return GithubDependencyTreeTask.get_maven_dependencies()
            elif peek(Path.cwd().glob("npm-shrinkwrap.json")) \
                    or peek(Path.cwd().glob("package.json")):
                return GithubDependencyTreeTask.get_npm_dependencies(repo.repo_path)
            elif peek(Path.cwd().glob("requirements.txt")):
                return GithubDependencyTreeTask.get_python_dependencies(repo.repo_path)
            elif peek(Path.cwd().glob("glide.lock")):
                return GithubDependencyTreeTask.get_go_glide_dependencies(repo.repo_path)
            elif peek(Path.cwd().glob("Gopkg.lock")):
                return GithubDependencyTreeTask.get_go_pkg_dependencies()
            else:
                raise TaskError("Please provide a maven, npm, "
                                "python, or Go repository for scanning!")
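# The dispatch above relies on a small `peek` helper to test whether a
# Path.glob() generator yielded anything. A minimal sketch of what it is
# assumed to do (the project's actual helper may differ):
def peek(iterable):
    """Return the first item of `iterable`, or None if it is empty."""
    for item in iterable:
        return item
    return None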
def add(self, path):
    """Add path to index.

    :param path: str
    """
    with cwd(self.repo_path):
        TimedCommand.get_command_output(["git", "add", path], graceful=False)
def reset(self, revision, hard=False):
    """Run 'git reset'."""
    cmd = ["git", "reset", revision]
    if hard:
        cmd.extend(["--hard"])
    with cwd(self.repo_path):
        TimedCommand.get_command_output(cmd, graceful=False)
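# Nearly every helper in this collection enters `with cwd(...)` before
# running its command. A minimal sketch of such a context manager, assuming
# it only needs to chdir in and restore the previous directory on exit:
import os
from contextlib import contextmanager

@contextmanager
def cwd(path):
    """Temporarily change the working directory to `path`."""
    previous = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous)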
def store_victims_db(self, victims_db_dir):
    """Zip victims_db_dir/* and store to S3 as VICTIMS_DB_ARCHIVE."""
    with tempdir() as temp_archive_dir:
        temp_archive_path = os.path.join(temp_archive_dir, self.VICTIMS_DB_ARCHIVE)
        with cwd(victims_db_dir):
            Archive.zip_file('.', temp_archive_path)
            self.store_file(temp_archive_path, self.VICTIMS_DB_ARCHIVE)
def commit(self, message='blank'):
    """Commit git repository.

    :param message: str, commit message
    """
    # --git-dir does not behave as expected, so change directory instead:
    # http://stackoverflow.com/questions/1386291/git-git-dir-not-working-as-expected
    with cwd(self.repo_path):
        TimedCommand.get_command_output(["git", "commit", "-m", message],
                                        graceful=False)
def rev_parse(self, args=None):
    """Run git rev-parse.

    :param args: arguments to pass to `git rev-parse`
    :return: [str], output from `git rev-parse`
    """
    cmd = ["git", "rev-parse"]
    if args:
        cmd.extend(args)
    with cwd(self.repo_path):
        return TimedCommand.get_command_output(cmd, graceful=False)
def _resolve_versions(to_solve):
    """Resolve version ranges in to_solve.

    :param to_solve: {"groupId:artifactId": "version-range"}
    :return: {"groupId:artifactId": "version"}
    """
    if not to_solve:
        return {}
    with TemporaryDirectory() as tmpdir:
        with cwd(tmpdir):
            MavenSolver._generate_pom_xml(to_solve)
            return MavenSolver._dependencies_from_pom_xml()
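# Hypothetical usage of _resolve_versions; the input/output shapes follow the
# docstring, and the resolved version shown is illustrative only:
#
#   resolved = MavenSolver._resolve_versions(
#       {"org.slf4j:slf4j-api": "[1.7.0,1.8.0)"})
#   # -> e.g. {"org.slf4j:slf4j-api": "1.7.36"}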
def get_manifest_file_from_git_repo(git_repo_url):
    """Clone the repository and return its pom.xml as a file object, if present."""
    with TemporaryDirectory() as workdir:
        try:
            # Clone into the temporary directory (it was previously left unused
            # because the clone targeted "/tmp/" directly).
            repo = Git.clone(url=git_repo_url, path=workdir)
        except Exception as e:
            print("Exception %r" % e)
            raise
        with cwd(repo.repo_path):
            if peek(Path.cwd().glob("pom.xml")):
                print('{}/pom.xml'.format(Path.cwd()))
                f = open('{}/pom.xml'.format(Path.cwd()))
                return f
    return None
def fetch_scm_artifact(name, version, target_dir):
    """Fetch a go artifact from SCM using 'go get' and archive the checkout."""
    env = dict(os.environ)
    env['GOPATH'] = target_dir
    # graceful=True: a failed 'go get' does not raise here.
    TimedCommand.get_command_output(['go', 'get', '-d', name],
                                    timeout=300, env=env, graceful=True)
    package_dir = os.path.join(target_dir, 'src', name)
    with cwd(package_dir):
        git = Git(package_dir)
        git.reset(version, hard=True)
        artifact_filename = git.archive(version)
        artifact_path = os.path.join(package_dir, artifact_filename)
        digest = compute_digest(artifact_path)
        return digest, artifact_path
def extract_gem(target, dest):
    """Extract target gem and gemspec.

    Gem into $dest/sources
    Gemspec (renamed to rubygems-metadata.yaml) into $dest/metadata/
    """
    sources = os.path.join(dest, 'sources')
    metadata = os.path.join(dest, 'metadata')
    TimedCommand.get_command_output(['mkdir', '-p', sources, metadata])
    TimedCommand.get_command_output(['gem', 'unpack', target, '--target', sources])
    with cwd(metadata):
        # `gem unpack --spec` ignores --target, so we need to cwd
        TimedCommand.get_command_output(['gem', 'unpack', target, '--spec'])
        metadatayaml = glob.glob('*.gemspec').pop()
        os.rename(metadatayaml, 'rubygems-metadata.yaml')
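# Hypothetical usage of extract_gem; the gem file name and paths are
# illustrative only:
#
#   extract_gem('/tmp/example-1.0.0.gem', '/tmp/out')
#   # -> gem sources under /tmp/out/sources/,
#   #    gemspec at /tmp/out/metadata/rubygems-metadata.yaml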
def archive(self, basename, sub_path=None):
    """Create an archive; simply calls `git archive`.

    :param basename: str, name of the resulting archive, without file extension (suffix)
    :param sub_path: str, only add files found under this path to the archive;
                     default: add all files from the repository (.git/ is always excluded)
    :return: str, filename
    """
    suffix = "tar.gz"
    filename = basename + "." + suffix
    with cwd(self.repo_path):
        cmd = [
            "git", "archive",
            "--format={}".format(suffix),
            "--output={}".format(filename),
            "HEAD"
        ]
        if sub_path:
            cmd.append(sub_path)
        TimedCommand.get_command_output(cmd)
    return filename
def fetch_go_artifact(name, version, target_dir):
    """Fetch go artifact using 'go get' command."""
    env = dict(os.environ)
    env['GOPATH'] = target_dir
    Git.config()
    try:
        TimedCommand.get_command_output(['go', 'get', '-d', name],
                                        timeout=300, env=env, graceful=False)
    except TaskError:
        raise NotABugTaskError('Unable to go-get {n}'.format(n=name))
    package_dir = os.path.join(target_dir, 'src', name)
    with cwd(package_dir):
        git = Git(package_dir)
        git.reset(version, hard=True)
        artifact_filename = git.archive(version)
        artifact_path = os.path.join(package_dir, artifact_filename)
        digest = compute_digest(artifact_path)
        return digest, artifact_path
def archive(self, basename, basedir=None, sub_path=None, format="tar.gz"):
    """Create an archive; simply calls `git archive`.

    :param basename: str, name of the resulting archive, without file extension (suffix)
    :param basedir: str, path to a directory where to store the resulting archive
    :param sub_path: str, only add files found under this path to the archive;
                     default: add all files from the repository (.git/ is always excluded)
    :param format: str, format of the resulting archive, default: 'tar.gz'
    :return: str, filename
    """
    filename = os.path.join(basedir or "", basename + "." + format)
    with cwd(self.repo_path):
        cmd = [
            "git", "archive",
            "--format={}".format(format),
            "--output={}".format(filename),
            "HEAD"
        ]
        if sub_path:
            cmd.append(sub_path)
        TimedCommand.get_command_output(cmd)
    return filename
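# Hypothetical usage of the extended archive(); names and paths are
# illustrative only:
#
#   git = Git(repo_path)  # assumes a Git wrapper bound to a checked-out repo
#   tarball = git.archive("mypackage-1.0.0", basedir="/tmp", sub_path="src")
#   # -> "/tmp/mypackage-1.0.0.tar.gz", containing only files under src/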
def extract_dependencies(github_repo, github_sha):
    """Extract the dependencies information.

    Currently assumes the repository is a maven repository.
    """
    with TemporaryDirectory() as workdir:
        repo = Git.clone(url=github_repo, path=workdir, timeout=3600)
        repo.reset(revision=github_sha, hard=True)
        with cwd(repo.repo_path):
            output_file = Path.cwd() / "dependency-tree.txt"
            cmd = ["mvn",
                   "org.apache.maven.plugins:maven-dependency-plugin:3.0.2:tree",
                   "-DoutputType=dot",
                   "-DoutputFile={filename}".format(filename=output_file),
                   "-DappendOutput=true"]
            timed_cmd = TimedCommand(cmd)
            status, output, _ = timed_cmd.run(timeout=3600)
            if status != 0 or not output_file.is_file():
                # all errors are in stdout, not stderr
                raise TaskError(output)
            with output_file.open() as f:
                return GithubDependencyTreeTask.parse_maven_dependency_tree(
                    f.readlines())
def extract_dependencies(github_repo, github_sha=None, user_flow=False):
    """Extract the dependencies information.

    Currently assumes the repository is a maven/npm/python/Go repository.

    :param user_flow: to indicate if user flow is invoked
    :param github_repo: repository url
    :param github_sha: commit hash
    :return: set of direct (and indirect) dependencies
    """
    with TemporaryDirectory() as workdir:
        repo = Git.clone(url=github_repo, path=workdir, timeout=3600)
        if github_sha is not None:
            repo.reset(revision=github_sha, hard=True)
        with cwd(repo.repo_path):
            # TODO: Make this task also work for files not present in root directory.

            # First change the package-lock.json to npm-shrinkwrap.json
            GithubDependencyTreeTask.change_package_lock_to_shrinkwrap()

            # Since user flow is only invoked for maven, pass the flag only there.
            if peek(Path.cwd().glob("pom.xml")):
                return GithubDependencyTreeTask.get_maven_dependencies(user_flow)
            elif peek(Path.cwd().glob("npm-shrinkwrap.json")) \
                    or peek(Path.cwd().glob("package.json")):
                return GithubDependencyTreeTask.get_npm_dependencies(repo.repo_path)
            elif peek(Path.cwd().glob("requirements.txt")):
                return GithubDependencyTreeTask.get_python_dependencies(repo.repo_path)
            elif peek(Path.cwd().glob("glide.lock")):
                return GithubDependencyTreeTask.get_go_glide_dependencies(repo.repo_path)
            elif peek(Path.cwd().glob("Gopkg.lock")):
                return GithubDependencyTreeTask.get_go_pkg_dependencies()
            else:
                return None
def fetch_rubygems_artifact(name, version, target_dir):
    """Fetch a rubygems artifact using 'gem fetch' and extract it."""
    git = Git.create_git(target_dir)
    logger.info("downloading rubygems package %s-%s", name, version)
    version_arg = []
    if version:
        version_arg = ['--version', version]
    gem_command = ['gem', 'fetch', name]
    gem_command.extend(version_arg)
    with cwd(target_dir):
        TimedCommand.get_command_output(gem_command, graceful=False)

    if not version:
        # if version is None we need to glob for the version that was downloaded
        artifact_path = os.path.abspath(
            glob.glob(os.path.join(target_dir, name + '*')).pop())
    else:
        artifact_path = os.path.join(target_dir,
                                     '{n}-{v}.gem'.format(n=name, v=version))

    digest = compute_digest(artifact_path)
    Archive.extract(artifact_path, target_dir)
    git.add_and_commit_everything()
    return digest, artifact_path
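# The fetch_* helpers above return a digest from compute_digest. A minimal
# sketch, assuming a streamed SHA-256 hex digest of the artifact file (the
# project's actual helper may use a different algorithm):
import hashlib

def compute_digest(path, chunk_size=65536):
    """Return the SHA-256 hex digest of the file at `path`."""
    sha = hashlib.sha256()
    with open(path, 'rb') as f:
        # Read in chunks so large artifacts are not loaded into memory.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            sha.update(chunk)
    return sha.hexdigest()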
def get_revision(target_directory):
    """Get digest of last commit."""
    with cwd(target_directory):
        return TimedCommand.get_command_output(
            ['git', 'rev-parse', 'HEAD'], graceful=False).pop()
def _use_maven_index_checker(self):
    """Run maven-index-checker for a release range and schedule analyses."""
    maven_index_checker_dir = os.getenv('MAVEN_INDEX_CHECKER_PATH')
    target_dir = os.path.join(maven_index_checker_dir, 'target')
    central_index_dir = os.path.join(target_dir, 'central-index')
    timestamp_path = os.path.join(central_index_dir, 'timestamp')

    s3 = StoragePool.get_connected_storage('S3MavenIndex')
    self.log.info('Fetching pre-built maven index from S3, if available.')
    s3.retrieve_index_if_exists(target_dir)

    old_timestamp = 0
    try:
        old_timestamp = int(os.stat(timestamp_path).st_mtime)
    except OSError:
        self.log.info('Timestamp is missing, we will probably need to build '
                      'the index from scratch.')

    java_temp_dir = tempfile.mkdtemp()

    index_range = '{}-{}'.format(self.count.min, self.count.max)
    command = ['java', '-Xmx768m',
               '-Djava.io.tmpdir={}'.format(java_temp_dir),
               '-jar', 'maven-index-checker.jar', '-r', index_range]
    if self.latest_version_only:
        command.append('-l')
    with cwd(maven_index_checker_dir):
        try:
            output = TimedCommand.get_command_output(command, is_json=True,
                                                     graceful=False, timeout=1200)
            new_timestamp = int(os.stat(timestamp_path).st_mtime)
            if old_timestamp != new_timestamp:
                self.log.info('Storing pre-built maven index to S3...')
                s3.store_index(target_dir)
                self.log.debug('Stored. Index in S3 is up-to-date.')
            else:
                self.log.info('Index in S3 is up-to-date.')
        except TaskError as e:
            self.log.exception(e)
            # `output` would be unbound below, so propagate the failure.
            raise
        finally:
            rmtree(central_index_dir)
            self.log.debug('central-index/ deleted')
            rmtree(java_temp_dir)

    s3data = StoragePool.get_connected_storage('S3Data')
    bucket = s3data._s3.Bucket(s3data.bucket_name)
    for idx, release in enumerate(output):
        name = '{}:{}'.format(release['groupId'], release['artifactId'])
        version = release['version']
        # For now (can change in future) we want to analyze only ONE version
        # of each package.
        try:
            next(iter(bucket.objects.filter(
                Prefix='{e}/{p}/'.format(e=self.ecosystem, p=name)).limit(1)))
            self.log.info("Analysis of some version of %s has already been "
                          "scheduled, skipping version %s", name, version)
            continue
        except StopIteration:
            self.log.info("Scheduling #%d.", self.count.min + idx)
            self.analyses_selinon_flow(name, version)
def execute(self): self.log.info("Checking maven index for new releases") maven_index_checker_dir = os.getenv('MAVEN_INDEX_CHECKER_PATH') target_dir = os.path.join(maven_index_checker_dir, 'target') central_index_dir = os.path.join(target_dir, 'central-index') timestamp_path = os.path.join(central_index_dir, 'timestamp') s3 = StoragePool.get_connected_storage('S3MavenIndex') self.log.info('Fetching pre-built maven index from S3, if available.') s3.retrieve_index_if_exists(target_dir) old_timestamp = 0 try: old_timestamp = int(os.stat(timestamp_path).st_mtime) except OSError: self.log.info( 'Timestamp is missing, we need to build the index from scratch.' ) pass last_offset = s3.get_last_offset() with tempdir() as java_temp_dir: cmd = [ 'java', '-Xmx768m', '-Djava.io.tmpdir={}'.format(java_temp_dir), '-jar', 'maven-index-checker.jar', '-c' ] with cwd(maven_index_checker_dir): output = TimedCommand.get_command_output(cmd, is_json=True, graceful=False, timeout=1200) current_count = output['count'] new_timestamp = int(os.stat(timestamp_path).st_mtime) if old_timestamp != new_timestamp: self.log.info('Storing pre-built maven index to S3...') s3.store_index(target_dir) self.log.debug('Stored. Index in S3 is up-to-date.') if old_timestamp == 0: s3.set_last_offset(current_count) self.log.info( 'This is first run, i.e. all packages are considered new. ' 'Skipping scheduling to not analyze all packages in index.' ) return else: self.log.info('Index in S3 is up-to-date.') self.log.debug( "Number of entries in maven indexer: %d, " "last offset used: %d", current_count, last_offset) to_schedule_count = current_count - last_offset if to_schedule_count == 0: self.log.info("No new packages to schedule, exiting...") return cmd = [ 'java', '-Xmx768m', '-Djava.io.tmpdir={}'.format(java_temp_dir), '-jar', 'maven-index-checker.jar', '-r', '0-{}'.format(to_schedule_count) ] output = TimedCommand.get_command_output(cmd, is_json=True, graceful=False, timeout=1200) self.log.info( "Found %d new packages to analyse, scheduling analyses...", len(output)) for entry in output: self.run_selinon_flow( 'bayesianFlow', { 'ecosystem': 'maven', 'name': '{groupId}:{artifactId}'.format(**entry), 'version': entry['version'], 'recursive_limit': 0 }) s3.set_last_offset(current_count) self.log.info( "All new maven releases scheduled for analysis, exiting..")
def execute(self): """Start the analysis.""" self.log.info("Checking maven index for new releases") maven_index_checker_dir = os.getenv('MAVEN_INDEX_CHECKER_PATH') maven_index_checker_data_dir = os.environ.get( 'MAVEN_INDEX_CHECKER_DATA_PATH', '/tmp/index-checker') os.makedirs(maven_index_checker_data_dir, exist_ok=True) central_index_dir = os.path.join(maven_index_checker_data_dir, 'central-index') timestamp_path = os.path.join(central_index_dir, 'timestamp') self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses", "maven_index_checker_dir", maven_index_checker_dir)) self.log.info("{}__:__{}__:__{}".format( "MavenReleasesAnalyses", "maven_index_checker_data_dir", maven_index_checker_data_dir)) self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses", "central_index_dir", central_index_dir)) self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses", "timestamp_path", timestamp_path)) s3 = StoragePool.get_connected_storage('S3MavenIndex') self.log.info('Fetching pre-built maven index from S3, if available.') s3.retrieve_index_if_exists(maven_index_checker_data_dir) old_timestamp = 0 try: old_timestamp = int(os.stat(timestamp_path).st_mtime) except OSError: self.log.info( 'Timestamp is missing, we need to build the index from scratch.' ) pass last_offset = s3.get_last_offset() self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses", "last_offset", last_offset)) java_temp_dir = tempfile.mkdtemp(prefix='tmp-', dir=os.environ.get('PV_DIR', '/tmp')) cmd = [ 'java', '-Xmx768m', '-Djava.io.tmpdir={}'.format(java_temp_dir), '-DcentralIndexDir={}'.format(central_index_dir), '-jar', 'maven-index-checker.jar', '-c' ] self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses", "cmd1", cmd)) with cwd(maven_index_checker_dir): try: output = TimedCommand.get_command_output(cmd, is_json=True, graceful=False, timeout=10800) self.log.info("{}__:__{}__:__{}".format( "MavenReleasesAnalyses", "output", output)) current_count = output['count'] new_timestamp = int(os.stat(timestamp_path).st_mtime) if old_timestamp != new_timestamp: self.log.info('Storing pre-built maven index to S3...') s3.store_index(maven_index_checker_data_dir) self.log.debug('Stored. Index in S3 is up-to-date.') if old_timestamp == 0: s3.set_last_offset(current_count) self.log.info( 'This is first run, i.e. all packages are considered new. ' 'Skipping scheduling to not analyze all packages in index.' 
) return else: self.log.info('Index in S3 is up-to-date.') self.log.debug( "Number of entries in maven indexer: %d, " "last offset used: %d", current_count, last_offset) to_schedule_count = current_count - last_offset if to_schedule_count == 0: self.log.info("No new packages to schedule, exiting...") return cmd = [ 'java', '-Xmx768m', '-Djava.io.tmpdir={}'.format(java_temp_dir), '-DcentralIndexDir={}'.format(central_index_dir), '-jar', 'maven-index-checker.jar', '-r', '0-{}'.format(to_schedule_count) ] self.log.info("{}__:__{}__:__{}".format( "MavenReleasesAnalyses", "cmd2", cmd)) output = TimedCommand.get_command_output(cmd, is_json=True, graceful=False, timeout=10800) self.log.info("{}__:__{}__:__{}".format( "MavenReleasesAnalyses", "output", output)) except TaskError as e: self.log.info("{}__:__{}__:__{}".format( "MavenReleasesAnalyses", "TaskError", e)) self.log.exception(e) raise finally: rmtree(central_index_dir) self.log.debug('central-index/ deleted') rmtree(java_temp_dir) self.log.info( "Found %d new packages to analyse, scheduling analyses...", len(output)) for entry in output: self.log.info("{}__:__{}__:__{}".format( "MavenReleasesAnalyses", "Running ingestion for", entry)) self.run_selinon_flow( 'bayesianFlow', { 'ecosystem': 'maven', 'name': '{groupId}:{artifactId}'.format(**entry), 'version': entry['version'], 'recursive_limit': 0 }) self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses", "current_count", current_count)) s3.set_last_offset(current_count) self.log.info( "All new maven releases scheduled for analysis, exiting..")