Example #1
    def extract_dependencies(github_repo, github_sha):
        """Extract the dependencies information.

        Currently assumes the repository is a maven/npm/python/Go repository.

        :param github_repo: repository url
        :param github_sha: commit hash
        :return: set of direct (and indirect) dependencies
        """
        with TemporaryDirectory() as workdir:
            repo = Git.clone(url=github_repo, path=workdir, timeout=3600)
            repo.reset(revision=github_sha, hard=True)
            with cwd(repo.repo_path):
                # TODO: Make this task also work for files not present in root directory.

                # First change the package-lock.json to npm-shrinkwrap.json
                GithubDependencyTreeTask.change_package_lock_to_shrinkwrap()

                if peek(Path.cwd().glob("pom.xml")):
                    return GithubDependencyTreeTask.get_maven_dependencies()
                elif peek(Path.cwd().glob("npm-shrinkwrap.json")) \
                        or peek(Path.cwd().glob("package.json")):
                    return GithubDependencyTreeTask.get_npm_dependencies(
                        repo.repo_path)
                elif peek(Path.cwd().glob("requirements.txt")):
                    return GithubDependencyTreeTask.get_python_dependencies(
                        repo.repo_path)
                elif peek(Path.cwd().glob("glide.lock")):
                    return GithubDependencyTreeTask.get_go_glide_dependencies(
                        repo.repo_path)
                elif peek(Path.cwd().glob("Gopkg.lock")):
                    return GithubDependencyTreeTask.get_go_pkg_dependencies()
                else:
                    raise TaskError("Please provide maven or npm or "
                                    "python or Go repository for scanning!")
Example #2
    def add(self, path):
        """Add path to index.

        :param path: str
        """
        with cwd(self.repo_path):
            TimedCommand.get_command_output(["git", "add", path], graceful=False)
Example #3
    def reset(self, revision, hard=False):
        """Run 'git reset'."""
        cmd = ["git", "reset", revision]
        if hard:
            cmd.append("--hard")
        with cwd(self.repo_path):
            TimedCommand.get_command_output(cmd, graceful=False)
Example #4
    def store_victims_db(self, victims_db_dir):
        """Zip victims_db_dir/* and store to S3 as VICTIMS_DB_ARCHIVE."""
        with tempdir() as temp_archive_dir:
            temp_archive_path = os.path.join(temp_archive_dir,
                                             self.VICTIMS_DB_ARCHIVE)
            with cwd(victims_db_dir):
                Archive.zip_file('.', temp_archive_path)
                self.store_file(temp_archive_path, self.VICTIMS_DB_ARCHIVE)
Example #5
    def commit(self, message='blank'):
        """Commit git repository.

        :param message: str, commit message
        """
        # --git-dir is #$%^&&
        # http://stackoverflow.com/questions/1386291/git-git-dir-not-working-as-expected
        with cwd(self.repo_path):
            TimedCommand.get_command_output(["git", "commit", "-m", message], graceful=False)
Example #6
    def rev_parse(self, args=None):
        """Run git rev-parse.

        :param args: arguments to pass to `git rev-parse`
        :return: [str], output from `git rev-parse`
        """
        cmd = ["git", "rev-parse"]
        if args:
            cmd.extend(args)

        with cwd(self.repo_path):
            return TimedCommand.get_command_output(cmd, graceful=False)
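A hypothetical call, assuming a `Git` instance named `git`: since the underlying command output comes back as a list of lines, the commit hash is the single element for `HEAD`:

    git = Git(repo_path)                 # hypothetical instance
    sha = git.rev_parse(["HEAD"]).pop()  # e.g. the full 40-char commit hash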
Example #7
    def _resolve_versions(to_solve):
        """Resolve version ranges in to_solve.

        :param to_solve: {"groupId:artifactId": "version-range"}
        :return: {"groupId:artifactId": "version"}
        """
        if not to_solve:
            return {}
        with TemporaryDirectory() as tmpdir:
            with cwd(tmpdir):
                MavenSolver._generate_pom_xml(to_solve)
                return MavenSolver._dependencies_from_pom_xml()
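For illustration, a hypothetical input and the shape of the result; the exact resolved version depends on what Maven sees at run time:

    to_solve = {"org.slf4j:slf4j-api": "[1.7,1.8)"}      # version range
    resolved = MavenSolver._resolve_versions(to_solve)
    # => {"org.slf4j:slf4j-api": "1.7.36"}  (some concrete version in the range)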
Example #8
def get_manifest_file_from_git_repo(git_repo_url):
    """Clone the given git repository and return its pom.xml as an open file."""
    with TemporaryDirectory() as workdir:
        try:
            repo = Git.clone(url=git_repo_url, path=workdir)
        except Exception as e:
            print("Exception %r" % e)
            raise

        with cwd(repo.repo_path):
            if peek(Path.cwd().glob("pom.xml")):
                print('{}/pom.xml'.format(Path.cwd()))
                # The open handle stays readable on POSIX even after the
                # temporary directory is cleaned up on return.
                return open('{}/pom.xml'.format(Path.cwd()))
    return None
Example #9
    def fetch_scm_artifact(name, version, target_dir):
        """Fetch an SCM artifact using 'go get' and archive it at the given revision."""
        env = dict(os.environ)
        env['GOPATH'] = target_dir
        TimedCommand.get_command_output(['go', 'get', '-d', name],
                                        timeout=300,
                                        env=env,
                                        graceful=True)
        package_dir = os.path.join(target_dir, 'src', name)
        with cwd(package_dir):
            git = Git(package_dir)
            git.reset(version, hard=True)
            artifact_filename = git.archive(version)
            artifact_path = os.path.join(package_dir, artifact_filename)
            digest = compute_digest(artifact_path)
            return digest, artifact_path
Example #10
    def extract_gem(target, dest):
        """Extract target gem and gemspec.

        Gem into $dest/sources
        Gemspec (renamed to rubygems-metadata.yaml) into $dest/metadata/
        """
        sources = os.path.join(dest, 'sources')
        metadata = os.path.join(dest, 'metadata')
        TimedCommand.get_command_output(['mkdir', '-p', sources, metadata])
        TimedCommand.get_command_output(['gem', 'unpack', target, '--target', sources])
        with cwd(metadata):
            # --spec ignores --target, so we need to cwd
            TimedCommand.get_command_output(['gem', 'unpack', target, '--spec'])
            metadatayaml = glob.glob('*.gemspec').pop()
            os.rename(metadatayaml, 'rubygems-metadata.yaml')
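A hypothetical invocation; the paths are illustrative only:

    extract_gem('/tmp/rails-6.0.0.gem', '/tmp/out')
    # /tmp/out/sources/rails-6.0.0/...            unpacked gem contents
    # /tmp/out/metadata/rubygems-metadata.yaml    renamed gemspec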
Example #11
    def archive(self, basename, sub_path=None):
        """Create an archive; simply calls `git archive`.

        :param basename: str, name of the resulting archive, without file extension (suffix)
        :param sub_path: str, only add files found under this path to the archive;
                          default: add all files from the repository (.git/ is always excluded)
        :return: str, filename
        """
        suffix = "tar.gz"
        filename = basename + "." + suffix
        with cwd(self.repo_path):
            cmd = [
                "git", "archive", "--format={}".format(suffix),
                "--output={}".format(filename), "HEAD"
            ]
            if sub_path:
                cmd.append(sub_path)
            TimedCommand.get_command_output(cmd)

        return filename
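Hypothetical usage; `git archive` writes the file inside the repository path, so the returned name is relative to it:

    git = Git(repo_path)             # hypothetical instance
    filename = git.archive("v1.0.0")
    # => 'v1.0.0.tar.gz', created inside repo_path from HEAD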
Example #12
    def fetch_go_artifact(name, version, target_dir):
        """Fetch go artifact using 'go get' command."""
        env = dict(os.environ)
        env['GOPATH'] = target_dir
        Git.config()
        try:
            TimedCommand.get_command_output(['go', 'get', '-d', name],
                                            timeout=300,
                                            env=env,
                                            graceful=False)
        except TaskError:
            raise NotABugTaskError('Unable to go-get {n}'.format(n=name))
        package_dir = os.path.join(target_dir, 'src', name)
        with cwd(package_dir):
            git = Git(package_dir)
            git.reset(version, hard=True)
            artifact_filename = git.archive(version)
            artifact_path = os.path.join(package_dir, artifact_filename)
            digest = compute_digest(artifact_path)
            return digest, artifact_path
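Compared with `fetch_scm_artifact` in Example #9, this variant runs `go get` with `graceful=False` and converts the failure into `NotABugTaskError`, so a missing upstream package is reported as a user-facing error rather than a worker bug. A hypothetical call (import path and tag are illustrative):

    digest, path = fetch_go_artifact('github.com/gorilla/mux', 'v1.8.0', '/tmp/gopath')
    # path: /tmp/gopath/src/github.com/gorilla/mux/v1.8.0.tar.gz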
Example #13
    def archive(self, basename, basedir=None, sub_path=None, format="tar.gz"):
        """Create an archive; simply calls `git archive`.

        :param basename: str, name of the resulting archive, without file extension (suffix)
        :param basedir: str, path to a directory where to store the resulting archive
        :param sub_path: str, only add files found under this path to the archive;
                          default: add all files from the repository (.git/ is always excluded)
        :param format: str, format of the resulting archive, default: 'tar.gz'
        :return: str, filename
        """
        filename = os.path.join(basedir or "", basename + "." + format)
        with cwd(self.repo_path):
            cmd = [
                "git", "archive", "--format={}".format(format),
                "--output={}".format(filename), "HEAD"
            ]
            if sub_path:
                cmd.append(sub_path)
            TimedCommand.get_command_output(cmd)

        return filename
Example #14
    def extract_dependencies(github_repo, github_sha):
        """Extract the dependencies information.

        Currently assumes the repository is a maven repository.
        """
        with TemporaryDirectory() as workdir:
            repo = Git.clone(url=github_repo, path=workdir, timeout=3600)
            repo.reset(revision=github_sha, hard=True)
            with cwd(repo.repo_path):
                output_file = Path.cwd() / "dependency-tree.txt"
                cmd = ["mvn", "org.apache.maven.plugins:maven-dependency-plugin:3.0.2:tree",
                       "-DoutputType=dot",
                       "-DoutputFile={filename}".format(filename=output_file),
                       "-DappendOutput=true"]
                timed_cmd = TimedCommand(cmd)
                status, output, _ = timed_cmd.run(timeout=3600)
                if status != 0 or not output_file.is_file():
                    # all errors are in stdout, not stderr
                    raise TaskError(output)
                with output_file.open() as f:
                    return GithubDependencyTreeTask.parse_maven_dependency_tree(f.readlines())
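`parse_maven_dependency_tree` is not shown on this page. With `-DoutputType=dot` the dependency plugin emits edges such as `"com.example:app:jar:1.0" -> "junit:junit:jar:4.12:test" ;`, so a minimal hypothetical parser could simply collect the quoted coordinates on the right-hand side of each edge:

    import re

    def parse_maven_dependency_tree(lines):
        """Collect the target of every dot edge (sketch only; the real
        parser may keep scopes or build an actual tree)."""
        deps = set()
        for line in lines:
            match = re.match(r'\s*"[^"]+" -> "([^"]+)"', line)
            if match:
                deps.add(match.group(1))
        return deps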
Example #15
    def extract_dependencies(github_repo, github_sha=None, user_flow=False):
        """Extract the dependencies information.

        Currently assumes the repository is a maven/npm/python/Go repository.

        :param user_flow: to indicate if user flow is invoked
        :param github_repo: repository url
        :param github_sha: commit hash
        :return: set of direct (and indirect) dependencies
        """
        with TemporaryDirectory() as workdir:
            repo = Git.clone(url=github_repo, path=workdir, timeout=3600)
            if github_sha is not None:
                repo.reset(revision=github_sha, hard=True)
            with cwd(repo.repo_path):
                # TODO: Make this task also work for files not present in root directory.

                # First change the package-lock.json to npm-shrinkwrap.json
                GithubDependencyTreeTask.change_package_lock_to_shrinkwrap()

                # Since the user flow is invoked only for maven, pass this flag only to the maven handler
                if peek(Path.cwd().glob("pom.xml")):
                    return GithubDependencyTreeTask.get_maven_dependencies(
                        user_flow)
                elif peek(Path.cwd().glob("npm-shrinkwrap.json")) \
                        or peek(Path.cwd().glob("package.json")):
                    return GithubDependencyTreeTask.get_npm_dependencies(
                        repo.repo_path)
                elif peek(Path.cwd().glob("requirements.txt")):
                    return GithubDependencyTreeTask.get_python_dependencies(
                        repo.repo_path)
                elif peek(Path.cwd().glob("glide.lock")):
                    return GithubDependencyTreeTask.get_go_glide_dependencies(
                        repo.repo_path)
                elif peek(Path.cwd().glob("Gopkg.lock")):
                    return GithubDependencyTreeTask.get_go_pkg_dependencies()
                else:
                    return None
Example #16
    def fetch_rubygems_artifact(name, version, target_dir):
        """Fetch a rubygems artifact using 'gem fetch'."""
        git = Git.create_git(target_dir)
        logger.info("downloading rubygems package %s-%s", name, version)
        version_arg = []
        if version:
            version_arg = ['--version', version]
        gem_command = ['gem', 'fetch', name]
        gem_command.extend(version_arg)
        with cwd(target_dir):
            TimedCommand.get_command_output(gem_command, graceful=False)

        if not version:
            # if version is None we need to glob for the version that was downloaded
            artifact_path = os.path.abspath(
                glob.glob(os.path.join(target_dir, name + '*')).pop())
        else:
            artifact_path = os.path.join(
                target_dir, '{n}-{v}.gem'.format(n=name, v=version))

        digest = compute_digest(artifact_path)
        Archive.extract(artifact_path, target_dir)
        git.add_and_commit_everything()
        return digest, artifact_path
Example #17
    def get_revision(target_directory):
        """Get digest of last commit."""
        with cwd(target_directory):
            return TimedCommand.get_command_output(
                ['git', 'rev-parse', 'HEAD'], graceful=False).pop()
Example #18
    def _use_maven_index_checker(self):
        """Run maven-index-checker and schedule analyses for the reported releases."""
        maven_index_checker_dir = os.getenv('MAVEN_INDEX_CHECKER_PATH')
        target_dir = os.path.join(maven_index_checker_dir, 'target')
        central_index_dir = os.path.join(target_dir, 'central-index')
        timestamp_path = os.path.join(central_index_dir, 'timestamp')

        s3 = StoragePool.get_connected_storage('S3MavenIndex')
        self.log.info('Fetching pre-built maven index from S3, if available.')
        s3.retrieve_index_if_exists(target_dir)

        old_timestamp = 0
        try:
            old_timestamp = int(os.stat(timestamp_path).st_mtime)
        except OSError:
            self.log.info(
                'Timestamp is missing, we will probably need to build the index from scratch.'
            )

        java_temp_dir = tempfile.mkdtemp()

        index_range = '{}-{}'.format(self.count.min, self.count.max)
        command = [
            'java', '-Xmx768m', '-Djava.io.tmpdir={}'.format(java_temp_dir),
            '-jar', 'maven-index-checker.jar', '-r', index_range
        ]
        if self.latest_version_only:
            command.append('-l')
        with cwd(maven_index_checker_dir):
            try:
                output = TimedCommand.get_command_output(command,
                                                         is_json=True,
                                                         graceful=False,
                                                         timeout=1200)

                new_timestamp = int(os.stat(timestamp_path).st_mtime)
                if old_timestamp != new_timestamp:
                    self.log.info('Storing pre-built maven index to S3...')
                    s3.store_index(target_dir)
                    self.log.debug('Stored. Index in S3 is up-to-date.')
                else:
                    self.log.info('Index in S3 is up-to-date.')
            except TaskError as e:
                self.log.exception(e)
            finally:
                rmtree(central_index_dir)
                self.log.debug('central-index/ deleted')
                rmtree(java_temp_dir)

            s3data = StoragePool.get_connected_storage('S3Data')
            bucket = s3data._s3.Bucket(s3data.bucket_name)
            for idx, release in enumerate(output):
                name = '{}:{}'.format(release['groupId'],
                                      release['artifactId'])
                version = release['version']
                # For now (can change in future) we want to analyze only ONE version of each package
                try:
                    next(
                        iter(
                            bucket.objects.filter(Prefix='{e}/{p}/'.format(
                                e=self.ecosystem, p=name)).limit(1)))
                    self.log.info(
                        "Analysis of some version of %s has already been scheduled, "
                        "skipping version %s", name, version)
                    continue
                except StopIteration:
                    self.log.info("Scheduling #%d.", self.count.min + idx)
                    self.analyses_selinon_flow(name, version)
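The `next(iter(... .limit(1)))` dance above is an existence check: boto3 collections are lazy, so limiting the filtered listing to one object and pulling it either yields a key (some version was already analysed) or raises `StopIteration`. Isolated as a hypothetical helper:

    def any_object_with_prefix(bucket, prefix):
        """True if at least one S3 object exists under prefix
        (assumes a boto3 Bucket; same idiom as above)."""
        try:
            next(iter(bucket.objects.filter(Prefix=prefix).limit(1)))
            return True
        except StopIteration:
            return False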
Example #19
    def execute(self):
        """Check the maven index for new releases and schedule analyses."""
        self.log.info("Checking maven index for new releases")
        maven_index_checker_dir = os.getenv('MAVEN_INDEX_CHECKER_PATH')
        target_dir = os.path.join(maven_index_checker_dir, 'target')
        central_index_dir = os.path.join(target_dir, 'central-index')
        timestamp_path = os.path.join(central_index_dir, 'timestamp')

        s3 = StoragePool.get_connected_storage('S3MavenIndex')
        self.log.info('Fetching pre-built maven index from S3, if available.')
        s3.retrieve_index_if_exists(target_dir)

        old_timestamp = 0
        try:
            old_timestamp = int(os.stat(timestamp_path).st_mtime)
        except OSError:
            self.log.info(
                'Timestamp is missing, we need to build the index from scratch.'
            )

        last_offset = s3.get_last_offset()
        with tempdir() as java_temp_dir:
            cmd = [
                'java', '-Xmx768m',
                '-Djava.io.tmpdir={}'.format(java_temp_dir), '-jar',
                'maven-index-checker.jar', '-c'
            ]

            with cwd(maven_index_checker_dir):
                output = TimedCommand.get_command_output(cmd,
                                                         is_json=True,
                                                         graceful=False,
                                                         timeout=1200)

                current_count = output['count']
                new_timestamp = int(os.stat(timestamp_path).st_mtime)
                if old_timestamp != new_timestamp:
                    self.log.info('Storing pre-built maven index to S3...')
                    s3.store_index(target_dir)
                    self.log.debug('Stored. Index in S3 is up-to-date.')
                    if old_timestamp == 0:
                        s3.set_last_offset(current_count)
                        self.log.info(
                            'This is first run, i.e. all packages are considered new. '
                            'Skipping scheduling to not analyze all packages in index.'
                        )
                        return
                else:
                    self.log.info('Index in S3 is up-to-date.')

                self.log.debug(
                    "Number of entries in maven indexer: %d, "
                    "last offset used: %d", current_count, last_offset)
                to_schedule_count = current_count - last_offset
                if to_schedule_count == 0:
                    self.log.info("No new packages to schedule, exiting...")
                    return

                cmd = [
                    'java', '-Xmx768m',
                    '-Djava.io.tmpdir={}'.format(java_temp_dir), '-jar',
                    'maven-index-checker.jar', '-r',
                    '0-{}'.format(to_schedule_count)
                ]
                output = TimedCommand.get_command_output(cmd,
                                                         is_json=True,
                                                         graceful=False,
                                                         timeout=1200)

                self.log.info(
                    "Found %d new packages to analyse, scheduling analyses...",
                    len(output))
                for entry in output:
                    self.run_selinon_flow(
                        'bayesianFlow', {
                            'ecosystem': 'maven',
                            'name': '{groupId}:{artifactId}'.format(**entry),
                            'version': entry['version'],
                            'recursive_limit': 0
                        })

        s3.set_last_offset(current_count)
        self.log.info(
            "All new maven releases scheduled for analysis, exiting..")
Example #20
    def execute(self):
        """Start the analysis."""
        self.log.info("Checking maven index for new releases")
        maven_index_checker_dir = os.getenv('MAVEN_INDEX_CHECKER_PATH')
        maven_index_checker_data_dir = os.environ.get(
            'MAVEN_INDEX_CHECKER_DATA_PATH', '/tmp/index-checker')
        os.makedirs(maven_index_checker_data_dir, exist_ok=True)
        central_index_dir = os.path.join(maven_index_checker_data_dir,
                                         'central-index')
        timestamp_path = os.path.join(central_index_dir, 'timestamp')

        self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses",
                                                "maven_index_checker_dir",
                                                maven_index_checker_dir))
        self.log.info("{}__:__{}__:__{}".format(
            "MavenReleasesAnalyses", "maven_index_checker_data_dir",
            maven_index_checker_data_dir))
        self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses",
                                                "central_index_dir",
                                                central_index_dir))
        self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses",
                                                "timestamp_path",
                                                timestamp_path))

        s3 = StoragePool.get_connected_storage('S3MavenIndex')
        self.log.info('Fetching pre-built maven index from S3, if available.')
        s3.retrieve_index_if_exists(maven_index_checker_data_dir)

        old_timestamp = 0
        try:
            old_timestamp = int(os.stat(timestamp_path).st_mtime)
        except OSError:
            self.log.info(
                'Timestamp is missing, we need to build the index from scratch.'
            )

        last_offset = s3.get_last_offset()
        self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses",
                                                "last_offset", last_offset))

        java_temp_dir = tempfile.mkdtemp(prefix='tmp-',
                                         dir=os.environ.get('PV_DIR', '/tmp'))

        cmd = [
            'java', '-Xmx768m', '-Djava.io.tmpdir={}'.format(java_temp_dir),
            '-DcentralIndexDir={}'.format(central_index_dir), '-jar',
            'maven-index-checker.jar', '-c'
        ]
        self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses",
                                                "cmd1", cmd))

        with cwd(maven_index_checker_dir):
            try:
                output = TimedCommand.get_command_output(cmd,
                                                         is_json=True,
                                                         graceful=False,
                                                         timeout=10800)
                self.log.info("{}__:__{}__:__{}".format(
                    "MavenReleasesAnalyses", "output", output))

                current_count = output['count']
                new_timestamp = int(os.stat(timestamp_path).st_mtime)
                if old_timestamp != new_timestamp:
                    self.log.info('Storing pre-built maven index to S3...')
                    s3.store_index(maven_index_checker_data_dir)
                    self.log.debug('Stored. Index in S3 is up-to-date.')
                    if old_timestamp == 0:
                        s3.set_last_offset(current_count)
                        self.log.info(
                            'This is first run, i.e. all packages are considered new. '
                            'Skipping scheduling to not analyze all packages in index.'
                        )
                        return
                else:
                    self.log.info('Index in S3 is up-to-date.')

                self.log.debug(
                    "Number of entries in maven indexer: %d, "
                    "last offset used: %d", current_count, last_offset)
                to_schedule_count = current_count - last_offset
                if to_schedule_count == 0:
                    self.log.info("No new packages to schedule, exiting...")
                    return

                cmd = [
                    'java', '-Xmx768m',
                    '-Djava.io.tmpdir={}'.format(java_temp_dir),
                    '-DcentralIndexDir={}'.format(central_index_dir), '-jar',
                    'maven-index-checker.jar', '-r',
                    '0-{}'.format(to_schedule_count)
                ]
                self.log.info("{}__:__{}__:__{}".format(
                    "MavenReleasesAnalyses", "cmd2", cmd))

                output = TimedCommand.get_command_output(cmd,
                                                         is_json=True,
                                                         graceful=False,
                                                         timeout=10800)
                self.log.info("{}__:__{}__:__{}".format(
                    "MavenReleasesAnalyses", "output", output))

            except TaskError as e:
                self.log.info("{}__:__{}__:__{}".format(
                    "MavenReleasesAnalyses", "TaskError", e))
                self.log.exception(e)
                raise
            finally:
                rmtree(central_index_dir)
                self.log.debug('central-index/ deleted')
                rmtree(java_temp_dir)

            self.log.info(
                "Found %d new packages to analyse, scheduling analyses...",
                len(output))
            for entry in output:
                self.log.info("{}__:__{}__:__{}".format(
                    "MavenReleasesAnalyses", "Running ingestion for", entry))
                self.run_selinon_flow(
                    'bayesianFlow', {
                        'ecosystem': 'maven',
                        'name': '{groupId}:{artifactId}'.format(**entry),
                        'version': entry['version'],
                        'recursive_limit': 0
                    })

        self.log.info("{}__:__{}__:__{}".format("MavenReleasesAnalyses",
                                                "current_count",
                                                current_count))
        s3.set_last_offset(current_count)
        self.log.info(
            "All new maven releases scheduled for analysis, exiting..")