Example #1
def analyses(app):
    """Prepare the known set of data used by tests."""
    e1 = Ecosystem(name='npm', backend=EcosystemBackend.npm)
    p1 = Package(ecosystem=e1, name='arrify')
    v1 = Version(package=p1, identifier='1.0.1')
    model1 = Analysis(version=v1, started_at=now, finished_at=later)
    app.rdb.session.add(model1)

    e2 = Ecosystem(name='pypi', backend=EcosystemBackend.pypi)
    p2 = Package(ecosystem=e2, name='flexmock')
    v2 = Version(package=p2, identifier='0.10.1')
    model2 = Analysis(version=v2, started_at=later, access_count=1)
    app.rdb.session.add(model2)
    app.rdb.session.commit()

    worker_results2 = {'a': 'b', 'c': 'd', 'e': 'f', 'g': 'h', 'i': 'j',
                       'digests': {'details':
                                   [{'artifact': True,
                                     'sha1': '6be7ae55bae2372c7be490321bbe5ead278bb51b'}]}}
    for w, tr in worker_results2.items():
        app.rdb.session.add(WorkerResult(analysis_id=model2.id, worker=w, task_result=tr))

    model3 = Analysis(version=v2, started_at=later, access_count=1,
                      audit={'audit': {'audit': 'audit', 'e': 'f', 'g': 'h'}, 'a': 'b', 'c': 'd'})
    app.rdb.session.add(model3)
    app.rdb.session.commit()
    worker_results3 = {'digests': {'details':
                                   [{'artifact': True,
                                     'sha1': '6be7ae55bae2372c7be490321bbe5ead278bb51b'}]}}
    for w, tr in worker_results3.items():
        app.rdb.session.add(WorkerResult(analysis_id=model3.id, worker=w, task_result=tr))
    app.rdb.session.commit()
    return (model1, model2, model3)
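
A minimal usage sketch for the fixture above, assuming `analyses` is registered as a pytest fixture (an assumption; the decorator is not shown here) so pytest injects its return value:

def test_analyses_fixture_shape(analyses):
    """Illustrative only: check the shapes set up by the fixture above."""
    model1, model2, model3 = analyses

    # model1 is the only analysis created with finished_at set
    assert model1.finished_at is not None

    # model2 and model3 were created with access_count=1
    assert model2.access_count == 1
    assert model3.access_count == 1

    # model3 carries the nested audit payload
    assert model3.audit['a'] == 'b'
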
def fill_analyses(app):
    """Prepare static data used by unit tests."""
    # TODO: cannot find any usage of this function
    ecosystems = [
        Ecosystem(name='pypi', backend=EcosystemBackend.pypi, url='https://pypi.python.org/',
                  fetch_url='https://pypi.python.org/pypi'),
        Ecosystem(name='npm', backend=EcosystemBackend.npm, url='https://www.npmjs.com/',
                  fetch_url='https://registry.npmjs.org/'),
        Ecosystem(name='go', backend=EcosystemBackend.scm),
    ]

    packages = [
        Package(name='flexmock', ecosystem=ecosystems[0]),
        Package(name='requests', ecosystem=ecosystems[0]),
        Package(name='sequence', ecosystem=ecosystems[1]),
        Package(name='arrify', ecosystem=ecosystems[1]),
        Package(name='serve-static', ecosystem=ecosystems[1]),
    ]

    versions = [
        Version(identifier='0.10.1', package=packages[0]),
        Version(identifier='0.9.1', package=packages[0]),
        Version(identifier='2.0.0', package=packages[1]),
        Version(identifier='2.2.1', package=packages[2]),
        Version(identifier='1.0.1', package=packages[3]),
        Version(identifier='1.7.1', package=packages[4]),
    ]

    analyses = [
        Analysis(version=versions[0], started_at=now),                     # pypi/flexmock/0.10.1
        Analysis(version=versions[0], started_at=later, access_count=1),   # pypi/flexmock/0.10.1
        Analysis(version=versions[1], started_at=even_later),              # pypi/flexmock/0.9.1
        Analysis(version=versions[2], started_at=now),                     # pypi/requests/2.0.0
        Analysis(version=versions[3], started_at=later),                   # npm/sequence/2.2.1
        Analysis(version=versions[4], started_at=now, finished_at=later),  # npm/arrify/1.0.1
        Analysis(version=versions[5], started_at=now, finished_at=later,
                 release='npm:serve-static:1.7.1'),                      # npm/serve-static/1.7.1
    ]
    # worker results that correspond to analyses above
    worker_results = [
        WorkerResult(worker='digests', analysis=analyses[1],
                     task_result={'details': [{'artifact': True,
                                               'sha1':
                                               '6be7ae55bae2372c7be490321bbe5ead278bb51b'}]}),
        WorkerResult(worker='static_analysis', task_result={'details': []}, analysis=analyses[1]),
        WorkerResult(worker='source_licenses',
                     task_result={'schema': {'name': 'source_licenses', 'version': '1-0-0'}},
                     analysis=analyses[1])
    ]

    # TODO: just a placeholder, it won't work in real tests!!!
    # An empty list keeps the concatenation below from raising TypeError.
    package_gh_usage = []

    for a in ecosystems + packages + versions + analyses + worker_results + package_gh_usage:
        app.rdb.session.add(a)
        app.rdb.session.commit()

    return (ecosystems, packages, versions, analyses, worker_results, package_gh_usage)
Example #3
def db_results():
    """Mimic SQLAlchemy query result."""
    ecosystem = Ecosystem()
    ecosystem.name = 'maven'
    package = Package()
    package.ecosystem = ecosystem
    package.name = 'net.iharder:base64'
    upstream = Upstream()
    upstream.url = 'https://github.com/omalley/base64'
    upstream.package = package

    return [upstream]
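
A short sketch of how a test might walk the object graph that db_results() builds; it uses only the attributes assigned above:

def test_db_results_shape():
    """Illustrative only: traverse the mimicked query result."""
    upstreams = db_results()
    assert len(upstreams) == 1

    u = upstreams[0]
    assert u.url == 'https://github.com/omalley/base64'
    assert u.package.name == 'net.iharder:base64'
    assert u.package.ecosystem.name == 'maven'
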
Example #4
    def retrieve_bookkeeping_for_ecosystem_package(self, ecosystem, package):
        """Retrieve BookKeeping data for given Package and Ecosystem.

        :param ecosystem: ecosystem for which the data should be retrieved
        :param package: package for which the data should be retrieved
        """
        e = Ecosystem.by_name(self.db, ecosystem)
        p = Package.by_name(self.db, package)

        stat = self.db.query(PackageWorkerResult).\
            join(PackageAnalysis).\
            filter(PackageAnalysis.package == p)
        worker_stats = []
        for package_worker_result in stat.all():
            entry = {"worker_name": package_worker_result.worker,
                     "has_error": package_worker_result.error,
                     "task_result": package_worker_result.task_result,
                     "started_at": package_worker_result.started_at,
                     "ended_at": package_worker_result.ended_at}
            worker_stats.append(entry)

        version_count = self.db.query(Version).join(Package).\
            filter(Package.ecosystem == e).\
            filter(Version.package == p).count()
        p_versions = self.db.query(Version).join(Package).join(Ecosystem).\
            filter(Package.ecosystem == e).\
            filter(Version.package == p)

        return {"ecosystem": e.name,
                "package": p.name,
                "package_version_count": version_count,
                "package_level_workers": worker_stats,
                "analysed_versions": [v.identifier for v in p_versions]}
Example #5
    def retrieve_bookkeeping_for_epv(self, ecosystem, package, version):
        """Retrieve BookKeeping data for the given ecosystem, package, and version.

        :param ecosystem: ecosystem for which the data should be retrieved
        :param package: package for which the data should be retrieved
        :param version: package version for which the data should be retrieved
        """
        e = Ecosystem.by_name(self.db, ecosystem)
        p = Package.by_name(self.db, package)
        v = self.db.query(Version).join(Package).join(Ecosystem). \
            filter(Package.ecosystem == e). \
            filter(Version.package == p). \
            filter(Version.identifier == version).one()

        stat = self.db.query(WorkerResult).\
            join(Analysis).join(Version).\
            filter(Analysis.version == v)
        worker_stats = []
        for worker_result in stat.all():
            entry = {"worker_name": worker_result.worker,
                     "has_error": worker_result.error,
                     "task_result": worker_result.task_result,
                     "started_at": worker_result.started_at,
                     "ended_at": worker_result.ended_at}
            worker_stats.append(entry)

        return {"ecosystem": e.name,
                "package": p.name,
                "version": v.identifier,
                "workers": worker_stats}
def fill_packages_for_paging(app, request):
    """Create and store set of packages used by unit tests."""
    e = Ecosystem(name='pypi', backend=EcosystemBackend.pypi)
    app.rdb.session.add(e)
    for p in range(0, 11):
        app.rdb.session.add(Package(ecosystem=e, name=str(p)))

    app.rdb.session.commit()

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('ecosystem'))

        # get rid of version if scheduled from the core analyses
        arguments.pop('version', None)
        arguments.pop('document_id', None)

        db = self.storage.session
        try:
            ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        except NoResultFound:
            raise FatalTaskError('Unknown ecosystem: %r' %
                                 arguments['ecosystem'])
        package = Package.get_or_create(db,
                                        ecosystem_id=ecosystem.id,
                                        name=arguments['name'])
        url = self.get_upstream_url(arguments)
        upstream = self.get_upstream_entry(package, url)
        if upstream is None:
            upstream = self.add_or_update_upstream(package, url)
        arguments['url'] = upstream.url

        if not arguments.get('force'):
            # without a lock we could schedule two flows of the same type
            # at the same time, but let's accept that risk
            if upstream.updated_at is not None \
                    and datetime.datetime.utcnow() - upstream.updated_at < self._UPDATE_INTERVAL:
                self.log.info(
                    'Skipping upstream package check as the data is considered recent - '
                    'last update %s.', upstream.updated_at)
                # keep track of the start, but do not schedule anything more;
                # discard pending changes such as updates
                db.rollback()
                return arguments

        # if this fails it's actually OK, as concurrent flows may race here
        package_analysis = PackageAnalysis(
            package_id=package.id,
            started_at=datetime.datetime.utcnow(),
            finished_at=None)
        db.add(package_analysis)

        # keep track of updates
        upstream.updated_at = datetime.datetime.utcnow()

        db.commit()
        arguments['document_id'] = package_analysis.id
        return arguments
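
The freshness guard in the task above reduces to plain timedelta arithmetic. A self-contained sketch of the same check, with a hypothetical interval value:

import datetime

_UPDATE_INTERVAL = datetime.timedelta(days=1)  # hypothetical value

def is_recent(updated_at, now=None):
    """Return True when the last update happened within _UPDATE_INTERVAL."""
    if updated_at is None:
        return False  # never updated, so never recent
    now = now or datetime.datetime.utcnow()
    return now - updated_at < _UPDATE_INTERVAL

# An update two hours ago counts as recent under a one-day interval.
assert is_recent(datetime.datetime.utcnow() - datetime.timedelta(hours=2))
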
def retrieve_bookkeeping_for_ecosystem_package(ecosystem, package):
    """Retrieve BookKeeping data for given Package and Ecosystem.

    :param ecosystem: ecosystem for which the data should be retrieved
    :param package: package for which the data should be retrieved
    """
    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    db = rdb.session

    try:
        e = Ecosystem.by_name(db, ecosystem)
        p = Package.by_name(db, package)

        version_count = _count(
            db,
            db.query(Version).join(Package).filter(
                Package.ecosystem == e).filter(Version.package == p))

        stat = db.query(PackageWorkerResult.worker, PackageWorkerResult.error,
                        PackageWorkerResult.task_result).join(PackageAnalysis). \
            filter(PackageAnalysis.package == p). \
            all()

        worker_stats = []
        for worker_name, has_error, task_result in stat:
            entry = {
                "worker_name": worker_name,
                "has_error": has_error,
                "task_result": task_result
            }
            worker_stats.append(entry)

        p_versions = db.query(Version).join(Package).join(Ecosystem). \
            filter(Package.ecosystem == e). \
            filter(Version.package == p)

        result = {
            "summary": {
                "ecosystem": e.name,
                "package": p.name,
                "package_version_count": version_count,
                "package_level_workers": worker_stats,
                "analysed_versions": [v.identifier for v in p_versions]
            }
        }
    except NoResultFound:
        result = {"error": "No such package: %s/%s" % (ecosystem, package)}
    except SQLAlchemyError:
        result = {
            "error":
            "Error encountered while fetching data. Please check logs."
        }
    return result

def retrieve_bookkeeping_for_epv(ecosystem, package, version):
    """Retrieve BookKeeping data for the given ecosystem, package, and version.

    :param ecosystem: ecosystem for which the data should be retrieved
    :param package: package for which the data should be retrieved
    :param version: package version for which the data should be retrieved
    """
    rdb = StoragePool.get_connected_storage('BayesianPostgres')
    db = rdb.session
    try:
        e = Ecosystem.by_name(db, ecosystem)
        p = Package.by_name(db, package)
        v = db.query(Version).join(Package).join(Ecosystem). \
            filter(Package.ecosystem == e). \
            filter(Version.package == p). \
            filter(Version.identifier == version).one()

        stat = db.query(WorkerResult.worker, WorkerResult.error, WorkerResult.task_result). \
            join(Analysis).join(Version). \
            filter(Analysis.version == v).all()

        worker_stats = []
        for worker_name, has_error, task_result in stat:
            entry = {
                "worker_name": worker_name,
                "has_error": has_error,
                "task_result": task_result
            }
            worker_stats.append(entry)

        result = {
            "summary": {
                "ecosystem": e.name,
                "package": p.name,
                "version": v.identifier,
                "workers": worker_stats
            }
        }
    except NoResultFound:
        return {
            "error":
            "No such version: %s/%s/%s" % (ecosystem, package, version)
        }
    except SQLAlchemyError:
        result = {
            "error":
            "Error encountered while fetching data. Please check logs."
        }
    return result

    def setup_method(self, method):
        """Set up the DB fixtures shared by the tests in this class."""
        rdb()
        self.s = create_db_scoped_session()
        self.en = 'foo'
        self.pn = 'bar'
        self.vi = '1.1.1'
        self.e = Ecosystem(name=self.en, backend=EcosystemBackend.maven)
        self.p = Package(ecosystem=self.e, name=self.pn)
        self.v = Version(package=self.p, identifier=self.vi)
        self.a = Analysis(version=self.v, finished_at=datetime.datetime.now())
        self.a2 = Analysis(version=self.v,
                           finished_at=datetime.datetime.now() +
                           datetime.timedelta(seconds=10))
        self.s.add(self.a)
        self.s.add(self.a2)
        self.s.commit()

        self.bp = BayesianPostgres(
            connection_string=configuration.POSTGRES_CONNECTION)

    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('ecosystem'))

        # get rid of version if scheduled from the core analyses
        arguments.pop('version', None)

        db = self.storage.session
        ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        package = Package.get_or_create(db,
                                        ecosystem_id=ecosystem.id,
                                        name=arguments['name'])
        upstream = self.get_upstream_entry(db, package,
                                           self.get_upstream_url(arguments))
        arguments['url'] = upstream.url

        if not arguments.get('force'):
            # without a lock we could schedule two flows of the same type
            # at the same time, but let's accept that risk
            # note: now - updated_at, so the age is a positive timedelta
            if upstream.updated_at is not None \
                    and datetime.datetime.now() - upstream.updated_at < self._UPDATE_INTERVAL:
                self.log.info(
                    'Skipping upstream package check as the data is considered recent - '
                    'last update %s.', upstream.updated_at)
                # keep track of the start, but do not schedule anything more;
                # discard pending changes such as updates
                db.rollback()
                return arguments

        # if this fails it's actually OK, as concurrent flows may race here
        package_analysis = PackageAnalysis(package_id=package.id,
                                           started_at=datetime.datetime.now(),
                                           finished_at=None)
        db.add(package_analysis)

        # keep track of updates
        upstream.updated_at = datetime.datetime.now()

        db.commit()
        arguments['document_id'] = package_analysis.id
        return arguments
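
The order of the subtraction in the freshness check matters: for a past timestamp, now - updated_at is a positive age, whereas updated_at - now is negative and compares smaller than any positive interval, which would make every record look recent. A quick standard-library illustration:

import datetime

now = datetime.datetime.now()
past = now - datetime.timedelta(hours=3)
interval = datetime.timedelta(days=1)

assert now - past == datetime.timedelta(hours=3)  # positive age
assert (past - now) < interval   # negative timedelta: always "recent" (wrong order)
assert (now - past) < interval   # the intended check
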
    def test_f8a_fetcher(self, rdb, npm):
        """Test F8aReleasesFetcher."""
        # create initial dataset
        package = Package(ecosystem=npm, name='f8a')
        rdb.add(package)
        rdb.commit()
        versions = {
            '0.5.0', '0.5.1', '0.6.0', '0.6.4', '0.7.0', '0.8.0', '0.9.0',
            '1.0.0', '1.0.5'
        }
        for v in versions:
            version = Version(package=package, identifier=v)
            rdb.add(version)
            rdb.commit()
            analysis = Analysis(version=version)
            # Fetcher only selects finished analyses
            analysis.finished_at = datetime.datetime.utcnow()
            rdb.add(analysis)
            rdb.commit()

        f = F8aReleasesFetcher(npm, rdb)

        r = f.fetch_releases('f8a')[1]

        # make sure we fetched the same stuff we inserted
        assert set(r) == versions

        # releases come back sorted ascending, so the last element is the latest
        assert r.pop() == '1.0.5'

        # try different dependency specs
        s = get_ecosystem_solver(npm, with_fetcher=f)
        assert s.solve(['f8a ^0.5.0'])['f8a'] == '0.5.1'
        assert s.solve(['f8a 0.x.x'])['f8a'] == '0.9.0'
        assert s.solve(['f8a >1.0.0'])['f8a'] == '1.0.5'
        assert s.solve(['f8a ~>0.6.0'])['f8a'] == '0.6.4'

        # check that with `all_versions` we return all the relevant ones
        assert set(s.solve(['f8a >=0.6.0'], all_versions=True)['f8a']) == \
            (versions - {'0.5.0', '0.5.1'})
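
The dependency specs exercised by the solver above follow npm range semantics. A rough standard-library sketch (not the solver's actual implementation) of why `^0.5.0` admits 0.5.1 but rejects 0.6.0:

def parse(version):
    """Split 'MAJOR.MINOR.PATCH' into an int tuple for comparison."""
    return tuple(int(part) for part in version.split('.'))

def caret_match(spec, candidate):
    """Rough sketch of npm's caret range: for a 0.MINOR.PATCH base only the
    patch component may float; for 1.x and above the major is pinned.
    Illustrative only, not exhaustive."""
    base, cand = parse(spec), parse(candidate)
    if base[0] == 0:
        return cand[:2] == base[:2] and cand >= base
    return cand[0] == base[0] and cand >= base

assert caret_match('0.5.0', '0.5.1')      # patch bump on 0.x: allowed
assert not caret_match('0.5.0', '0.6.0')  # minor bump on 0.x: rejected
assert caret_match('1.0.0', '1.0.5')      # patch bump on 1.x: allowed
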
Example #14
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self.log.debug("Input Arguments: {}".format(arguments))
        self._strict_assert(arguments.get('name'))
        self._strict_assert(arguments.get('version'))
        self._strict_assert(arguments.get('ecosystem'))

        # store the package name normalized per the ecosystem's case-sensitivity rules
        arguments['name'] = normalize_package_name(arguments['ecosystem'], arguments['name'])

        db = self.storage.session
        try:
            ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        except NoResultFound:
            raise FatalTaskError('Unknown ecosystem: %r' % arguments['ecosystem'])

        p = Package.get_or_create(db, ecosystem_id=ecosystem.id, name=arguments['name'])
        v = Version.get_or_create(db, package_id=p.id, identifier=arguments['version'])

        if not arguments.get('force'):
            # TODO: this is OK for now, but if we scale out and 2+ workers run
            # this task, they could potentially schedule two flows of the same
            # type at the same time
            if db.query(Analysis).filter(Analysis.version_id == v.id).count() > 0:
                # we need to propagate flags that were passed to flow, but not
                # E/P/V - this way we are sure that for example graph import is
                # scheduled (arguments['force_graph_sync'] == True)
                arguments.pop('name')
                arguments.pop('version')
                arguments.pop('ecosystem')
                self.log.debug("Arguments returned by initAnalysisFlow without force: {}"
                               .format(arguments))
                return arguments

        cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
        epv_cache = ObjectCache.get_from_dict(arguments)

        try:
            if not epv_cache.has_source_tarball():
                _, source_tarball_path = IndianaJones.fetch_artifact(
                    ecosystem=ecosystem,
                    artifact=arguments['name'],
                    version=arguments['version'],
                    target_dir=cache_path
                )
                epv_cache.put_source_tarball(source_tarball_path)

            if ecosystem.is_backed_by(EcosystemBackend.maven):
                if not epv_cache.has_source_jar():
                    try:
                        source_jar_path = self._download_source_jar(cache_path, ecosystem,
                                                                    arguments)
                        epv_cache.put_source_jar(source_jar_path)
                    except Exception as e:
                        self.log.info(
                            'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'.
                            format(n=arguments.get('name'),
                                   v=arguments.get('version'),
                                   err=str(e))
                        )

                if not epv_cache.has_pom_xml():
                    pom_xml_path = self._download_pom_xml(cache_path, ecosystem, arguments)
                    epv_cache.put_pom_xml(pom_xml_path)
        finally:
            # always clean up cache
            shutil.rmtree(cache_path)

        a = Analysis(version=v, access_count=1, started_at=datetime.datetime.utcnow())
        db.add(a)
        db.commit()

        arguments['document_id'] = a.id

        # export ecosystem backend so we can use it to easily control flow later
        arguments['ecosystem_backend'] = ecosystem.backend.name

        self.log.debug("Arguments returned by InitAnalysisFlow are: {}".format(arguments))
        return arguments
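
The mkdtemp/try/finally pairing above is the standard way to guarantee that a scratch directory is removed even when fetching fails part-way. A self-contained sketch of the same pattern:

import shutil
import tempfile

def with_scratch_dir(work):
    """Run work(path) in a fresh temporary directory, always cleaning up."""
    cache_path = tempfile.mkdtemp()
    try:
        return work(cache_path)
    finally:
        # as in the task above, the cache is removed on success and failure alike
        shutil.rmtree(cache_path)
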
    def execute(self, arguments):
        """Task code.

        :param arguments: dictionary with task arguments
        :return: {}, results
        """
        self.log.debug("Input Arguments: {}".format(arguments))
        self._strict_assert(isinstance(arguments.get('ecosystem'), str))
        self._strict_assert(isinstance(arguments.get('name'), str))
        self._strict_assert(isinstance(arguments.get('version'), str))

        db = self.storage.session
        try:
            ecosystem = Ecosystem.by_name(db, arguments['ecosystem'])
        except NoResultFound:
            raise FatalTaskError('Unknown ecosystem: %r' %
                                 arguments['ecosystem'])

        # make sure we store package name in its normalized form
        arguments['name'] = normalize_package_name(ecosystem.backend.name,
                                                   arguments['name'])

        # `pattern_ignore` is assumed to be a module-level compiled regex that
        # matches malformed version strings
        if pattern_ignore.findall(arguments['version']):
            self.log.info("Incorrect version alert {} {}".format(
                arguments['name'], arguments['version']))
            raise NotABugFatalTaskError("Incorrect version alert {} {}".format(
                arguments['name'], arguments['version']))

        # Don't attempt ingestion for private packages
        if is_pkg_public(arguments['ecosystem'], arguments['name']):
            self.log.info("Ingestion flow for {} {}".format(
                arguments['ecosystem'], arguments['name']))
        else:
            self.log.info("Private package ingestion ignored {} {}".format(
                arguments['ecosystem'], arguments['name']))
            raise NotABugFatalTaskError("Private package alert {} {}".format(
                arguments['ecosystem'], arguments['name']))

        p = Package.get_or_create(db,
                                  ecosystem_id=ecosystem.id,
                                  name=arguments['name'])
        v = Version.get_or_create(db,
                                  package_id=p.id,
                                  identifier=arguments['version'])

        if not arguments.get('force'):
            if db.query(Analysis).filter(
                    Analysis.version_id == v.id).count() > 0:
                arguments['analysis_already_exists'] = True
                self.log.debug(
                    "Arguments returned by InitAnalysisFlow without force: {}".
                    format(arguments))
                return arguments

        cache_path = mkdtemp(dir=self.configuration.WORKER_DATA_DIR)
        epv_cache = ObjectCache.get_from_dict(arguments)
        npm_dir = self.configuration.NPM_DATA_DIR

        try:
            if not epv_cache.has_source_tarball():
                _, source_tarball_path = IndianaJones.fetch_artifact(
                    ecosystem=ecosystem,
                    artifact=arguments['name'],
                    version=arguments['version'],
                    target_dir=cache_path)
                epv_cache.put_source_tarball(source_tarball_path)

            if ecosystem.is_backed_by(EcosystemBackend.maven):
                if not epv_cache.has_source_jar():
                    try:
                        source_jar_path = self._download_source_jar(
                            cache_path, ecosystem, arguments)
                        epv_cache.put_source_jar(source_jar_path)
                    except Exception as e:
                        self.log.info(
                            'Failed to fetch source jar for maven artifact "{n}/{v}": {err}'
                            .format(n=arguments.get('name'),
                                    v=arguments.get('version'),
                                    err=str(e)))

                if not epv_cache.has_pom_xml():
                    pom_xml_path = self._download_pom_xml(
                        cache_path, ecosystem, arguments)
                    epv_cache.put_pom_xml(pom_xml_path)
        finally:
            # always clean up cache
            shutil.rmtree(cache_path)
            if arguments['ecosystem'] == "npm":
                shutil.rmtree(npm_dir, True)

        a = Analysis(version=v,
                     access_count=1,
                     started_at=datetime.datetime.utcnow())
        db.add(a)
        db.commit()

        arguments['document_id'] = a.id

        # export ecosystem backend so we can use it to easily control flow later
        arguments['ecosystem_backend'] = ecosystem.backend.name

        self.log.debug(
            "Arguments returned by InitAnalysisFlow are: {}".format(arguments))
        return arguments