    def setUp(self):
        self.s3_client = S3Client()

        config_json = os.getenv('ACCEPTANCE_TEST_CONFIG')
        try:
            with open(config_json, 'r') as config_json_file:
                self.config = json.load(config_json_file)
        except (IOError, TypeError):
            try:
                self.config = json.loads(config_json)
            except TypeError:
                self.config = {}
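        # ACCEPTANCE_TEST_CONFIG may point at a JSON file or contain the JSON text
        # itself; if the variable is unset, the config falls back to an empty dict.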

        # The name of an existing job flow to run the test on
        assert ('job_flow_name' in self.config)
        # The git URL of the pipeline repository to check this code out from.
        assert ('tasks_repo' in self.config)
        # The branch of the pipeline repository to test. Note this can differ from the branch that is currently
        # checked out and running this code.
        assert ('tasks_branch' in self.config)
        # Where to store logs generated by the pipeline
        assert ('tasks_log_path' in self.config)
        # The user to connect to the job flow over SSH with.
        assert ('connection_user' in self.config)
        # Where the pipeline should output data, should be a URL pointing to a directory.
        assert ('tasks_output_url' in self.config)
        # Allow for parallel execution of the test by specifying a different identifier. Using an identical identifier
        # allows for old virtualenvs to be reused etc, which is why a random one is not simply generated with each run.
        assert ('identifier' in self.config)
        # A URL to a JSON file that contains most of the connection information for the MySQL database.
        assert ('credentials_file_url' in self.config)
        # A URL to a JSON file that contains most of the connection information for the Vertica database.
        assert ('vertica_creds_url' in self.config)
        # A URL to a build of the oddjob third party library
        assert 'oddjob_jar' in self.config
        # A URL to a maxmind compatible geolocation database file
        assert 'geolocation_data' in self.config
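        # A minimal sketch of the JSON expected in ACCEPTANCE_TEST_CONFIG, covering the
        # keys asserted above (all values below are hypothetical placeholders):
        #
        #   {
        #       "job_flow_name": "acceptance-test-cluster",
        #       "tasks_repo": "https://example.com/edx/edx-analytics-pipeline.git",
        #       "tasks_branch": "master",
        #       "tasks_log_path": "/var/log/analytics-tasks/",
        #       "connection_user": "hadoop",
        #       "tasks_output_url": "s3://example-bucket/acceptance/",
        #       "identifier": "local-dev",
        #       "credentials_file_url": "s3://example-bucket/mysql-creds.json",
        #       "vertica_creds_url": "s3://example-bucket/vertica-creds.json",
        #       "oddjob_jar": "s3://example-bucket/oddjob.jar",
        #       "geolocation_data": "s3://example-bucket/geolocation.dat"
        #   }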

        self.data_dir = os.path.join(os.path.dirname(__file__), 'fixtures')

        url = self.config['tasks_output_url']
        m = hashlib.md5()
        m.update(self.config['identifier'].encode('utf-8'))  # md5 requires bytes on Python 3
        self.identifier = m.hexdigest()
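        # Hashing presumably normalizes the configured identifier into a fixed-length,
        # path-safe hex string before it is embedded in the URLs built below.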
        self.test_root = url_path_join(url, self.identifier,
                                       self.__class__.__name__)

        self.test_src = url_path_join(self.test_root, 'src')
        self.test_out = url_path_join(self.test_root, 'out')

        self.catalog_path = 'http://acceptance.test/api/courses/v2'
        database_name = 'test_' + self.identifier
        schema = 'test_' + self.identifier
        import_database_name = 'import_' + database_name
        export_database_name = 'export_' + database_name
        self.warehouse_path = url_path_join(self.test_root, 'warehouse')
        task_config_override = {
            'hive': {
                'database': database_name,
                'warehouse_path': self.warehouse_path
            },
            'map-reduce': {
                'marker': url_path_join(self.test_root, 'marker')
            },
            'manifest': {
                'path': url_path_join(self.test_root, 'manifest'),
                'lib_jar': self.config['oddjob_jar']
            },
            'database-import': {
                'credentials': self.config['credentials_file_url'],
                'destination': self.warehouse_path,
                'database': import_database_name
            },
            'database-export': {
                'credentials': self.config['credentials_file_url'],
                'database': export_database_name
            },
            'vertica-export': {
                'credentials': self.config['vertica_creds_url'],
                'schema': schema
            },
            'course-catalog': {
                'catalog_path': self.catalog_path
            },
            'geolocation': {
                'geolocation_data': self.config['geolocation_data']
            },
            'event-logs': {
                'source': self.test_src
            },
            'course-structure': {
                'api_root_url': 'acceptance.test',
                'access_token': 'acceptance'
            }
        }

        log.info('Running test: %s', self.id())
        log.info('Using executor: %s', self.config['identifier'])
        log.info('Generated Test Identifier: %s', self.identifier)

        self.import_db = db.DatabaseService(self.config, import_database_name)
        self.export_db = db.DatabaseService(self.config, export_database_name)
        self.task = task.TaskService(self.config, task_config_override,
                                     self.identifier)
        self.vertica = vertica.VerticaService(self.config, schema)
        self.hive = hive.HiveService(self.task, self.config, database_name)

        self.reset_external_state()
    def setUp(self):
        try:
            self.s3_client = ScalableS3Client()
        except Exception:
            self.s3_client = None

        self.config = get_test_config()

        for env_var in ('TASKS_REPO', 'TASKS_BRANCH', 'IDENTIFIER',
                        'JOB_FLOW_NAME', 'IS_REMOTE'):
            if env_var in os.environ:
                self.config[env_var.lower()] = os.environ[env_var]

        if 'is_remote' in self.config:
            self.config['is_remote'] = self.config['is_remote'].lower() not in ('0', 'false', 'f')
        else:
            self.config['is_remote'] = True
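        # e.g. IS_REMOTE=false, IS_REMOTE=0 or IS_REMOTE=F (any case) disables remote
        # execution; any other value, or leaving the variable unset, keeps it enabled.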

        if self.config['is_remote']:
            # The name of an existing job flow to run the test on
            assert ('job_flow_name' in self.config or 'host' in self.config)
            # The git URL of the pipeline repository to check this code out from.
            assert ('tasks_repo' in self.config)
            # The branch of the pipeline repository to test. Note this can differ from the branch that is currently
            # checked out and running this code.
            assert ('tasks_branch' in self.config)
            # Where to store logs generated by the pipeline
            assert ('tasks_log_path' in self.config)
            # The user to connect to the job flow over SSH with.
            assert ('connection_user' in self.config)

        # Where the pipeline should output data, should be a URL pointing to a directory.
        assert ('tasks_output_url' in self.config)
        # Allow for parallel execution of the test by specifying a different identifier. Using an identical identifier
        # allows for old virtualenvs to be reused etc, which is why a random one is not simply generated with each run.
        assert ('identifier' in self.config)
        # A URL to a JSON file that contains most of the connection information for the MySQL database.
        assert ('credentials_file_url' in self.config)
        # A URL to a build of the oddjob third party library
        assert 'oddjob_jar' in self.config
        # A URL to a maxmind compatible geolocation database file
        assert 'geolocation_data' in self.config

        self.data_dir = os.path.join(os.path.dirname(__file__), 'fixtures')

        url = self.config['tasks_output_url']
        m = hashlib.md5()
        m.update(self.config['identifier'].encode('utf-8'))  # md5 requires bytes on Python 3
        self.identifier = m.hexdigest()
        self.test_root = url_path_join(url, self.identifier,
                                       self.__class__.__name__)

        self.test_src = url_path_join(self.test_root, 'src')
        self.test_out = url_path_join(self.test_root, 'out')

        # Use a local dir for devstack testing, or S3 for production testing.
        self.report_output_root = self.config.get(
            'report_output_root', url_path_join(self.test_out, 'reports'))
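        # e.g. a devstack run might set report_output_root to a hypothetical local path
        # such as '/tmp/acceptance-reports'; when unset it defaults to <test_out>/reports.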

        self.catalog_path = 'http://acceptance.test/api/courses/v2'
        database_name = 'test_' + self.identifier
        schema = 'test_' + self.identifier
        import_database_name = 'acceptance_import_' + database_name
        export_database_name = 'acceptance_export_' + database_name
        otto_database_name = 'acceptance_otto_' + database_name
        elasticsearch_alias = 'alias_test_' + self.identifier
        self.warehouse_path = url_path_join(self.test_root, 'warehouse')
        self.edx_rest_api_cache_root = url_path_join(self.test_src,
                                                     'edx-rest-api-cache')
        task_config_override = {
            'hive': {
                'database': database_name,
                'warehouse_path': self.warehouse_path
            },
            'map-reduce': {
                'marker': url_path_join(self.test_root, 'marker')
            },
            'manifest': {
                'path': url_path_join(self.test_root, 'manifest'),
                'lib_jar': self.config['oddjob_jar'],
            },
            'database-import': {
                'credentials': self.config['credentials_file_url'],
                'destination': self.warehouse_path,
                'database': import_database_name
            },
            'database-export': {
                'credentials': self.config['credentials_file_url'],
                'database': export_database_name
            },
            'otto-database-import': {
                'credentials': self.config['credentials_file_url'],
                'database': otto_database_name
            },
            'course-catalog': {
                'catalog_path': self.catalog_path
            },
            'geolocation': {
                'geolocation_data': self.config['geolocation_data']
            },
            'event-logs': {
                'source': as_list_param(self.test_src, escape_quotes=False),
                'pattern': as_list_param(
                    ".*tracking.log-(?P<date>\\d{8}).*\\.gz", escape_quotes=False),
            },
            'segment-logs': {
                'source': as_list_param(self.test_src, escape_quotes=False),
                'pattern': as_list_param(
                    ".*segment.log-(?P<date>\\d{8}).*\\.gz", escape_quotes=False),
            },
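            # For illustration: a file named 'tracking.log-20160101.gz' would match the
            # event-logs pattern above, and 'segment.log-20160101.gz' the segment-logs one.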
            'course-structure': {
                'api_root_url': 'acceptance.test',
                'access_token': 'acceptance'
            },
            'module-engagement': {
                'alias': elasticsearch_alias
            },
            'elasticsearch': {},
            'problem-response': {
                'report_fields':
                    '["username","problem_id","answer_id","location","question","score","max_score",'
                    '"correct","answer","total_attempts","first_attempt_date","last_attempt_date"]',
                'report_field_list_delimiter': '"|"',
                'report_field_datetime_format': '%Y-%m-%dT%H:%M:%SZ',
                'report_output_root': self.report_output_root,
                'partition_format': '%Y-%m-%dT%H',
            },
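            # For illustration: strftime with partition_format '%Y-%m-%dT%H' renders a
            # timestamp as e.g. '2016-01-01T12', and report_field_datetime_format as
            # e.g. '2016-01-01T12:00:00Z'.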
            'edx-rest-api': {
                'client_id': 'oauth_id',
                'client_secret': 'oauth_secret',
                'oauth_username': '******',
                'oauth_password': '******',
                'auth_url': 'http://acceptance.test',
            },
            'course-blocks': {
                'api_root_url': 'http://acceptance.test/api/courses/v1/blocks/',
            },
            'course-list': {
                'api_root_url': 'http://acceptance.test/api/courses/v1/courses/',
            },
        }

        if 'elasticsearch_host' in self.config:
            task_config_override['elasticsearch']['host'] = as_list_param(
                self.config['elasticsearch_host'], escape_quotes=False)
        if 'elasticsearch_connection_class' in self.config:
            task_config_override['elasticsearch']['connection_type'] = (
                self.config['elasticsearch_connection_class'])
        if 'manifest_input_format' in self.config:
            task_config_override['manifest']['input_format'] = (
                self.config['manifest_input_format'])
        if 'hive_version' in self.config:
            task_config_override['hive']['version'] = self.config['hive_version']

        log.info('Running test: %s', self.id())
        log.info('Using executor: %s', self.config['identifier'])
        log.info('Generated Test Identifier: %s', self.identifier)

        self.import_db = db.DatabaseService(self.config, import_database_name)
        self.export_db = db.DatabaseService(self.config, export_database_name)
        self.otto_db = db.DatabaseService(self.config, otto_database_name)
        self.task = task.TaskService(self.config, task_config_override,
                                     self.identifier)
        self.hive = hive.HiveService(self.task, self.config, database_name)
        self.elasticsearch = elasticsearch_service.ElasticsearchService(
            self.config, elasticsearch_alias)

        self.reset_external_state()

        max_diff = os.getenv('MAX_DIFF', None)
        if max_diff is not None:
            if max_diff.lower() == "infinite":
                self.maxDiff = None
            else:
                self.maxDiff = int(max_diff)
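        # e.g. MAX_DIFF=infinite removes unittest's limit on assertion-failure diffs
        # entirely, while MAX_DIFF=10000 caps them at 10000 characters.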
    def setUp(self):
        try:
            self.s3_client = S3Client()
        except Exception:
            self.s3_client = None

        self.config = get_test_config()

        for env_var in ('TASKS_REPO', 'TASKS_BRANCH', 'IDENTIFIER',
                        'JOB_FLOW_NAME'):
            if env_var in os.environ:
                self.config[env_var.lower()] = os.environ[env_var]

        # The name of an existing job flow to run the test on
        assert ('job_flow_name' in self.config or 'host' in self.config)
        # The git URL of the pipeline repository to check this code out from.
        assert ('tasks_repo' in self.config)
        # The branch of the pipeline repository to test. Note this can differ from the branch that is currently
        # checked out and running this code.
        assert ('tasks_branch' in self.config)
        # Where to store logs generated by the pipeline
        assert ('tasks_log_path' in self.config)
        # The user to connect to the job flow over SSH with.
        assert ('connection_user' in self.config)
        # Where the pipeline should output data, should be a URL pointing to a directory.
        assert ('tasks_output_url' in self.config)
        # Allow for parallel execution of the test by specifying a different identifier. Using an identical identifier
        # allows for old virtualenvs to be reused etc, which is why a random one is not simply generated with each run.
        assert ('identifier' in self.config)
        # A URL to a JSON file that contains most of the connection information for the MySQL database.
        assert ('credentials_file_url' in self.config)
        # A URL to a build of the oddjob third party library
        assert 'oddjob_jar' in self.config
        # A URL to a maxmind compatible geolocation database file
        assert 'geolocation_data' in self.config

        self.data_dir = os.path.join(os.path.dirname(__file__), 'fixtures')

        url = self.config['tasks_output_url']
        m = hashlib.md5()
        m.update(self.config['identifier'].encode('utf-8'))  # md5 requires bytes on Python 3
        self.identifier = m.hexdigest()
        self.test_root = url_path_join(url, self.identifier,
                                       self.__class__.__name__)

        self.test_src = url_path_join(self.test_root, 'src')
        self.test_out = url_path_join(self.test_root, 'out')

        self.catalog_path = 'http://acceptance.test/api/courses/v2'
        database_name = 'test_' + self.identifier
        schema = 'test_' + self.identifier
        import_database_name = 'acceptance_import_' + database_name
        export_database_name = 'acceptance_export_' + database_name
        otto_database_name = 'acceptance_otto_' + database_name
        elasticsearch_alias = 'alias_test_' + self.identifier
        self.warehouse_path = url_path_join(self.test_root, 'warehouse')
        task_config_override = {
            'hive': {
                'database': database_name,
                'warehouse_path': self.warehouse_path
            },
            'map-reduce': {
                'marker': url_path_join(self.test_root, 'marker')
            },
            'manifest': {
                'path': url_path_join(self.test_root, 'manifest'),
                'lib_jar': self.config['oddjob_jar']
            },
            'database-import': {
                'credentials': self.config['credentials_file_url'],
                'destination': self.warehouse_path,
                'database': import_database_name
            },
            'database-export': {
                'credentials': self.config['credentials_file_url'],
                'database': export_database_name
            },
            'otto-database-import': {
                'credentials': self.config['credentials_file_url'],
                'database': otto_database_name
            },
            'course-catalog': {
                'catalog_path': self.catalog_path
            },
            'geolocation': {
                'geolocation_data': self.config['geolocation_data']
            },
            'event-logs': {
                'source': self.test_src
            },
            'course-structure': {
                'api_root_url': 'acceptance.test',
                'access_token': 'acceptance'
            },
            'module-engagement': {
                'alias': elasticsearch_alias
            },
            'elasticsearch': {}
        }
        if 'vertica_creds_url' in self.config:
            task_config_override['vertica-export'] = {
                'credentials': self.config['vertica_creds_url'],
                'schema': schema
            }
        if 'elasticsearch_host' in self.config:
            task_config_override['elasticsearch']['host'] = self.config['elasticsearch_host']
        if 'elasticsearch_connection_class' in self.config:
            task_config_override['elasticsearch']['connection_type'] = (
                self.config['elasticsearch_connection_class'])
        if 'manifest_input_format' in self.config:
            task_config_override['manifest']['input_format'] = (
                self.config['manifest_input_format'])
        if 'hive_version' in self.config:
            task_config_override['hive']['version'] = self.config['hive_version']

        log.info('Running test: %s', self.id())
        log.info('Using executor: %s', self.config['identifier'])
        log.info('Generated Test Identifier: %s', self.identifier)

        self.import_db = db.DatabaseService(self.config, import_database_name)
        self.export_db = db.DatabaseService(self.config, export_database_name)
        self.otto_db = db.DatabaseService(self.config, otto_database_name)
        self.task = task.TaskService(self.config, task_config_override,
                                     self.identifier)
        self.hive = hive.HiveService(self.task, self.config, database_name)
        self.vertica = vertica.VerticaService(self.config, schema)
        self.elasticsearch = elasticsearch_service.ElasticsearchService(
            self.config, elasticsearch_alias)

        self.reset_external_state()

        max_diff = os.getenv('MAX_DIFF', None)
        if max_diff is not None:
            if max_diff.lower() == "infinite":
                self.maxDiff = None
            else:
                self.maxDiff = int(max_diff)