def generate_file_list(self):
    """Yield each individual path given a source folder and a set of file-matching expressions."""
    for src in self.src:
        if src.startswith('s3'):
            # connect lazily as needed:
            if self.s3_conn is None:
                self.s3_conn = ScalableS3Client().s3
            for _bucket, _root, path in generate_s3_sources(self.s3_conn, src, self.include, self.include_zero_length):
                source = url_path_join(src, path)
                yield ExternalURL(source)
        elif src.startswith('hdfs'):
            for source, size in luigi.hdfs.listdir(src, recursive=True, include_size=True):
                if not self.include_zero_length and size == 0:
                    continue
                elif any(fnmatch.fnmatch(source, include_val) for include_val in self.include):
                    yield ExternalURL(source)
        else:
            # Apply the include patterns to the relative path below the src directory.
            # TODO: implement exclude_zero_length to match S3 case.
            for dirpath, _dirnames, files in os.walk(src):
                for filename in files:
                    filepath = os.path.join(dirpath, filename)
                    relpath = os.path.relpath(filepath, src)
                    if any(fnmatch.fnmatch(relpath, include_val) for include_val in self.include):
                        yield ExternalURL(filepath)

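# A hedged sketch of how the generator above might plug into a luigi task: each yielded
# ExternalURL becomes a requirement of the task. The class name, parameters, and defaults
# below are illustrative assumptions rather than the pipeline's actual task definition.
class PathSetTask(luigi.Task):
    src = luigi.Parameter()  # assumed to hold a list/tuple of source URLs (s3://, hdfs://, or local paths)
    include = luigi.Parameter(default=('*',))  # fnmatch-style include patterns
    include_zero_length = luigi.Parameter(default=False)

    # Reuse the generator defined above as a method of the task.
    generate_file_list = generate_file_list

    def __init__(self, *args, **kwargs):
        super(PathSetTask, self).__init__(*args, **kwargs)
        self.s3_conn = None  # populated lazily by generate_file_list for s3:// sources

    def requires(self):
        return list(self.generate_file_list())
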
def get_target_class_from_url(url, marker=False):
    """Returns a luigi target class based on the url scheme"""
    parsed_url = urlparse.urlparse(url)

    if marker:
        target_class = URL_SCHEME_TO_MARKER_TARGET_CLASS.get(parsed_url.scheme, DEFAULT_MARKER_TARGET_CLASS)
    else:
        target_class = URL_SCHEME_TO_TARGET_CLASS.get(parsed_url.scheme, DEFAULT_TARGET_CLASS)

    kwargs = {}
    if issubclass(target_class, HdfsTarget) and url.endswith('/'):
        kwargs['format'] = hdfs_format.PlainDir
    if issubclass(target_class, luigi.LocalTarget) or parsed_url.scheme == 'hdfs':
        # LocalTarget and HdfsTarget both expect paths without any scheme, netloc etc, just bare paths. So strip
        # everything else off the url and pass that in to the target.
        url = parsed_url.path
    if issubclass(target_class, S3Target):
        kwargs['client'] = ScalableS3Client()
        kwargs['policy'] = DEFAULT_KEY_ACCESS_POLICY

    url = url.rstrip('/')
    args = (url,)
    return target_class, args, kwargs

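# A minimal sketch of how the (target_class, args, kwargs) tuple returned above might be
# consumed; the wrapper name `get_target_from_url` is assumed here for illustration.
def get_target_from_url(url, marker=False):
    """Instantiate a luigi target appropriate for the scheme of the given URL (illustrative sketch)."""
    target_class, args, kwargs = get_target_class_from_url(url, marker=marker)
    return target_class(*args, **kwargs)
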
def _get_s3_urls(self, source):
    """Recursively list all files inside the source URL directory."""
    s3_conn = ScalableS3Client().s3
    bucket_name, root = get_s3_bucket_key_names(source)
    bucket = s3_conn.get_bucket(bucket_name)
    for key_metadata in bucket.list(root):
        if key_metadata.size > 0:
            key_path = key_metadata.key[len(root):].lstrip('/')
            yield url_path_join(source, key_path)

def when_s3_available(function):
    """Decorator that skips the decorated test unless a working S3 connection is available."""
    s3_available = getattr(when_s3_available, 's3_available', None)
    if s3_available is None:
        try:
            connection = ScalableS3Client().s3
            # ^ The above line will not error out if AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
            # are set, so it can't be used to check if we have a valid connection to S3. Instead:
            connection.get_all_buckets()
        except (boto.exception.S3ResponseError, boto.exception.NoAuthHandlerFound):
            s3_available = False
        else:
            s3_available = True
        finally:
            when_s3_available.s3_available = s3_available  # Cache result to avoid having to compute it again
    return unittest.skipIf(not s3_available, 'S3 is not available')(function)

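# Illustrative usage of the decorator above: because it returns the function wrapped in
# unittest.skipIf, it can be applied directly to individual test methods. The test class
# and method below are hypothetical examples, not part of the pipeline's test suite.
class S3AvailabilityExampleTest(unittest.TestCase):

    @when_s3_available
    def test_can_list_buckets(self):
        # Only runs when a real S3 connection could be established.
        connection = ScalableS3Client().s3
        self.assertIsNotNone(connection.get_all_buckets())
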
def validate_exporter_output(self, org_id, exported_filename):
    """
    Preconditions: A complete data package has been uploaded to S3.
    External Effect: Downloads the complete data package, decompresses it, decrypts it and then compares it to
        the static expected output ignoring the ordering of the records in both files.

    Downloads s3://<exporter_output_bucket>/<output_prefix><org_id>-<year>-<month>-<day>.zip to
        <temporary_dir>/work/validation/.
    """
    today = datetime.datetime.utcnow().strftime('%Y-%m-%d')
    bucket = ScalableS3Client().s3.get_bucket(self.config.get('exporter_output_bucket'))
    export_id = '{org}-{date}'.format(org=org_id, date=today)
    filename = export_id + '.zip'
    key = bucket.lookup(self.output_prefix + filename)
    if key is None:
        self.fail(
            'Expected output from legacy exporter not found. Url = s3://{bucket}/{pre}{filename}'.format(
                bucket=self.config.get('exporter_output_bucket'),
                pre=self.output_prefix,
                filename=filename
            )
        )
    exporter_archive_path = os.path.join(self.validation_dir, filename)
    key.get_contents_to_filename(exporter_archive_path)

    shell.run(['unzip', exporter_archive_path, '-d', self.validation_dir])

    gpg = gnupg.GPG(gnupghome=self.gpg_dir)
    with open(os.path.join('gpg-keys', 'insecure_secret.key'), 'r') as key_file:
        gpg.import_keys(key_file.read())

    exported_file_path = os.path.join(self.validation_dir, exported_filename)
    with open(os.path.join(self.validation_dir, export_id, exported_filename + '.gpg'), 'r') as encrypted_file:
        gpg.decrypt_file(encrypted_file, output=exported_file_path)

    sorted_filename = exported_file_path + '.sorted'
    shell.run(['sort', '-o', sorted_filename, exported_file_path])

    expected_output_path = os.path.join(self.data_dir, 'output', exported_filename + '.sorted')
    shell.run(['diff', sorted_filename, expected_output_path])

def setUp(self):
    """Load configuration, validate required settings, and initialize the services used by the acceptance tests."""
    try:
        self.s3_client = ScalableS3Client()
    except Exception:
        self.s3_client = None

    self.config = get_test_config()

    for env_var in ('TASKS_REPO', 'TASKS_BRANCH', 'IDENTIFIER', 'JOB_FLOW_NAME', 'IS_REMOTE'):
        if env_var in os.environ:
            self.config[env_var.lower()] = os.environ[env_var]

    if 'is_remote' in self.config:
        self.config['is_remote'] = self.config['is_remote'].lower() not in ('0', 'false', 'f')
    else:
        self.config['is_remote'] = True

    if self.config['is_remote']:
        # The name of an existing job flow to run the test on
        assert ('job_flow_name' in self.config or 'host' in self.config)
        # The git URL of the pipeline repository to check this code out from.
        assert ('tasks_repo' in self.config)
        # The branch of the pipeline repository to test. Note this can differ from the branch that is currently
        # checked out and running this code.
        assert ('tasks_branch' in self.config)
        # Where to store logs generated by the pipeline
        assert ('tasks_log_path' in self.config)
        # The user to connect to the job flow over SSH with.
        assert ('connection_user' in self.config)

    # Where the pipeline should output data, should be a URL pointing to a directory.
    assert ('tasks_output_url' in self.config)

    # Allow for parallel execution of the test by specifying a different identifier. Using an identical identifier
    # allows for old virtualenvs to be reused etc, which is why a random one is not simply generated with each run.
    assert ('identifier' in self.config)

    # A URL to a JSON file that contains most of the connection information for the MySQL database.
    assert ('credentials_file_url' in self.config)

    # A URL to a build of the oddjob third party library
    assert 'oddjob_jar' in self.config

    # A URL to a maxmind compatible geolocation database file
    assert 'geolocation_data' in self.config

    self.data_dir = os.path.join(os.path.dirname(__file__), 'fixtures')

    url = self.config['tasks_output_url']
    m = hashlib.md5()
    m.update(self.config['identifier'])
    self.identifier = m.hexdigest()
    self.test_root = url_path_join(url, self.identifier, self.__class__.__name__)

    self.test_src = url_path_join(self.test_root, 'src')
    self.test_out = url_path_join(self.test_root, 'out')

    # Use a local dir for devstack testing, or s3 for production testing.
    self.report_output_root = self.config.get('report_output_root', url_path_join(self.test_out, 'reports'))

    self.catalog_path = 'http://acceptance.test/api/courses/v2'

    database_name = 'test_' + self.identifier
    schema = 'test_' + self.identifier
    import_database_name = 'acceptance_import_' + database_name
    export_database_name = 'acceptance_export_' + database_name
    otto_database_name = 'acceptance_otto_' + database_name
    elasticsearch_alias = 'alias_test_' + self.identifier
    self.warehouse_path = url_path_join(self.test_root, 'warehouse')
    self.edx_rest_api_cache_root = url_path_join(self.test_src, 'edx-rest-api-cache')

    task_config_override = {
        'hive': {
            'database': database_name,
            'warehouse_path': self.warehouse_path
        },
        'map-reduce': {
            'marker': url_path_join(self.test_root, 'marker')
        },
        'manifest': {
            'path': url_path_join(self.test_root, 'manifest'),
            'lib_jar': self.config['oddjob_jar'],
        },
        'database-import': {
            'credentials': self.config['credentials_file_url'],
            'destination': self.warehouse_path,
            'database': import_database_name
        },
        'database-export': {
            'credentials': self.config['credentials_file_url'],
            'database': export_database_name
        },
        'otto-database-import': {
            'credentials': self.config['credentials_file_url'],
            'database': otto_database_name
        },
        'course-catalog': {
            'catalog_path': self.catalog_path
        },
        'geolocation': {
            'geolocation_data': self.config['geolocation_data']
        },
        'event-logs': {
            'source': as_list_param(self.test_src, escape_quotes=False),
            'pattern': as_list_param(".*tracking.log-(?P<date>\\d{8}).*\\.gz", escape_quotes=False),
        },
        'segment-logs': {
            'source': as_list_param(self.test_src, escape_quotes=False),
            'pattern': as_list_param(".*segment.log-(?P<date>\\d{8}).*\\.gz", escape_quotes=False),
        },
        'course-structure': {
            'api_root_url': 'acceptance.test',
            'access_token': 'acceptance'
        },
        'module-engagement': {
            'alias': elasticsearch_alias
        },
        'elasticsearch': {},
        'problem-response': {
            'report_fields': '["username","problem_id","answer_id","location","question","score","max_score",'
                             '"correct","answer","total_attempts","first_attempt_date","last_attempt_date"]',
            'report_field_list_delimiter': '"|"',
            'report_field_datetime_format': '%Y-%m-%dT%H:%M:%SZ',
            'report_output_root': self.report_output_root,
            'partition_format': '%Y-%m-%dT%H',
        },
        'edx-rest-api': {
            'client_id': 'oauth_id',
            'client_secret': 'oauth_secret',
            'oauth_username': '******',
            'oauth_password': '******',
            'auth_url': 'http://acceptance.test',
        },
        'course-blocks': {
            'api_root_url': 'http://acceptance.test/api/courses/v1/blocks/',
        },
        'course-list': {
            'api_root_url': 'http://acceptance.test/api/courses/v1/courses/',
        },
    }

    if 'elasticsearch_host' in self.config:
        task_config_override['elasticsearch']['host'] = as_list_param(self.config['elasticsearch_host'], escape_quotes=False)
    if 'elasticsearch_connection_class' in self.config:
        task_config_override['elasticsearch']['connection_type'] = self.config['elasticsearch_connection_class']
    if 'manifest_input_format' in self.config:
        task_config_override['manifest']['input_format'] = self.config['manifest_input_format']
    if 'hive_version' in self.config:
        task_config_override['hive']['version'] = self.config['hive_version']

    log.info('Running test: %s', self.id())
    log.info('Using executor: %s', self.config['identifier'])
    log.info('Generated Test Identifier: %s', self.identifier)

    self.import_db = db.DatabaseService(self.config, import_database_name)
    self.export_db = db.DatabaseService(self.config, export_database_name)
    self.otto_db = db.DatabaseService(self.config, otto_database_name)
    self.task = task.TaskService(self.config, task_config_override, self.identifier)
    self.hive = hive.HiveService(self.task, self.config, database_name)
    self.elasticsearch = elasticsearch_service.ElasticsearchService(self.config, elasticsearch_alias)

    self.reset_external_state()

    max_diff = os.getenv('MAX_DIFF', None)
    if max_diff is not None:
        if max_diff.lower() == "infinite":
            self.maxDiff = None
        else:
            self.maxDiff = int(max_diff)