Example #1
def download_from_s3(context):
    (bucket, key, target_folder,
     skip_if_present) = (context.solid_config.get(k)
                         for k in ('bucket', 'key', 'target_folder',
                                   'skip_if_present'))

    # file name is S3 key path suffix after last /
    target_file = os.path.join(target_folder, key.split('/')[-1])

    if skip_if_present and safe_isfile(target_file):
        context.log.info(
            'Skipping download, file already present at {target_file}'.format(
                target_file=target_file))
    else:
        if not os.path.exists(target_folder):
            mkdir_p(target_folder)

        context.log.info(
            'Starting download of {bucket}/{key} to {target_file}'.format(
                bucket=bucket, key=key, target_file=target_file))
        s3 = boto3.client('s3')

        headers = s3.head_object(Bucket=bucket, Key=key)
        logger = S3Logger(context.log.debug, bucket, key, target_file,
                          int(headers['ContentLength']))
        s3.download_file(Bucket=bucket,
                         Key=key,
                         Filename=target_file,
                         Callback=logger)

    return target_file
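All of these examples lean on a mkdir_p helper that creates a directory tree and tolerates it already existing. As a point of reference, a minimal sketch of such a helper (an assumption about the shape of dagster's actual utility, not its verbatim source):

import errno
import os


def mkdir_p(path):
    # Create `path` and any missing parent directories; ignore the error if it already exists.
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            return
        raise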
Example #2
    def set_asset(self, context, step_output_handle, obj, asset_metadata):
        """Pickle the data and store the object to a custom file path.

        This method emits an AssetMaterialization event so the assets will be tracked by the
        Asset Catalog.
        """
        check.inst_param(step_output_handle, "step_output_handle", StepOutputHandle)
        path = check.str_param(asset_metadata.get("path"), "asset_metadata.path")

        filepath = self._get_path(path)

        # Ensure path exists
        mkdir_p(os.path.dirname(filepath))

        with open(filepath, self.write_mode) as write_obj:
            pickle.dump(obj, write_obj, PICKLE_PROTOCOL)

        return AssetMaterialization(
            asset_key=AssetKey(
                [
                    context.pipeline_def.name,
                    step_output_handle.step_key,
                    step_output_handle.output_name,
                ]
            ),
            metadata_entries=[EventMetadataEntry.fspath(os.path.abspath(filepath))],
        )
Example #3
    def _load_schedules(self):
        utils.mkdir_p(self._artifacts_dir)

        for file in os.listdir(self._artifacts_dir):
            if not file.endswith('.json'):
                continue
            file_path = os.path.join(self._artifacts_dir, file)
            with open(file_path) as data_file:
                try:
                    data = seven.json.load(data_file)
                    schedule = RunningSchedule(
                        data['schedule_id'],
                        ScheduleDefinition(
                            name=data['name'],
                            cron_schedule=data['cron_schedule'],
                            execution_params=data['execution_params'],
                        ),
                        python_path=data['python_path'],
                        repository_path=data['repository_path'],
                    )
                    self._schedules[
                        schedule.schedule_definition.name] = schedule

                except Exception as ex:  # pylint: disable=broad-except
                    six.raise_from(
                        Exception(
                            'Could not parse dagit schedule from {file_name} in {dir_name}. {ex}: {msg}'
                            .format(
                                file_name=file,
                                dir_name=self._artifacts_dir,
                                ex=type(ex).__name__,
                                msg=ex,
                            )),
                        ex,
                    )
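For reference, the loop above only succeeds when each *.json artifact exposes the keys it reads: schedule_id, name, cron_schedule, execution_params, python_path, and repository_path. A hypothetical artifact (all values invented for illustration) could be written with the standard json module like so:

import json

example_schedule = {
    'schedule_id': 'abc123',                    # invented id
    'name': 'nightly_ingest',                   # invented schedule name
    'cron_schedule': '0 2 * * *',
    'execution_params': {},
    'python_path': '/usr/bin/python',
    'repository_path': '/path/to/repository.yaml',
}

with open('nightly_ingest.json', 'w') as f:
    json.dump(example_schedule, f)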
Example #4
    def __init__(self, base_dir, inst_data=None):
        """Note that idempotent initialization of the SQLite database is done on a per-run_id
        basis in the body of connect, since each run is stored in a separate database."""
        self._base_dir = os.path.abspath(check.str_param(base_dir, "base_dir"))
        mkdir_p(self._base_dir)

        self._obs = None

        self._watchers = defaultdict(dict)
        self._inst_data = check.opt_inst_param(inst_data, "inst_data",
                                               ConfigurableClassData)

        # Used to ensure that each run ID attempts to initialize its DB the first time it connects,
        # ensuring that the database will be created if it doesn't exist
        self._initialized_dbs = set()

        # Ensure that multiple threads (like the event log watcher) interact safely with each other
        self._db_lock = threading.Lock()

        if not os.path.exists(self.path_for_shard(INDEX_SHARD_NAME)):
            conn_string = self.conn_string_for_shard(INDEX_SHARD_NAME)
            engine = create_engine(conn_string, poolclass=NullPool)
            self._initdb(engine)
            self.reindex()

        super().__init__()
Example #5
File: cli.py Project: nikie/dagster
def get_config_dir(config_yaml=None):
    instance = DagsterInstance.get()
    config_type = celery_executor.config_field.config_type
    config_value = get_config_value_from_yaml(config_yaml)

    config_module_name = 'dagster_celery_config'

    config_dir = os.path.join(instance.root_directory, 'dagster_celery',
                              'config', str(uuid.uuid4()))
    mkdir_p(config_dir)
    config_path = os.path.join(
        config_dir, '{config_module_name}.py'.format(
            config_module_name=config_module_name))
    validated_config = validate_config(config_type, config_value).value
    with open(config_path, 'w') as fd:
        if 'broker' in validated_config:
            fd.write('broker_url = \'{broker_url}\'\n'.format(
                broker_url=str(validated_config['broker'])))
        if 'backend' in validated_config:
            fd.write('result_backend = \'{result_backend}\'\n'.format(
                result_backend=str(validated_config['backend'])))
        if 'config_source' in validated_config:
            for key, value in validated_config['config_source'].items():
                fd.write('{key} = {value}\n'.format(key=key,
                                                    value=repr(value)))

    # n.b. right now we don't attempt to clean up this cache, but it might make sense to delete
    # any files older than some time if there are more than some number of files present, etc.
    return config_dir
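The closing comment notes that these per-UUID config directories are never cleaned up. A hedged sketch of the age-based pruning it alludes to (the seven-day cutoff and the function name are assumptions, not part of dagster_celery):

import os
import shutil
import time


def prune_old_config_dirs(parent_dir, max_age_seconds=7 * 24 * 3600):
    # Delete per-run config directories whose mtime is older than the cutoff.
    cutoff = time.time() - max_age_seconds
    for name in os.listdir(parent_dir):
        path = os.path.join(parent_dir, name)
        if os.path.isdir(path) and os.path.getmtime(path) < cutoff:
            shutil.rmtree(path, ignore_errors=True)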
Example #6
    def download_file(self, context, target_file):
        check.str_param(target_file, 'target_file')

        target_path = os.path.join(self.target_folder, target_file)

        if self.skip_if_present and safe_isfile(target_path):
            context.log.info(
                'Skipping download, file already present at {target_path}'.
                format(target_path=target_path))
        else:
            full_key = self.key + '/' + target_file
            if os.path.dirname(target_path):
                mkdir_p(os.path.dirname(target_path))

            context.log.info(
                'Starting download of {bucket}/{key} to {target_path}'.format(
                    bucket=self.bucket, key=full_key, target_path=target_path))

            headers = context.resources.s3.head_object(Bucket=self.bucket,
                                                       Key=full_key)
            logger = S3Logger(context.log.debug, self.bucket, full_key,
                              target_path, int(headers['ContentLength']))
            context.resources.s3.download_file(Bucket=self.bucket,
                                               Key=full_key,
                                               Filename=target_path,
                                               Callback=logger)

        return target_path
Example #7
    def handle_output(self, context, obj):
        """Pickle the data and store the object to a custom file path.

        This method emits an AssetMaterialization event so the assets will be tracked by the
        Asset Catalog.
        """
        check.inst_param(context, "context", OutputContext)
        metadata = context.metadata
        path = check.str_param(metadata.get("path"), "metadata.path")

        filepath = self._get_path(path)

        # Ensure path exists
        mkdir_p(os.path.dirname(filepath))
        context.log.debug(f"Writing file at: {filepath}")

        with open(filepath, self.write_mode) as write_obj:
            pickle.dump(obj, write_obj, PICKLE_PROTOCOL)

        return AssetMaterialization(
            asset_key=AssetKey(
                [context.pipeline_name, context.step_key, context.name]),
            metadata_entries=[
                EventMetadataEntry.fspath(os.path.abspath(filepath))
            ],
        )
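Reading a value written by handle_output back from disk is just the inverse pickle call; the helper below is illustrative only, and the binary read mode is an assumption mirroring the write_mode used above:

import pickle


def read_pickled_output(filepath, read_mode="rb"):
    # Inverse of the write in handle_output: unpickle the object stored at filepath.
    with open(filepath, read_mode) as read_obj:
        return pickle.load(read_obj)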
Example #8
    def set_object(self,
                   key,
                   obj,
                   serialization_strategy=DEFAULT_SERIALIZATION_STRATEGY):
        check.str_param(key, 'key')
        # obj is an arbitrary Python object
        check.inst_param(serialization_strategy, 'serialization_strategy',
                         SerializationStrategy)

        if os.path.exists(key):
            logging.warning('Removing existing path {path}'.format(path=key))
            os.unlink(key)

        # Ensure path exists
        mkdir_p(os.path.dirname(key))

        serialization_strategy.serialize_to_file(obj, key)

        return ObjectStoreOperation(
            op=ObjectStoreOperationType.SET_OBJECT,
            key=key,
            dest_key=None,
            obj=obj,
            serialization_strategy_name=serialization_strategy.name,
            object_store_name=self.name,
        )
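set_object only touches two attributes of the strategy: its name and its serialize_to_file method. A hypothetical pickle-based stand-in satisfying that minimal contract is sketched below; in the real code the object must also subclass SerializationStrategy to pass the inst_param check, so this is illustrative rather than a drop-in replacement:

import pickle


class PicklingStrategy:
    # Minimal stand-in exposing only what set_object above actually uses.
    name = 'pickle'

    def serialize_to_file(self, value, write_path):
        with open(write_path, 'wb') as f:
            pickle.dump(value, f)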
Example #9
    def from_local(base_dir, inst_data=None):
        check.str_param(base_dir, 'base_dir')
        mkdir_p(base_dir)
        conn_string = 'sqlite:///{}'.format(os.path.join(base_dir, 'runs.db'))
        engine = create_engine(conn_string)
        RunStorageSQLMetadata.create_all(engine)
        return SqliteRunStorage(conn_string, inst_data)
Example #10
    def _load_schedules(self):
        schedules_dir = self._base_dir
        utils.mkdir_p(schedules_dir)

        for repository_name in os.listdir(schedules_dir):
            if not os.path.isdir(os.path.join(schedules_dir, repository_name)):
                continue

            self._schedules[repository_name] = {}
            for file in os.listdir(os.path.join(schedules_dir,
                                                repository_name)):
                if not file.endswith('.json'):
                    continue
                file_path = os.path.join(schedules_dir, repository_name, file)
                with open(file_path) as data:
                    try:
                        schedule = deserialize_json_to_dagster_namedtuple(
                            data.read())
                        self._schedules[repository_name][
                            schedule.name] = schedule

                    except Exception as ex:  # pylint: disable=broad-except
                        warnings.warn(
                            'Could not parse dagster schedule from {file_name} in {dir_name}. '
                            '{ex}: {msg}'.format(
                                file_name=file,
                                dir_name=self._base_dir,
                                ex=type(ex).__name__,
                                msg=ex,
                            ))
                        continue
Example #11
def sftp_solid(context):
    '''
    Ported from Airflow's SFTPOperator.

    sftp_solid: transfers files from the remote host to the local filesystem or vice versa. This
    solid uses ssh_resource to open an SFTP transport channel that serves as the basis for the
    file transfer.
    '''
    local_filepath = context.solid_config.get('local_filepath')
    remote_filepath = context.solid_config.get('remote_filepath')
    operation = context.solid_config.get('operation')
    confirm = context.solid_config.get('confirm')

    with context.resources.ssh_resource.get_connection() as ssh_client:
        sftp_client = ssh_client.open_sftp()
        if operation == 'GET':
            local_folder = os.path.dirname(local_filepath)

            # Create intermediate directories if they don't exist
            mkdir_p(local_folder)

            context.log.info('Starting to transfer from {0} to {1}'.format(
                remote_filepath, local_filepath))
            sftp_client.get(remote_filepath, local_filepath)

        else:
            context.log.info(
                'Starting to transfer file from {0} to {1}'.format(
                    local_filepath, remote_filepath))

            sftp_client.put(local_filepath, remote_filepath, confirm=confirm)

    return local_filepath
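The solid reads four keys from solid_config; a hypothetical run config driving a GET transfer might look like the following (the nesting under 'solids' follows dagster's usual run-config layout, and the paths are invented):

run_config = {
    'solids': {
        'sftp_solid': {
            'config': {
                'local_filepath': '/tmp/downloads/report.csv',   # invented local path
                'remote_filepath': '/data/outgoing/report.csv',  # invented remote path
                'operation': 'GET',
                'confirm': False,
            }
        }
    }
}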
Example #12
    def ensure_base_dir_exists(self):
        if self._base_dir_ensured:
            return

        mkdir_p(self.base_dir)

        self._base_dir_ensured = True
Example #13
def events_jar():
    git_repo_root = six.ensure_str(
        subprocess.check_output(['git', 'rev-parse',
                                 '--show-toplevel']).strip())

    temp_dir = os.path.join(get_system_temp_directory(),
                            'dagster_examples_tests',
                            'event_pipeline_demo_tests')

    mkdir_p(temp_dir)
    dst = os.path.join(temp_dir, 'events.jar')

    if os.path.exists(dst):
        print('events jar already exists, skipping')  # pylint: disable=print-call
    else:
        subprocess.check_call(['sbt', 'events/assembly'],
                              cwd=os.path.join(git_repo_root, 'scala_modules'))

        src = os.path.join(
            git_repo_root,
            'scala_modules',
            'events/target/scala-2.11/events-assembly-0.1.0-SNAPSHOT.jar',
        )
        subprocess.check_call(['cp', src, dst])

    yield dst
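Since events_jar yields a path, it is presumably registered as a pytest fixture elsewhere; assuming that, a hypothetical test consuming it would receive the jar path by argument name:

import os


def test_events_jar_is_built(events_jar):
    # pytest injects the yielded path; this only asserts the assembly exists on disk.
    assert os.path.isfile(events_jar)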
Example #14
def fs_file_cache(init_context):
    target_folder = init_context.resource_config["target_folder"]

    if not os.path.exists(target_folder):
        mkdir_p(target_folder)

    return FSFileCache(target_folder=target_folder, overwrite=False)
Example #15
File: cli.py Project: keyz/dagster
def get_config_dir(config_yaml=None):
    instance = DagsterInstance.get()

    config_module_name = "dagster_celery_config"

    config_dir = os.path.join(instance.root_directory, "dagster_celery",
                              "config", str(uuid.uuid4()))
    mkdir_p(config_dir)
    config_path = os.path.join(
        config_dir, "{config_module_name}.py".format(
            config_module_name=config_module_name))

    validated_config = get_validated_config(config_yaml)
    with open(config_path, "w") as fd:
        if "broker" in validated_config and validated_config["broker"]:
            fd.write("broker_url = '{broker_url}'\n".format(
                broker_url=str(validated_config["broker"])))
        if "backend" in validated_config and validated_config["backend"]:
            fd.write("result_backend = '{result_backend}'\n".format(
                result_backend=str(validated_config["backend"])))
        if "config_source" in validated_config and validated_config[
                "config_source"]:
            for key, value in validated_config["config_source"].items():
                fd.write("{key} = {value}\n".format(key=key,
                                                    value=repr(value)))

    # n.b. right now we don't attempt to clean up this cache, but it might make sense to delete
    # any files older than some time if there are more than some number of files present, etc.
    return config_dir
Example #16
def events_jar():
    git_repo_root = six.ensure_str(
        subprocess.check_output(["git", "rev-parse",
                                 "--show-toplevel"]).strip())

    temp_dir = os.path.join(get_system_temp_directory(),
                            "dagster_examples_tests",
                            "event_pipeline_demo_tests")

    mkdir_p(temp_dir)
    dst = os.path.join(temp_dir, "events.jar")

    if os.path.exists(dst):
        print("events jar already exists, skipping")  # pylint: disable=print-call
    else:
        subprocess.check_call(["sbt", "events/assembly"],
                              cwd=os.path.join(git_repo_root, "scala_modules"))

        src = os.path.join(
            git_repo_root,
            "scala_modules",
            "events/target/scala-2.11/events-assembly-0.1.0-SNAPSHOT.jar",
        )
        subprocess.check_call(["cp", src, dst])

    yield dst
Example #17
def _download_from_s3_to_file(session, context, bucket, key, target_folder, skip_if_present):
    # TODO: remove context argument once we support resource logging

    # file name is S3 key path suffix after last /
    target_file = os.path.join(target_folder, key.split('/')[-1])

    if skip_if_present and safe_isfile(target_file):
        context.log.info(
            'Skipping download, file already present at {target_file}'.format(
                target_file=target_file
            )
        )
    else:
        if not os.path.exists(target_folder):
            mkdir_p(target_folder)

        context.log.info(
            'Starting download of {bucket}/{key} to {target_file}'.format(
                bucket=bucket, key=key, target_file=target_file
            )
        )

        headers = session.head_object(Bucket=bucket, Key=key)
        logger = S3Logger(
            context.log.debug, bucket, key, target_file, int(headers['ContentLength'])
        )
        session.download_file(Bucket=bucket, Key=key, Filename=target_file, Callback=logger)
    return target_file
Example #18
    def from_local(cls, base_dir, inst_data=None):
        check.str_param(base_dir, "base_dir")
        mkdir_p(base_dir)
        conn_string = create_db_conn_string(base_dir, "runs")
        engine = create_engine(conn_string, poolclass=NullPool)
        alembic_config = get_alembic_config(__file__)

        should_mark_indexes = False
        with engine.connect() as connection:
            db_revision, head_revision = check_alembic_revision(
                alembic_config, connection)
            if not (db_revision and head_revision):
                RunStorageSqlMetadata.create_all(engine)
                engine.execute("PRAGMA journal_mode=WAL;")
                stamp_alembic_rev(alembic_config, connection)
                should_mark_indexes = True

            table_names = db.inspect(engine).get_table_names()
            if "instance_info" not in table_names:
                InstanceInfo.create(engine)

        run_storage = cls(conn_string, inst_data)

        if should_mark_indexes:
            run_storage.migrate()
            run_storage.optimize()

        return run_storage
Example #19
def get_papermill_parameters(compute_context, inputs, output_log_path):
    check.inst_param(compute_context, 'compute_context',
                     SystemComputeExecutionContext)
    check.param_invariant(
        isinstance(compute_context.environment_dict, dict),
        'compute_context',
        'SystemComputeExecutionContext must have valid environment_dict',
    )
    check.dict_param(inputs, 'inputs', key_type=six.string_types)

    run_id = compute_context.run_id

    marshal_dir = '/tmp/dagstermill/{run_id}/marshal'.format(run_id=run_id)
    mkdir_p(marshal_dir)

    (handle, solid_subset) = ExecutionTargetHandle.get_handle(
        compute_context.pipeline_def)

    if not handle:
        raise DagstermillError(
            'Can\'t execute a dagstermill solid from a pipeline that wasn\'t instantiated using '
            'an ExecutionTargetHandle')

    dm_handle_kwargs = handle.data._asdict()

    dm_handle_kwargs['pipeline_name'] = compute_context.pipeline_def.name

    dm_context_dict = {
        'output_log_path': output_log_path,
        'marshal_dir': marshal_dir,
        'environment_dict': compute_context.environment_dict,
    }

    dm_solid_handle_kwargs = compute_context.solid_handle._asdict()

    parameters = {}

    input_def_dict = compute_context.solid_def.input_dict
    for input_name, input_value in inputs.items():
        assert (
            input_name not in RESERVED_INPUT_NAMES
        ), 'Dagstermill solids cannot have inputs named {input_name}'.format(
            input_name=input_name)
        dagster_type = input_def_dict[input_name].dagster_type
        parameter_value = write_value(
            dagster_type, input_value,
            os.path.join(marshal_dir, 'input-{}'.format(input_name)))
        parameters[input_name] = parameter_value

    parameters['__dm_context'] = dm_context_dict
    parameters['__dm_handle_kwargs'] = dm_handle_kwargs
    parameters['__dm_pipeline_run_dict'] = pack_value(
        compute_context.pipeline_run)
    parameters['__dm_solid_handle_kwargs'] = dm_solid_handle_kwargs
    parameters['__dm_solid_subset'] = solid_subset
    parameters['__dm_instance_ref_dict'] = pack_value(
        compute_context.instance.get_ref())

    return parameters
Example #20
    def __init__(self, base_dir=None):
        self._base_dir = check.opt_str_param(base_dir, 'base_dir',
                                             base_runs_directory())
        mkdir_p(self._base_dir)
        self.file_cursors = defaultdict(lambda: (0, 0))
        # Swap these out to use lockfiles
        self.file_lock = defaultdict(gevent.lock.Semaphore)
        self._metadata_file_lock = defaultdict(gevent.lock.Semaphore)
Example #21
    def set_intermediate_object(cls, intermediate_storage, context,
                                dagster_type, step_output_handle, value):
        paths = [
            'intermediates', step_output_handle.step_key,
            step_output_handle.output_name
        ]
        paths.append(value)
        mkdir_p(os.path.join(intermediate_storage.root, *paths))
Example #22
    def __init__(self, base_dir):
        self._base_dir = check.str_param(base_dir, 'base_dir')
        mkdir_p(self._base_dir)

        self._known_run_ids = set([])
        self._watchers = {}
        self._obs = Observer()
        self._obs.start()
Example #23
    def _get_bash_script_file_path(self, instance, repository, schedule):
        check.inst_param(instance, 'instance', DagsterInstance)

        script_directory = os.path.join(instance.schedules_directory(), "scripts")
        utils.mkdir_p(script_directory)

        script_file_name = "{}.{}.sh".format(repository.name, schedule.name)
        return os.path.join(script_directory, script_file_name)
Example #24
    def __init__(self, bucket_name, volume):
        self.bucket_name = check.str_param(bucket_name, "bucket_name")
        # Setup bucket
        self.volume = os.path.join(tempfile.gettempdir(), check.str_param(volume, "volume"))
        bucket_location = os.path.join(self.volume, self.bucket_name)
        if not os.path.exists(bucket_location):
            mkdir_p(bucket_location)
        self.location = bucket_location
        self.blobs = {}
Example #25
    def __init__(self, base_dir):
        self._base_dir = check.str_param(base_dir, 'base_dir')
        mkdir_p(self._base_dir)
        self.file_cursors = defaultdict(lambda: (0, 0))
        # Swap these out to use lockfiles
        self.file_lock = defaultdict(gevent.lock.Semaphore)
        self._watchers = {}
        self._obs = Observer()
        self._obs.start()
Example #26
    def _get_bash_script_file_path(self, instance, schedule_origin_id):
        check.inst_param(instance, "instance", DagsterInstance)
        check.str_param(schedule_origin_id, "schedule_origin_id")

        script_directory = os.path.join(instance.schedules_directory(), "scripts")
        utils.mkdir_p(script_directory)

        script_file_name = "{}.sh".format(schedule_origin_id)
        return os.path.join(script_directory, script_file_name)
Example #27
    def write_dagster_run_meta(self, dagster_run_meta):
        check.inst_param(dagster_run_meta, 'dagster_run_meta', DagsterRunMeta)

        run_dir = os.path.join(self._base_dir, dagster_run_meta.run_id)

        mkdir_p(run_dir)

        with open(self._meta_file, 'a+') as ff:
            ff.write(seven.json.dumps(dagster_run_meta._asdict()) + '\n')
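Because write_dagster_run_meta appends one JSON document per line to the meta file, reading the history back is a line-by-line json.loads. A hedged reader sketch (the function name and path argument are illustrative):

import json


def read_dagster_run_metas(meta_file_path):
    # Each appended line is an independent JSON object; parse them in write order.
    metas = []
    with open(meta_file_path) as f:
        for line in f:
            if line.strip():
                metas.append(json.loads(line))
    return metas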
Example #28
    def _get_or_create_logs_directory(self, instance, schedule_origin_id):
        check.inst_param(instance, "instance", DagsterInstance)
        check.str_param(schedule_origin_id, "schedule_origin_id")

        logs_directory = os.path.join(instance.schedules_directory(), "logs", schedule_origin_id)
        if not os.path.isdir(logs_directory):
            utils.mkdir_p(logs_directory)

        return logs_directory
Example #29
def get_papermill_parameters(step_context, inputs, output_log_path):
    check.inst_param(step_context, "step_context", StepExecutionContext)
    check.param_invariant(
        isinstance(step_context.run_config, dict),
        "step_context",
        "StepExecutionContext must have valid run_config",
    )
    check.dict_param(inputs, "inputs", key_type=str)

    run_id = step_context.run_id
    temp_dir = get_system_temp_directory()
    marshal_dir = os.path.normpath(
        os.path.join(temp_dir, "dagstermill", str(run_id), "marshal"))
    mkdir_p(marshal_dir)

    if not isinstance(step_context.pipeline, ReconstructablePipeline):
        raise DagstermillError(
            "Can't execute a dagstermill solid from a pipeline that is not reconstructable. "
            "Use the reconstructable() function if executing from python")

    dm_executable_dict = step_context.pipeline.to_dict()

    dm_context_dict = {
        "output_log_path": output_log_path,
        "marshal_dir": marshal_dir,
        "run_config": step_context.run_config,
    }

    dm_solid_handle_kwargs = step_context.solid_handle._asdict()

    parameters = {}

    input_def_dict = step_context.solid_def.input_dict
    for input_name, input_value in inputs.items():
        assert (
            input_name not in RESERVED_INPUT_NAMES
        ), "Dagstermill solids cannot have inputs named {input_name}".format(
            input_name=input_name)
        dagster_type = input_def_dict[input_name].dagster_type
        parameter_value = write_value(
            dagster_type,
            input_value,
            os.path.join(
                marshal_dir,
                f"{str(step_context.solid_handle)}-input-{input_name}"),
        )
        parameters[input_name] = parameter_value

    parameters["__dm_context"] = dm_context_dict
    parameters["__dm_executable_dict"] = dm_executable_dict
    parameters["__dm_pipeline_run_dict"] = pack_value(
        step_context.pipeline_run)
    parameters["__dm_solid_handle_kwargs"] = dm_solid_handle_kwargs
    parameters["__dm_instance_ref_dict"] = pack_value(
        step_context.instance.get_ref())

    return parameters
Example #30
def temp_dir():
    '''Context manager for temporary directories.

    pytest implicitly wraps in try/except.
    '''
    dir_path = os.path.join('/tmp', str(uuid.uuid4()))
    mkdir_p(dir_path)
    yield dir_path
    shutil.rmtree(dir_path)
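Because temp_dir yields, it is presumably decorated with contextlib.contextmanager (or consumed as a pytest fixture); assuming the decorator, usage looks like:

import os

# Assumes temp_dir is wrapped with @contextlib.contextmanager.
with temp_dir() as scratch_dir:
    marker = os.path.join(scratch_dir, 'marker.txt')
    with open(marker, 'w') as f:
        f.write('hello')
# scratch_dir is removed by shutil.rmtree once the block exits.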