def _set_workflow_config(self, yaml_file):
    """Load workflow configuration from a YAML file and apply it.

    Reads source/destination sections, builds the I/O reader and writer
    via Factory, and attaches every non-default config key to this
    instance as an attribute.

    :param yaml_file: path to a YAML file with workflow configuration
    :raises Exception: re-raised after logging if the configuration is
        invalid or reader/writer construction fails
    """
    log.info("Setting configurations from config file {0}".format(yaml_file))
    try:
        with open(yaml_file, "r") as ymlfile:
            # safe_load: yaml.load without an explicit Loader is
            # deprecated and allows arbitrary object construction from
            # untrusted YAML.
            config = yaml.safe_load(ymlfile)
        self._source = config["source"]
        self._destination = config["destination"]
        self._io_reader = Factory.get_reader(self._source["type"], self._source)
        self._io_writer = Factory.get_writer(
            self._destination["type"], self._destination
        )
        # Set attributes for custom (non-default) workflow properties.
        for key in config.keys():
            if key not in self.DEFAULT_PARAMS:
                setattr(self, key, config[key])
    except Exception:
        log.error(
            "Error creating I/O reader and writer. Please check configurations in workflow config file at {0}".format(
                yaml_file
            )
        )
        raise
def __init__(self, name, source=None, destination=None):
    """Initialize a workflow, optionally from an on-disk YAML config.

    Looks for a default (then backup) config file for *name* and, if
    found, applies it. Explicit ``source``/``destination`` arguments
    override anything loaded from the file.

    :param name: workflow name, used to locate config files
    :param source: optional dict of reader configuration parameters
    :param destination: optional dict of writer configuration parameters
    """
    # Initialize properties. Reader/writer default to None so attribute
    # access is safe even when no config file exists and no explicit
    # source/destination is provided (previously they were left unset,
    # causing AttributeError later).
    self._source = None
    self._destination = None
    self._io_reader = None
    self._io_writer = None
    self._name = name

    # Check to see if a workflow yaml file exists. If so, set workflow
    # configurations from file.
    default_filepath = self._get_default_filepath(name)
    backup_filepath = self._get_backup_filepath(name)
    if os.path.exists(default_filepath):
        log.info("Config file detected: {0}".format(default_filepath))
        self._set_workflow_config(default_filepath)
    elif os.path.exists(backup_filepath):
        log.info("Config file detected: {0}".format(backup_filepath))
        self._set_workflow_config(backup_filepath)
    else:
        log.info("No config file detected.")

    # If source or destination are passed in as parameters, update source
    # and dest configurations (they take precedence over the file).
    if source:
        self._source = source
        self._io_reader = Factory.get_reader(self._source["type"], self._source)
    if destination:
        self._destination = destination
        self._io_writer = Factory.get_writer(
            self._destination["type"], self._destination
        )
def set_source(self, source):
    """Replace the workflow's data source and rebuild its reader.

    :param source: dict of configuration parameters for data source (reader)
    """
    self._source = source
    cfg = self.source
    self._io_reader = Factory.get_reader(cfg["type"], cfg)
def set_destination(self, destination):
    """
    Set destination.

    :param destination: dict of configuration parameters for the destination (writer)
    """
    self._destination = destination
    # BUG FIX: previously called Factory.get_writer(self.source["destination"], ...),
    # i.e. it read the writer type from the *source* config under a wrong key.
    # Mirror __init__/set_source: the type comes from the destination config.
    self._io_writer = Factory.get_writer(self.destination["type"], self.destination)
def test_get_reader_orc_from_test_data(test_input_base_path, expected_df):
    """Factory fs reader fetches an ORC file from the shared test-data dir.

    Renamed from ``test_get_reader_orc``: a later test in this module has
    that exact name, so this definition was shadowed and never collected
    by pytest.
    """
    test_input_path = "%s/person.orc" % (test_input_base_path)
    config = {
        "type": "fs",
        "input_path": test_input_path,
        "usecols": ["firstname", "lastname", "gender"],
        "input_format": "orc",
    }
    reader_from_factory = Factory.get_reader("fs", config)
    fetched_df = reader_from_factory.fetch_data()
    assert fetched_df.equals(expected_df)
def test_get_reader_json(tmpdir, expected_df):
    """Factory fs reader round-trips a JSON records file."""
    json_path = str(tmpdir.mkdir("tmp_test_fs_reader").join("person.json"))
    cudf.io.json.to_json(expected_df, json_path, orient="records")
    reader = Factory.get_reader(
        "fs",
        {
            "type": "fs",
            "input_path": json_path,
            "orient": "records",
            "input_format": "json",
        },
    )
    assert reader.fetch_data().equals(expected_df)
def test_get_reader_orc(tmpdir, expected_df):
    """Factory fs reader round-trips an ORC file written to a tmp dir."""
    orc_path = str(tmpdir.mkdir("tmp_test_fs_reader").join("person.orc"))
    cudf.io.orc.to_orc(expected_df, orc_path)
    reader = Factory.get_reader(
        "fs",
        {
            "type": "fs",
            "input_path": orc_path,
            "usecols": ["firstname", "lastname", "gender"],
            "input_format": "orc",
        },
    )
    assert reader.fetch_data().equals(expected_df)
def test_get_reader_text(test_input_base_path, expected_df):
    """Factory fs reader parses a delimited text file from the test-data dir."""
    csv_path = "%s/person.csv" % (test_input_base_path)
    reader = Factory.get_reader(
        "fs",
        {
            "type": "fs",
            "input_path": csv_path,
            "names": ["firstname", "lastname", "gender"],
            "delimiter": ",",
            "usecols": ["firstname", "lastname", "gender"],
            "dtype": ["str", "str", "str"],
            "header": 0,
            "input_format": "text",
        },
    )
    assert reader.fetch_data().equals(expected_df)
def test_get_reader_csv(tmpdir, expected_df):
    """Factory fs reader round-trips a CSV file written to a tmp dir."""
    # Wrap in str(): tmpdir.join() returns a py.path.local, while every
    # sibling test passes a plain string path in the reader config.
    fname = str(tmpdir.mkdir("tmp_test_factory").join("person.csv"))
    expected_df.to_csv(fname, index=False)
    config = {
        "type": "fs",
        "input_path": fname,
        "names": ["firstname", "lastname", "gender"],
        "delimiter": ",",
        "usecols": ["firstname", "lastname", "gender"],
        "dtype": ["str", "str", "str"],
        "header": 0,
        "input_format": "csv",
    }
    reader_from_factory = Factory.get_reader("fs", config)
    fetched_df = reader_from_factory.fetch_data()
    assert fetched_df.equals(expected_df)
def test_get_io_writer_fs(fs_writer_config):
    """Factory returns a FileSystemWriter for the 'fs' writer type."""
    assert isinstance(Factory.get_writer("fs", fs_writer_config), FileSystemWriter)
def test_get_io_reader_fs(fs_reader_config):
    """Factory returns a FileSystemReader for the 'fs' reader type."""
    assert isinstance(Factory.get_reader("fs", fs_reader_config), FileSystemReader)
def test_get_io_writer_kafka(kafka_config):
    """Factory returns a KafkaWriter for the 'kafka' writer type."""
    assert isinstance(Factory.get_writer("kafka", kafka_config), KafkaWriter)
def test_get_io_reader_kafka(kafka_config):
    """Factory returns a KafkaReader for the 'kafka' reader type."""
    assert isinstance(Factory.get_reader("kafka", kafka_config), KafkaReader)
def set_destination(self, destination):
    """Set the destination configuration and rebuild the I/O writer.

    :param destination: dict of configuration parameters for the destination (writer)
    """
    self._destination = destination
    # BUG FIX: previously Factory.get_writer(self.source["destination"], ...)
    # read the writer type from the *source* config under a wrong key; the
    # type must come from the destination config, as in set_source.
    self._io_writer = Factory.get_writer(self.destination["type"], self.destination)
def set_source(self, source):
    """Replace the source configuration and rebuild the I/O reader.

    :param source: dict of configuration parameters for the data source (reader)
    """
    self._source = source
    cfg = self.source
    self._io_reader = Factory.get_reader(cfg["type"], cfg)