Example #1
    def __init__(
        self,
        url: str,
        table: str,
        credentials: Optional[Dict[str, Any]] = None,
        load_args: Optional[Dict[str, Any]] = None,
        save_args: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Creates a new ``SparkJDBCDataSet``.

        Args:
            url: A JDBC URL of the form ``jdbc:subprotocol:subname``.
            table: The name of the table to load or save data to.
            credentials: A dictionary of JDBC database connection arguments,
                normally at least the ``user`` and ``password`` properties
                with their corresponding values. If provided, it is merged
                into the ``properties`` entry of both ``load_args`` and
                ``save_args``.
            load_args: Provided to underlying PySpark ``jdbc`` function along
                with the JDBC URL and the name of the table. To find all
                supported arguments, see here:
                https://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=jdbc#pyspark.sql.DataFrameReader.jdbc
            save_args: Provided to underlying PySpark ``jdbc`` function along
                with the JDBC URL and the name of the table. To find all
                supported arguments, see here:
                https://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=jdbc#pyspark.sql.DataFrameWriter.jdbc

        Raises:
            DataSetError: When either ``url`` or ``table`` is empty.

        """

        if not url:
            raise DataSetError(
                "`url` argument cannot be empty. Please "
                "provide a JDBC URL of the form "
                "``jdbc:subprotocol:subname``."
            )

        if not table:
            raise DataSetError(
                "`table` argument cannot be empty. Please "
                "provide the name of the table to load or save "
                "data to."
            )

        self._url = url
        self._table = table
        self._load_args = load_args if load_args is not None else {}
        self._save_args = save_args if save_args is not None else {}

        # Update properties in load_args and save_args with credentials.
        if credentials is not None:
            load_properties = self._load_args.get("properties", {})
            save_properties = self._save_args.get("properties", {})
            self._load_args["properties"] = {**load_properties, **credentials}
            self._save_args["properties"] = {**save_properties, **credentials}
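A minimal usage sketch for the constructor above; the import path and connection details are assumptions (the module path has moved between Kedro releases):

# Hypothetical import path and connection details - adjust to your Kedro release.
from kedro.extras.datasets.spark import SparkJDBCDataSet

data_set = SparkJDBCDataSet(
    url="jdbc:postgresql://localhost:5432/test_db",
    table="public.sales",
    credentials={"user": "alice", "password": "secret"},
    load_args={"properties": {"driver": "org.postgresql.Driver"}},
    save_args={"mode": "overwrite"},
)
# The credentials are merged into the `properties` of both load_args and
# save_args, so the JDBC reader and writer receive user/password automatically.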
    def _save(self, data) -> None:

        if self._mutlifile_mode:

            if not os.path.isdir(self._filepath):
                os.makedirs(self._filepath)

            if isinstance(data, list):
                for index, plot in enumerate(data):
                    plot.savefig(os.path.join(self._filepath, str(index)),
                                 **self._save_args)

            elif isinstance(data, dict):
                for plot_name, plot in data.items():
                    plot.savefig(os.path.join(self._filepath, plot_name),
                                 **self._save_args)

            else:
                plot_type = type(data)
                raise DataSetError(
                    ("multiFile is True but data type "
                     "not dict or list. Rather, {}".format(plot_type)))

        else:
            data.savefig(self._filepath, **self._save_args)
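A hedged sketch of the input this ``_save`` expects in multi-file mode; the writer's constructor is not shown above, so only the data shape is illustrated:

import matplotlib.pyplot as plt

# A dict of figures: in multi-file mode each figure is written to
# <filepath>/<plot_name> via fig.savefig(); a list would be keyed by index,
# and any other type raises DataSetError. Without multi-file mode a single
# figure is saved straight to <filepath>.
plots = {}
for colour in ("red", "green", "blue"):
    fig = plt.figure()
    plt.plot([1, 2, 3], color=colour)
    plots[f"{colour}.png"] = fig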
    def _calculate_expiration_hours(expiration: Union[str, int]) -> int:
        """
        Determine the number of hours represented by a string or int.

        Parameters:
        ----------
        expiration: str or int
            if this value is an int, it is assumed to be a number of hours;
            if it is a string, it should be a number followed by a unit:
            (H)ours, (D)ays or (M)inutes

        Returns:
        --------
        int
            number of hours represented by the string interpretation
        """
        regex = r"(\d*)\s*([HMD]?)"
        result, isint = intTryParse(expiration)
        # What was passed in wasn't just an int as a string
        if not isint:
            try:
                expression = str(expiration)
                num, letter = re.findall(regex, expression, re.IGNORECASE)[0]
                num = int(num)
                conversion = {"H": 1, "D": 24, "M": 1 / 60}
                result = int(num * conversion.get(letter.upper(), 24))
            except ValueError:
                raise DataSetError("Invalid expiration")

        return result
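A few illustrative calls, assuming ``intTryParse`` returns ``(value, True)`` only for plain integers (the helper itself is not shown above):

_calculate_expiration_hours(48)     # -> 48: already a number of hours
_calculate_expiration_hours("2D")   # -> 48: 2 days * 24
_calculate_expiration_hours("12h")  # -> 12: the unit match is case-insensitive
_calculate_expiration_hours("90M")  # -> 1: 90 minutes / 60, truncated by int()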
    def _load(self) -> Dict[str, Callable[[], Any]]:
        """
        This function will look to either the local datasource or to the web for the
        requested data.  If the dates requested aren't available, then we'll go to the web and pick them up
        """
        partitions = super()._load()
        DEBUG(f"Found {len(partitions)} partitions in {self._path}")
        if not partitions:
            raise DataSetError(f"No partitions found in `{self._path}`")

        # The partitions we have
        existing_partitions = set(partitions.keys())
        seasons_requested = self._gen_requested_partitions(self._date_range)
        missing_partitions = set(
            seasons_requested.keys()) - existing_partitions
        # If we have found any missing data then go get it, otherwise assume we are good
        if len(missing_partitions) > 0:
            DEBUG(f"Found {len(missing_partitions)} partitions missing")
            missing_years = list(
                {seasons_requested[y][0]
                 for y in missing_partitions})
            missing_weeks = list(
                {seasons_requested[w][1]
                 for w in missing_partitions})
            missing_data = self._get_missing_data(missing_years, missing_weeks)
            self._stash_missing_data(missing_data)

            # This is a hack - requiring a second reload of the data, but it might be worth it
            super()._invalidate_caches()
            partitions = super()._load()
        else:
            DEBUG("No missing partitions")

        return partitions
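The reload decision above is plain set arithmetic; a standalone sketch with hypothetical partition ids and (year, week) values:

existing_partitions = {"2019_week01", "2019_week02"}
seasons_requested = {
    "2019_week01": (2019, 1),
    "2019_week02": (2019, 2),
    "2019_week03": (2019, 3),
}
missing_partitions = set(seasons_requested) - existing_partitions
missing_years = list({seasons_requested[p][0] for p in missing_partitions})
missing_weeks = list({seasons_requested[p][1] for p in missing_partitions})
# missing_partitions == {"2019_week03"}, missing_years == [2019], missing_weeks == [3]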
    def _build_args_list_from_metric_item(
            self, key: str,
            value: MetricItem) -> Generator[MetricTuple, None, None]:
        """Build list of tuples with metrics.

        First element of a tuple is key, second metric value, third step.

        If MLflow metrics dataset has prefix, it will be attached to key.

        Args:
            key (str): Metric key.
            value (MetricItem): Metric value

        Returns:
            List[MetricTuple]: List with metrics as tuples.
        """
        if self._prefix:
            key = f"{self._prefix}.{key}"
        if isinstance(value, dict):
            return (i for i in [(key, value["value"], value["step"])])
        if isinstance(value, list) and len(value) > 0:
            return ((key, x["value"], x["step"]) for x in value)
        raise DataSetError(
            f"Unexpected metric value. Should be of type `{MetricItem}`, got {type(value)}"
        )
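Illustrative calls, assuming a dataset instance ``ds`` whose ``_prefix`` is ``"training"``:

# A single metric value passed as a dict.
list(ds._build_args_list_from_metric_item("loss", {"value": 0.42, "step": 1}))
# -> [("training.loss", 0.42, 1)]

# A metric history passed as a list of dicts.
list(ds._build_args_list_from_metric_item(
    "accuracy", [{"value": 0.7, "step": 1}, {"value": 0.8, "step": 2}]))
# -> [("training.accuracy", 0.7, 1), ("training.accuracy", 0.8, 2)]

# Anything else (including an empty list) raises DataSetError.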
    def _load(self) -> DataFrame:
        if not self._exists():
            raise DataSetError(
                "requested table not found: {database}.{table}".format(
                    database=self._database, table=self._table))
        return self._get_spark().sql("select * from {database}.{table}".format(
            database=self._database, table=self._table))
    def __init__(self,
                 database: str,
                 table: str,
                 write_mode: str,
                 table_pk: List[str] = None) -> None:
        """Creates a new instance of ``SparkHiveDataSet``.

        Args:
            database: The name of the hive database.
            table: The name of the table within the database.
            write_mode: ``insert``, ``upsert`` or ``overwrite`` are supported.
            table_pk: If performing an upsert, this identifies the primary key columns used to
                resolve preexisting data. Is required for ``write_mode="upsert"``

        Raises:
            DataSetError: Invalid configuration supplied
        """
        self._database = database
        self._table = table
        self._stage_table = "_temp_" + table
        self._valid_write_modes = ["insert", "upsert", "overwrite"]
        if write_mode not in self._valid_write_modes:
            raise DataSetError(
                "Invalid write_mode provided: {invalid}. Write_mode must be one of {valid}"
                .format(invalid=write_mode, valid=self._valid_write_modes))
        self._write_mode = write_mode
        if self._write_mode == "upsert" and not table_pk:
            raise DataSetError(
                "table_pk must be set to utilise upsert read mode")
        self._table_pk = table_pk

        self._table_columns = self._load().columns if self._exists() else None

        if (self._table_pk and self._exists()
                and set(self._table_pk) - set(self._table_columns)):
            raise DataSetError(
                "columns [{colnames}] selected as PK not found in table {database}.{table}"
                .format(
                    colnames=", ".join(
                        sorted(set(self._table_pk) -
                               set(self._table_columns))),
                    database=self._database,
                    table=self._table,
                ))
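A usage sketch for the constructor above; the import path is an assumption, and a running Spark session with access to the Hive metastore is required because ``__init__`` probes the table via ``_exists()``:

# Hypothetical import path - adjust to your Kedro release.
from kedro.extras.datasets.spark import SparkHiveDataSet

# Upsert mode requires the primary-key columns.
data_set = SparkHiveDataSet(
    database="default",
    table="sales",
    write_mode="upsert",
    table_pk=["sale_id"],
)

# Both of these raise DataSetError during __init__:
#   SparkHiveDataSet(database="default", table="sales", write_mode="append")  # invalid mode
#   SparkHiveDataSet(database="default", table="sales", write_mode="upsert")  # missing table_pk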
    def _validate_save(self, data: DataFrame):
        hive_dtypes = set(self._load().dtypes)
        data_dtypes = set(data.dtypes)
        if data_dtypes != hive_dtypes:
            new_cols = data_dtypes - hive_dtypes
            missing_cols = hive_dtypes - data_dtypes
            raise DataSetError("dataset does not match hive table schema.\n"
                               "Present on insert only: {new_cols}\n"
                               "Present on schema only: {missing_cols}".format(
                                   new_cols=sorted(new_cols),
                                   missing_cols=sorted(missing_cols)))
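A worked illustration of the schema check above, using hypothetical Spark ``dtypes`` (sets of ``(column, type)`` pairs):

hive_dtypes = {("id", "int"), ("name", "string")}
data_dtypes = {("id", "int"), ("surname", "string")}

new_cols = data_dtypes - hive_dtypes      # {("surname", "string")}: only in the incoming data
missing_cols = hive_dtypes - data_dtypes  # {("name", "string")}: only in the Hive table
# Any difference between the two sets triggers the DataSetError above.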
Example #9
    def __init__(
        self,
        filepath: str,
    ):
        try:
            import pdfplumber
        except ModuleNotFoundError:
            raise DataSetError(
                "PDFDataSet requires pdfplumber to be installed.")

        self._filepath = filepath
    def _load(self) -> bytes:
        import zipfile
        with zipfile.ZipFile(self._filepath) as zipped:
            namelist = zipped.namelist()
            if self._zipped_filename_suffix is not None:
                namelist = [
                    name for name in namelist
                    if name.lower().endswith(self._zipped_filename_suffix)
                ]
            namelist = [
                name for name in namelist if not self._is_ignored(name)
            ]
            if len(namelist) > 1 and self._zipped_filename is None:
                raise DataSetError(
                    f'Multiple files found! Please specify which file to extract: {namelist}'
                )
            if not namelist:
                raise DataSetError('No files found in the archive!')

            target_filename = namelist[0]
            if self._zipped_filename is not None:
                target_filename = self._zipped_filename

            with zipped.open(target_filename,
                             pwd=self._password) as zipped_file:
                temp_unzipped_dir = tempfile.mkdtemp()
                temp_unzipped_filepath = os.path.join(temp_unzipped_dir,
                                                      "temp_file")
                with open(temp_unzipped_filepath, "wb") as temp_unzipped_file:
                    temp_unzipped_file.write(zipped_file.read())

                kwargs = deepcopy(self._dataset_config)
                kwargs[self._filepath_arg] = temp_unzipped_filepath
                dataset = self._dataset_type(**kwargs)
                data = dataset.load()
                os.remove(temp_unzipped_filepath)
                return data
Example #11
    def save(self, data: Any):
        """Calls save method of a shared MemoryDataSet in SyncManager."""
        try:
            self.shared_memory_dataset.save(data)
        except Exception as exc:  # pylint: disable=broad-except
            # Checks if the error is due to serialisation or not
            try:
                pickle.dumps(data)
            except Exception as serialisation_exc:  # pylint: disable=broad-except
                raise DataSetError(
                    f"{str(data.__class__)} cannot be serialized. ParallelRunner "
                    "implicit memory datasets can only be used with serializable data"
                ) from serialisation_exc
            else:
                raise exc
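The fallback above distinguishes "the data cannot be pickled" from other save failures. A standalone illustration of the pickling probe:

import pickle

try:
    pickle.dumps(lambda x: x)  # lambdas are not picklable
except Exception as exc:  # pylint: disable=broad-except
    print(f"unserialisable data would be reported via DataSetError: {exc}")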
    def run_id(self):
        """Get run id.

        If active run is not found, tries to find last experiment.

        Raise `DataSetError` exception if run id can't be found.

        Returns:
            str: String contains run_id.
        """
        if self._run_id is not None:
            return self._run_id
        run = mlflow.active_run()
        if run:
            return run.info.run_id
        raise DataSetError("Cannot find run id.")
Example #13
    def _load(self) -> List[PDFPage]:
        """
        Loads a list of PDFPage objects, with each index corresponding
        to the particular page that was loaded.
        :return: List of ``PDFPage`` objects, one per page of the PDF.
        """
        try:
            import pdfplumber
        except ModuleNotFoundError:
            raise DataSetError(
                "PDFDataSet requires pdfplumber to be installed.")

        with pdfplumber.open(self._filepath) as pdf:
            pages = []
            for page in pdf.pages:
                tbl = page.extract_tables()
                txt = page.extract_text()
                pages.append(PDFPage(txt, tbl))
            return pages
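A plain-pdfplumber sketch of the same loop outside the dataset; the file path is made up, and since ``PDFPage`` is the dataset's own container, text and tables are simply collected as tuples here:

import pdfplumber

with pdfplumber.open("data/01_raw/report.pdf") as pdf:
    pages = [(page.extract_text(), page.extract_tables()) for page in pdf.pages]
# pages[0] -> (text of page 1, list of tables found on page 1)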
    def __init__(
        self,
        table_name: str,
        view: str = '',
        credentials: Dict[str, str] = None,
    ):
        """

        :param table_name:
        :param credentials:
            base_id:
            api_key:
        """
        self._table_name = table_name
        self._view = view
        if credentials is None or 'api_key' not in credentials or 'base_id' not in credentials:
            raise DataSetError(
                'Credentials must be passed with "api_key" and "base_id" keys')
        self._api_key = credentials['api_key']
        self._base_id = credentials['base_id']
Example #15
    def _load(self) -> List[GitFileCommit]:
        try:
            git_command = [
                "git",
                "-C",
                self._filepath,
                "log",
                '--pretty=format:commit:%H,%cI,"%an",%ae,"%s"',
                "--numstat",
            ]

            if type(self._before) is datetime.date:
                git_command.append(f"--before={self._before}")
            if type(self._after) is datetime.date:
                git_command.append(f"--after={self._after}")

            raw_git_output = subprocess.check_output(git_command).decode("UTF8")
        except subprocess.CalledProcessError as e:
            raise DataSetError(e)

        return _parse_git_log_output(raw_git_output)
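The same ``git log`` invocation can be tried standalone; the repository path and date are made up:

import subprocess

git_command = [
    "git", "-C", "path/to/repo", "log",
    '--pretty=format:commit:%H,%cI,"%an",%ae,"%s"',
    "--numstat",
    "--before=2021-12-31",
]
raw_git_output = subprocess.check_output(git_command).decode("UTF8")
print(raw_git_output.splitlines()[:3])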
    def __init__(
        self,
        filepath: str,
        zipped_filename: str = None,
        zipped_filename_suffix: str = None,
        ignored_prefixes: List[str] = None,
        ignored_suffixes: List[str] = None,
        credentials: Dict[str, str] = None,
        dataset: Optional[Union[str, Type[AbstractDataSet], Dict[str,
                                                                 Any]]] = None,
        filepath_arg: str = 'filepath',
    ):

        if dataset is None:
            dataset = ZipFileDataSet.DEFAULT_DATASET

        dataset = dataset if isinstance(dataset, dict) else {"type": dataset}
        self._dataset_type, self._dataset_config = parse_dataset_definition(
            dataset)
        if VERSION_KEY in self._dataset_config:
            raise DataSetError(
                "`{}` does not support versioning of the underlying dataset. "
                "Please remove `{}` flag from the dataset definition.".format(
                    self.__class__.__name__, VERSIONED_FLAG_KEY))

        self._filepath_arg = filepath_arg
        if self._filepath_arg in self._dataset_config:
            warn(
                "`{}` key must not be specified in the dataset definition as it "
                "will be overwritten by partition path".format(
                    self._filepath_arg))

        self._filepath = filepath
        self._zipped_filename = zipped_filename
        self._zipped_filename_suffix = zipped_filename_suffix
        self._ignored_prefixes = ignored_prefixes or ['_', '.']
        self._ignored_suffixes = (ignored_suffixes or []) + ['/']
        credentials = credentials or {}
        self._password = credentials.get('password', credentials.get('pwd'))
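A usage sketch for the constructor above; the archive layout and the wrapped dataset type are assumptions:

# Hypothetical archive containing a single CSV file.
data_set = ZipFileDataSet(
    filepath="data/01_raw/archive.zip",
    zipped_filename="table.csv",
    dataset={"type": "pandas.CSVDataSet", "load_args": {"sep": ";"}},
)
df = data_set.load()  # extracts table.csv to a temp file, then loads it with CSVDataSet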
    def _load(self) -> str:
        raise DataSetError("Loading not supported for MatplotlibWriter")
Example #18
    def _load(self) -> None:
        raise DataSetError("Loading not supported for `{}`".format(
            self.__class__.__name__))
Example #19
    def _save(self, data: Any) -> None:
        raise DataSetError('Saving is unsupported')
Example #20
    def _get_table(slide: Slide) -> Table:
        for shape in slide.shapes:
            if shape.has_table:
                return shape.table
        raise DataSetError("slide does not contain a table")
Example #21
    def _save(self, data: Any) -> None:
        raise DataSetError('Save Unsupported')