    def test_exists_raises_error(self, mocker):
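        # exists should wrap an unexpected AnalysisException in a DataSetError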
        delta_ds = DeltaTableDataSet(filepath="")
        mocker.patch.object(delta_ds,
                            "_get_spark",
                            side_effect=AnalysisException(
                                "Other Exception", []))

        with pytest.raises(DataSetError, match="Other Exception"):
            delta_ds.exists()
Example 2
    def test_exists_raises_error(self, mocker):
        # exists should raise all errors except for
        # AnalysisExceptions clearly indicating a missing file
        spark_data_set = SparkDataSet(filepath="")
        mocker.patch.object(
            spark_data_set,
            "_get_spark",
            side_effect=AnalysisException("Other Exception", []),
        )

        with pytest.raises(DataSetError, match="Other Exception"):
            spark_data_set.exists()
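Both tests pin down the same contract: exists() should treat only an AnalysisException that clearly signals a missing path as "file absent", and wrap anything else in a DataSetError. Below is a minimal sketch of that contract, not Kedro's actual implementation; the _exists_sketch name, the _filepath attribute, and the message check are assumptions:

from kedro.io import DataSetError
from pyspark.sql.utils import AnalysisException


def _exists_sketch(data_set) -> bool:
    # Probe the path; a "missing path" error simply means "not there".
    try:
        data_set._get_spark().read.load(data_set._filepath)  # _filepath is assumed
    except AnalysisException as exc:
        if "Path does not exist" in str(exc):
            return False
        # Any other analysis error surfaces as a DataSetError, which is
        # exactly what both tests assert with pytest.raises.
        raise DataSetError(str(exc)) from exc
    return True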
Example 3
def _convert_delta_exception(e: "JavaObject") -> Optional[CapturedException]:
    """
    Convert Delta's Scala concurrent exceptions to the corresponding Python exceptions.
    """
    s: str = e.toString()
    c: "JavaObject" = e.getCause()

    jvm: "JVMView" = SparkContext._jvm  # type: ignore[attr-defined]
    gw = SparkContext._gateway  # type: ignore[attr-defined]
    stacktrace = jvm.org.apache.spark.util.Utils.exceptionString(e)

    # Temporary workaround until Delta Lake is upgraded to Spark 3.3
    # Below three exception handling cases are copied from
    # https://github.com/apache/spark/blob/master/python/pyspark/sql/utils.py#L156
    if is_instance_of(gw, e,
                      "org.apache.spark.sql.catalyst.parser.ParseException"):
        return ParseException(s.split(': ', 1)[1], stacktrace, c)
    # Order matters. ParseException inherits AnalysisException.
    if is_instance_of(gw, e, "org.apache.spark.sql.AnalysisException"):
        return AnalysisException(s.split(': ', 1)[1], stacktrace, c)
    if is_instance_of(gw, e, "java.lang.IllegalArgumentException"):
        return IllegalArgumentException(s.split(': ', 1)[1], stacktrace, c)

    if s.startswith(
            'io.delta.exceptions.DeltaConcurrentModificationException: '):
        return DeltaConcurrentModificationException(
            s.split(': ', 1)[1], stacktrace, c)
    if s.startswith('io.delta.exceptions.ConcurrentWriteException: '):
        return ConcurrentWriteException(s.split(': ', 1)[1], stacktrace, c)
    if s.startswith('io.delta.exceptions.MetadataChangedException: '):
        return MetadataChangedException(s.split(': ', 1)[1], stacktrace, c)
    if s.startswith('io.delta.exceptions.ProtocolChangedException: '):
        return ProtocolChangedException(s.split(': ', 1)[1], stacktrace, c)
    if s.startswith('io.delta.exceptions.ConcurrentAppendException: '):
        return ConcurrentAppendException(s.split(': ', 1)[1], stacktrace, c)
    if s.startswith('io.delta.exceptions.ConcurrentDeleteReadException: '):
        return ConcurrentDeleteReadException(
            s.split(': ', 1)[1], stacktrace, c)
    if s.startswith('io.delta.exceptions.ConcurrentDeleteDeleteException: '):
        return ConcurrentDeleteDeleteException(
            s.split(': ', 1)[1], stacktrace, c)
    if s.startswith('io.delta.exceptions.ConcurrentTransactionException: '):
        return ConcurrentTransactionException(
            s.split(': ', 1)[1], stacktrace, c)
    return None
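A hypothetical caller, sketching how a converter like this is typically used: invoke the py4j-backed operation, and if the JVM raises, try to map the underlying Java exception to its Python counterpart before giving up. run_with_delta_exceptions is an assumed name, not part of Delta's API:

from py4j.protocol import Py4JJavaError


def run_with_delta_exceptions(java_call):
    try:
        return java_call()
    except Py4JJavaError as err:
        # err.java_exception is the raw JVM exception object.
        converted = _convert_delta_exception(err.java_exception)
        if converted is not None:
            raise converted from None
        raise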
Example 4
def get_data_frame_count_type_of_topic(data_frame: DataFrame) -> pb.DataFrame:
    """
    From all the data, take the TopicID and Question columns and, for each
    topic, count the number of distinct questions.
    :param data_frame: DataFrame generated with PySpark, containing all the data from the CSV file
    :return: DataFrame from the pandas package
    """
    try:
        data_frame = data_frame \
            .select("TopicID", "Question") \
            .distinct() \
            .groupBy("TopicID") \
            .count() \
            .sort("TopicID")
    except Py4JError as err:
        raise AnalysisException('One of the columns is incorrect') from err
    print("The following table represents the number of distinct questions for each topic")
    data_frame.show()
    data_frame_pandas = data_frame.toPandas()
    return data_frame_pandas
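Hypothetical usage with made-up sample rows, assuming an active SparkSession:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, "What is Spark?"), (1, "What is Delta?"), (2, "What is pandas?")],
    ["TopicID", "Question"],
)
pandas_df = get_data_frame_count_type_of_topic(df)
# TopicID 1 has 2 distinct questions, TopicID 2 has 1.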
Example 5
def read_csv_with_data_frame(file_csv: str) -> DataFrame:
    """
    Read a CSV file as a Spark DataFrame.
    :param file_csv: name of the CSV file
    :return: all the data of the file as a DataFrame
    """
    spark_session = SparkSession \
        .builder \
        .getOrCreate()

    logger = spark_session._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.WARN)

    try:
        data_frame = spark_session\
            .read\
            .format("csv") \
            .options(header='true', inferschema='true')\
            .load(file_csv)
    except Py4JError as err:
        raise AnalysisException('There is no CSV file at: ' + file_csv) from err

    return data_frame
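Hypothetical usage; "questions.csv" is an assumed file name, not one from the original code:

df = read_csv_with_data_frame("questions.csv")
df.printSchema()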
Example 6
def faulty_get_spark():
    raise AnalysisException("Other Exception", [])
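A stub like this pairs naturally with pytest's built-in monkeypatch fixture, as an alternative to mocker.patch.object. A sketch mirroring Example 2, with SparkDataSet and DataSetError as in the earlier examples:

def test_exists_raises_error(monkeypatch):
    spark_data_set = SparkDataSet(filepath="")
    # Swap the real _get_spark for the faulty stub so exists() hits the error path.
    monkeypatch.setattr(spark_data_set, "_get_spark", faulty_get_spark)

    with pytest.raises(DataSetError, match="Other Exception"):
        spark_data_set.exists()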