def test_exists_raises_error(self, mocker):
    delta_ds = DeltaTableDataSet(filepath="")
    mocker.patch.object(
        delta_ds,
        "_get_spark",
        side_effect=AnalysisException("Other Exception", []),
    )
    with pytest.raises(DataSetError, match="Other Exception"):
        delta_ds.exists()
def test_exists_raises_error(self, mocker):
    # exists should raise all errors except for
    # AnalysisExceptions clearly indicating a missing file
    spark_data_set = SparkDataSet(filepath="")
    mocker.patch.object(
        spark_data_set,
        "_get_spark",
        side_effect=AnalysisException("Other Exception", []),
    )
    with pytest.raises(DataSetError, match="Other Exception"):
        spark_data_set.exists()
def _convert_delta_exception(e: "JavaObject") -> Optional[CapturedException]:
    """
    Convert Delta's Scala concurrent exceptions to the corresponding Python exceptions.
    """
    s: str = e.toString()
    c: "JavaObject" = e.getCause()
    jvm: "JVMView" = SparkContext._jvm  # type: ignore[attr-defined]
    gw = SparkContext._gateway  # type: ignore[attr-defined]
    stacktrace = jvm.org.apache.spark.util.Utils.exceptionString(e)

    # Temporary workaround until Delta Lake is upgraded to Spark 3.3
    # Below three exception handling cases are copied from
    # https://github.com/apache/spark/blob/master/python/pyspark/sql/utils.py#L156
    if is_instance_of(gw, e, "org.apache.spark.sql.catalyst.parser.ParseException"):
        return ParseException(s.split(': ', 1)[1], stacktrace, c)
    # Order matters. ParseException inherits AnalysisException.
    if is_instance_of(gw, e, "org.apache.spark.sql.AnalysisException"):
        return AnalysisException(s.split(': ', 1)[1], stacktrace, c)
    if is_instance_of(gw, e, "java.lang.IllegalArgumentException"):
        return IllegalArgumentException(s.split(': ', 1)[1], stacktrace, c)

    if s.startswith('io.delta.exceptions.DeltaConcurrentModificationException: '):
        return DeltaConcurrentModificationException(s.split(': ', 1)[1], stacktrace, c)
    if s.startswith('io.delta.exceptions.ConcurrentWriteException: '):
        return ConcurrentWriteException(s.split(': ', 1)[1], stacktrace, c)
    if s.startswith('io.delta.exceptions.MetadataChangedException: '):
        return MetadataChangedException(s.split(': ', 1)[1], stacktrace, c)
    if s.startswith('io.delta.exceptions.ProtocolChangedException: '):
        return ProtocolChangedException(s.split(': ', 1)[1], stacktrace, c)
    if s.startswith('io.delta.exceptions.ConcurrentAppendException: '):
        return ConcurrentAppendException(s.split(': ', 1)[1], stacktrace, c)
    if s.startswith('io.delta.exceptions.ConcurrentDeleteReadException: '):
        return ConcurrentDeleteReadException(s.split(': ', 1)[1], stacktrace, c)
    if s.startswith('io.delta.exceptions.ConcurrentDeleteDeleteException: '):
        return ConcurrentDeleteDeleteException(s.split(': ', 1)[1], stacktrace, c)
    if s.startswith('io.delta.exceptions.ConcurrentTransactionException: '):
        return ConcurrentTransactionException(s.split(': ', 1)[1], stacktrace, c)
    return None
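# A minimal sketch of how _convert_delta_exception could be applied. The decorator below
# is hypothetical (not part of the original module): it catches Py4J errors raised by a
# JVM call and re-raises them as the converted Python exception when one is available.
import functools

from py4j.protocol import Py4JJavaError


def convert_delta_exceptions(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Py4JJavaError as err:
            converted = _convert_delta_exception(err.java_exception)
            if converted is not None:
                # Surface the Delta/Spark exception as its Python counterpart.
                raise converted from None
            raise
    return wrapper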
def get_data_frame_count_type_of_topic(data_frame: DataFrame) -> pb.DataFrame:
    """
    From all the data, take the TopicID and Question columns and, for each topic,
    count the number of different SubTopic/Question values.

    :param data_frame: generated with PySpark, containing all the data from the CSV file
    :return: data frame of the pandas package
    """
    try:
        data_frame = data_frame \
            .select("TopicID", "Question") \
            .distinct() \
            .groupBy("TopicID") \
            .count() \
            .sort("TopicID")
    except Py4JError:
        raise AnalysisException('One of the columns is incorrect')
    print("The following table represents the number of question types for each topic")
    data_frame.show()
    data_frame_pandas = data_frame.toPandas()
    return data_frame_pandas
def read_csv_with_data_frame(file_csv: str) -> DataFrame:
    """
    Read a CSV file as a data frame with Spark.

    :param file_csv: file name of the CSV
    :return: all the data of the file as a data frame
    """
    spark_session = SparkSession \
        .builder \
        .getOrCreate()
    logger = spark_session._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.WARN)
    try:
        data_frame = spark_session \
            .read \
            .format("csv") \
            .options(header='true', inferschema='true') \
            .load(file_csv)
    except Py4JError:
        raise AnalysisException('There is no CSV file at: ' + str(file_csv))
    return data_frame
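# A minimal usage sketch combining the two functions above. The file name "topics.csv"
# and its "TopicID"/"Question" columns are assumptions for illustration only.
if __name__ == "__main__":
    raw_df = read_csv_with_data_frame("topics.csv")
    counts_pandas = get_data_frame_count_type_of_topic(raw_df)
    print(counts_pandas.head())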
def faulty_get_spark():
    raise AnalysisException("Other Exception", [])
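# A sketch of how faulty_get_spark might be wired into a test. It assumes a
# SparkDataSet/DataSetError pair as in the tests above; the use of pytest's
# monkeypatch fixture here is an assumption for illustration, not the original wiring.
def test_exists_reraises_other_analysis_exception(self, monkeypatch):
    spark_data_set = SparkDataSet(filepath="")
    monkeypatch.setattr(spark_data_set, "_get_spark", faulty_get_spark)
    with pytest.raises(DataSetError, match="Other Exception"):
        spark_data_set.exists()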