def process_json(self):
    # schema to parse both Q and T type records
    common_event = StructType() \
        .add("trade_dt", DateType(), True) \
        .add("rec_type", StringType(), True) \
        .add("symbol", StringType(), True) \
        .add("exchange", StringType(), True) \
        .add("event_tm", TimestampType(), True) \
        .add("event_seq_nb", IntegerType(), True) \
        .add("arrival_tm", TimestampType(), True) \
        .add("trade_pr", DecimalType(17, 14), True) \
        .add("bid_pr", DecimalType(17, 14), True) \
        .add("bid_size", IntegerType(), True) \
        .add("ask_pr", DecimalType(17, 14), True) \
        .add("ask_size", IntegerType(), True) \
        .add("partition", StringType(), True)

    # spark = SparkSession.builder.master('local').\
    #     appName('app').getOrCreate()

    raw = self.spark.sparkContext.textFile(
        self._load_path + "/json/*/NASDAQ/*.txt")
    parse_json = self.parse_json
    parsed = raw.map(lambda line: parse_json(line))
    data_json = self.spark.createDataFrame(parsed, schema=common_event)
    data_json.show(truncate=False)

    # Save the final dataframe as parquet files in partitions
    data_json.write.partitionBy("partition").mode("append").parquet(
        self._save_path)

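# The parse_json helper used above is not part of this snippet. A minimal sketch of
# what it might look like, assuming each line is a JSON object whose keys mirror the
# common_event field names and that unparseable lines go to a bad-record partition
# (the key names and the "B" partition value are assumptions, not the original code):
import json
import datetime
from decimal import Decimal

def parse_json(line):
    try:
        r = json.loads(line)
        return (
            datetime.date.fromisoformat(r["trade_dt"]),
            r["rec_type"],
            r["symbol"],
            r["exchange"],
            datetime.datetime.fromisoformat(r["event_tm"]),
            int(r["event_seq_nb"]),
            datetime.datetime.fromisoformat(r["arrival_tm"]),
            Decimal(r["trade_pr"]) if r.get("trade_pr") is not None else None,
            Decimal(r["bid_pr"]) if r.get("bid_pr") is not None else None,
            int(r["bid_size"]) if r.get("bid_size") is not None else None,
            Decimal(r["ask_pr"]) if r.get("ask_pr") is not None else None,
            int(r["ask_size"]) if r.get("ask_size") is not None else None,
            r["rec_type"],  # reused as the partition column (T or Q)
        )
    except Exception:
        # route bad records to a "B" partition with null fields
        return (None,) * 12 + ("B",)
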
def schema(self):
    return StructType(fields=[
        StructField(name='date', dataType=StringType(), nullable=False),
        StructField(name='revenue', dataType=DecimalType(17, 2), nullable=False),
        StructField(name='new_revenue_percentage', dataType=DecimalType(5, 2), nullable=False),
        StructField(name='revenue_from_new_customers', dataType=DecimalType(17, 2), nullable=False)
    ])

def test_decimal_type(self):
    t1 = DecimalType()
    t2 = DecimalType(10, 2)
    self.assertTrue(t2 is not t1)
    self.assertNotEqual(t1, t2)
    t3 = DecimalType(8)
    self.assertNotEqual(t2, t3)

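# For context: DecimalType() defaults to precision 10 and scale 0, and equality is
# based on both parameters, which is what the assertions above rely on. A quick
# illustration (assumes only that pyspark is installed):
from pyspark.sql.types import DecimalType

assert DecimalType().precision == 10 and DecimalType().scale == 0
assert DecimalType(10, 2) != DecimalType(8)   # DecimalType(8) is decimal(8,0)
assert DecimalType(10, 0) == DecimalType()    # equal parameters compare equal
assert DecimalType(10, 2).simpleString() == "decimal(10,2)"
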
def process_csv(self):
    # schema to parse both Q and T type records
    common_event = StructType() \
        .add("trade_dt", DateType(), True) \
        .add("rec_type", StringType(), True) \
        .add("symbol", StringType(), True) \
        .add("exchange", StringType(), True) \
        .add("event_tm", TimestampType(), True) \
        .add("event_seq_nb", IntegerType(), True) \
        .add("arrival_tm", TimestampType(), True) \
        .add("trade_pr", DecimalType(17, 14), True) \
        .add("bid_pr", DecimalType(17, 14), True) \
        .add("bid_size", IntegerType(), True) \
        .add("ask_pr", DecimalType(17, 14), True) \
        .add("ask_size", IntegerType(), True) \
        .add("partition", StringType(), True)

    spark = SparkSession.builder.master('local').\
        appName('app').getOrCreate()

    raw = spark.sparkContext.\
        textFile(self._load_path + "/csv/*/NYSE/*.txt")

    # Parse each line with the parse_csv function to get the RDD into the proper format.
    parsed = raw.map(lambda line: self.parse_csv(line))
    data_csv = spark.createDataFrame(parsed, schema=common_event)
    data_csv.show(truncate=False)

    # Save the final dataframe as parquet files in partitions
    data_csv.write.partitionBy("partition").mode("append").parquet(
        self._save_path)

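# As with parse_json above, the parse_csv helper is not shown. A minimal sketch,
# assuming comma-separated lines where the record type distinguishes trade (T) and
# quote (Q) rows (the column positions and the "B" bad-record partition are
# assumptions, not the original implementation):
import datetime
from decimal import Decimal

def parse_csv(line):
    fields = line.split(",")
    try:
        rec_type = fields[2]
        common = (
            datetime.date.fromisoformat(fields[0]),        # trade_dt
            rec_type,                                       # rec_type
            fields[3],                                      # symbol
            fields[6],                                      # exchange
            datetime.datetime.fromisoformat(fields[4]),     # event_tm
            int(fields[5]),                                 # event_seq_nb
            datetime.datetime.fromisoformat(fields[1]),     # arrival_tm
        )
        if rec_type == "T":
            return common + (Decimal(fields[7]), None, None, None, None, "T")
        if rec_type == "Q":
            return common + (None, Decimal(fields[7]), int(fields[8]),
                             Decimal(fields[9]), int(fields[10]), "Q")
        raise ValueError("unknown record type")
    except Exception:
        return (None,) * 12 + ("B",)
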
def revenue(spark_session):
    schema = StructType(fields=[
        StructField(name='date', dataType=StringType(), nullable=False),
        StructField(name='revenue', dataType=DecimalType(17, 2), nullable=False),
        StructField(name='new_revenue_percentage', dataType=DecimalType(5, 2), nullable=False),
        StructField(name='revenue_from_new_customers', dataType=DecimalType(17, 2), nullable=False)
    ])
    R = namedtuple('R', [
        'date', 'revenue', 'new_revenue_percentage', 'revenue_from_new_customers'
    ])
    return spark_session.createDataFrame(data=[
        R(date='2020-06', revenue=Decimal('121.00'),
          new_revenue_percentage=Decimal('0.00'),
          revenue_from_new_customers=Decimal('0.00')),
        R(date='2020-05', revenue=Decimal('123.66'),
          new_revenue_percentage=Decimal('33.82'),
          revenue_from_new_customers=Decimal('41.82')),
        R(date='2020-04', revenue=Decimal('40.80'),
          new_revenue_percentage=Decimal('100.00'),
          revenue_from_new_customers=Decimal('40.80'))
    ], schema=schema)

def test_parse_datatype_string(self):
    from pyspark.sql.types import _all_atomic_types, _parse_datatype_string
    for k, t in _all_atomic_types.items():
        if t != NullType:
            self.assertEqual(t(), _parse_datatype_string(k))
    self.assertEqual(IntegerType(), _parse_datatype_string("int"))
    self.assertEqual(DecimalType(1, 1), _parse_datatype_string("decimal(1 ,1)"))
    self.assertEqual(DecimalType(10, 1), _parse_datatype_string("decimal( 10,1 )"))
    self.assertEqual(DecimalType(11, 1), _parse_datatype_string("decimal(11,1)"))
    self.assertEqual(ArrayType(IntegerType()), _parse_datatype_string("array<int >"))
    self.assertEqual(MapType(IntegerType(), DoubleType()),
                     _parse_datatype_string("map< int, double >"))
    self.assertEqual(
        StructType([
            StructField("a", IntegerType()),
            StructField("c", DoubleType())
        ]),
        _parse_datatype_string("struct<a:int, c:double >"))
    self.assertEqual(
        StructType([
            StructField("a", IntegerType()),
            StructField("c", DoubleType())
        ]),
        _parse_datatype_string("a:int, c:double"))
    self.assertEqual(
        StructType([
            StructField("a", IntegerType()),
            StructField("c", DoubleType())
        ]),
        _parse_datatype_string("a INT, c DOUBLE"))

def computaBalances(sc, sqlContext, unionDFBalances, df_FondoCuenta, DictBalances):
    # ListaDfBalances = DictBalances["df"]
    Lista_N_GES = DictBalances["N_GES"]
    dfsBalances = creaDFSparkBalances(sqlContext, Lista_N_GES, DictBalances,
                                      df_FondoCuenta)

    columns_to_dropUnionBalances = ['EXIS CED', 'DIFERENCIA']
    unionDFBalances = unionDFBalances.drop(*columns_to_dropUnionBalances)
    unionDFBalances = unionDFBalances.join(dfsBalances, on='N_GES', how='inner')

    def eligeEnTabla(epigrafe, col1, col2, col3, col4):
        if epigrafe == "Inversión normal":
            return col1
        if epigrafe == "Contencioso":
            return col2
        if epigrafe == "Amortizado":
            return col3
        if epigrafe == "Impagado":
            return col4

    udfeligeEnTabla = udf(eligeEnTabla, DecimalType(15, 3))
    unionDFBalances = unionDFBalances.withColumn(
        "CONT GES",
        udfeligeEnTabla("Epígrafe", "InvNorm", "Content", "Amortiz", "Impagado"))

    def calculaDif(exisges, partced):
        return exisges + partced

    udfcalculaDif = udf(calculaDif, DecimalType(15, 3))
    unionDFBalances = unionDFBalances.withColumn(
        "DIFERENCIA", udfcalculaDif("EXIS GES", "CONT GES"))

    unionDFBalances = unionDFBalances.select([
        "Fecha CED", "Fecha GES", "N_GES", "Nombre_abreviado", "Epígrafe",
        "EXIS GES", "CONT GES", "DIFERENCIA"
    ])

    # Drop the rows whose Epígrafe is "Amortizado" (written off) or "Impagado" (unpaid)
    unionDFBalances = unionDFBalances \
        .filter(unionDFBalances["Epígrafe"] != 'Amortizado') \
        .filter(unionDFBalances["Epígrafe"] != 'Impagado')

    return unionDFBalances

def load_events_schema(self):
    self.events_schema = StructType(fields=[
        StructField('type', IntegerType(), False),
        StructField('amount', DecimalType(), False),
        StructField('oldbalanceOrg', DecimalType(), False),
        StructField('newbalanceOrig', DecimalType(), False),
        StructField('oldbalanceDest', DecimalType(), False),
        StructField('newbalanceDest', DecimalType(), False)
    ])

def test_treats_money_with_appropriate_precision(self):
    actual_schema = self.test_processor._clean_movies(
        self.test_movies).select('budget', 'revenue').schema
    expected_schema = StructType([
        StructField('budget', DecimalType(15, 4), True),
        StructField('revenue', DecimalType(15, 4), True),
    ])
    self.assertEqual(actual_schema, expected_schema)

def get_dataframe(filename='data/GBPUSD-2018-12-tick.csv'):
    # tick data columns: symbol, time, bid, ask
    schema = StructType([
        StructField("symbol", StringType(), True),
        StructField("time", StringType(), True),
        StructField("bid", DecimalType(precision=10, scale=5), True),
        StructField("ask", DecimalType(precision=10, scale=5), True)
    ])
    # df = spark.read.csv('GBPUSD-2016-11-tick.csv', header='False', schema=schema)
    df = sqlContext.read.csv(filename, header='False', schema=schema)
    return df

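# A hedged usage sketch: reading the ticks and deriving a mid price and spread at
# the same decimal precision. The sqlContext handle and file path come from the
# snippet above; the derived column names are illustrative only.
from pyspark.sql.functions import col
from pyspark.sql.types import DecimalType

ticks = get_dataframe('data/GBPUSD-2018-12-tick.csv')
ticks = ticks \
    .withColumn("mid", ((col("bid") + col("ask")) / 2).cast(DecimalType(10, 5))) \
    .withColumn("spread", (col("ask") - col("bid")).cast(DecimalType(10, 5)))
ticks.show(5, truncate=False)
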
def test_convert_strings_to_types(spark_session: SparkSession):
    df = spark_session.createDataFrame([
        ("1", [1, 2, 3], "19-02-2020", "19-02-2020 00:00:00", "true", "1",
         "0.5", "123534627458685341", "123534627458685341"),
        ("2", [1, 2, 3], "19-02-2020", "19-02-2020 00:00:00", "false", "-1",
         "0.7891412462571346431", "234735684679046827457", "234735684679046827457"),
        ("3", [1, 2, 3], "19-02-2020", "19-02-2020 00:00:00", "True", "0",
         PI_STRING, PI_STRING, PI_STRING),
        ("4", [1, 2, 3], "19-02-2020", "19-02-2020 00:00:00", "not_true", "2147483648",
         "42", "-143763583573461346.0368479672", "-143763583573461346.0368479672"),
        ("5", [1, 2, 3], None, None, None, None, None, None, None),
    ], [
        "id", "array", "date", "timestamp", "boolean", "integer", "double",
        "decimal(38,0)", "decimal(24,5)"
    ])
    transformer = StringTypeTransformer()
    expected_schema = StructType([
        StructField("id", StringType()),
        StructField("array", ArrayType(LongType())),
        StructField("date", DateType()),
        StructField("timestamp", TimestampType()),
        StructField("boolean", BooleanType(), nullable=False),
        StructField("integer", IntegerType()),
        StructField("double", DoubleType()),
        StructField("decimal(38,0)", DecimalType(38, 0)),
        StructField("decimal(24,5)", DecimalType(24, 5))
    ])
    transformed = transformer.transform_dataframe(df, expected_schema)
    assert transformed.schema == expected_schema
    """
    I guess there couldn't be a dataframe with a different set of columns for each
    row. Therefore, this assertion is wrong.
    """
    assert transformed.toJSON().collect() == [
        '{"id":"1","array":[1,2,3],"date":"2020-02-19","timestamp":"2020-02-19T00:00:00.000+03:00","boolean":true,'
        '"integer":1,"double":0.5,"decimal(38,0)":123534627458685341,"decimal(24,5)":123534627458685341.00000}',
        '{"id":"2","array":[1,2,3],"date":"2020-02-19","timestamp":"2020-02-19T00:00:00.000+03:00","boolean":false,'
        '"integer":-1,"double":0.7891412462571347,"decimal(38,0)":234735684679046827457}',
        '{"id":"3","array":[1,2,3],"date":"2020-02-19","timestamp":"2020-02-19T00:00:00.000+03:00","boolean":true,'
        '"integer":0,"double":3.141592653589793,"decimal(38,0)":3,"decimal(24,5)":3.14159}',
        '{"id":"4","array":[1,2,3],"date":"2020-02-19","timestamp":"2020-02-19T00:00:00.000+03:00","boolean":false,'
        '"double":42.0,"decimal(38,0)":-143763583573461346,"decimal(24,5)":-143763583573461346.03685}',
        '{"id":"5","array":[1,2,3],"boolean":false}'
    ]

def creaDFSparkBalances(sqlContext, Lista_N_GES, DictBalances, df_FondoCuenta):
    Datos = []
    i = 0
    # Keep only the sheet names that are purely numeric
    Lista_N_GES = list(filter(lambda x: x.isnumeric(), Lista_N_GES))
    for N_GES in Lista_N_GES:
        N_GESnumber = int(str(N_GES))
        if str(N_GESnumber) in df_FondoCuenta["N_GES"].to_list():
            dato1 = sumaInverNormalBalance(N_GESnumber, DictBalances["df"][i],
                                           df_FondoCuenta)
            dato2 = sumaContensioso(N_GESnumber, DictBalances["df"][i],
                                    df_FondoCuenta)
        else:
            print("The fund:", N_GES,
                  "read from the Balances file is not present in fondo balances")
            dato1 = 0.0
            dato2 = 0.0
        dato3 = 0.0
        dato4 = 0.0
        dato0 = N_GESnumber
        Datos.append([dato0, dato1, dato2, dato3, dato4])
        i = i + 1

    # Build the Spark DataFrame for DictBalances
    field = [
        StructField("N_GES", IntegerType(), True),
        StructField("InvNorm", StringType(), True),
        StructField("Content", StringType(), True),
        StructField("Amortiz", StringType(), True),
        StructField("Impagado", StringType(), True),
    ]
    dfoutput = sqlContext.createDataFrame(Datos, StructType(field))
    dfoutput = dfoutput.withColumn("N_GES", dfoutput["N_GES"].cast(IntegerType())) \
        .withColumn("InvNorm", dfoutput["InvNorm"].cast(DecimalType(15, 3))) \
        .withColumn("Content", dfoutput["Content"].cast(DecimalType(15, 3))) \
        .withColumn("Amortiz", dfoutput["Amortiz"].cast(DecimalType(15, 3))) \
        .withColumn("Impagado", dfoutput["Impagado"].cast(DecimalType(15, 3)))
    return dfoutput

def passed_temperature_analyse(filename):
    print("begin to analyse passed temperature")
    spark = SparkSession.builder.master("local") \
        .appName("passed_temperature_analyse").getOrCreate()
    df = spark.read.csv(filename, header=True)
    df_temperature = df.select(  # keep only the columns we need
        df['province'],
        df['city_name'],
        df['city_code'],
        df['temperature'].cast(DecimalType(scale=1)),
        F.date_format(df['time'], "yyyy-MM-dd").alias("date"),  # extract the date
        F.hour(df['time']).alias("hour")  # extract the hour
    )
    # keep only the four observation hours (02, 08, 12, 20)
    df_4point_temperature = df_temperature.filter(
        df_temperature['hour'].isin([2, 8, 12, 20]))
    # df_4point_temperature.printSchema()
    df_avg_temperature = df_4point_temperature \
        .groupBy("province", "city_name", "city_code", "date") \
        .agg(F.count("temperature"), F.avg("temperature").alias("avg_temperature")) \
        .filter("count(temperature) = 4") \
        .sort(F.asc("avg_temperature")) \
        .select("province", "city_name", "city_code", "date",
                F.format_number('avg_temperature', 1).alias("avg_temperature"))
    df_avg_temperature.cache()
    avg_temperature_list = df_avg_temperature.collect()
    df_avg_temperature.coalesce(1).write.json(
        "file:///F:/Code_All/Jupyter_Code/spark_test/result_data/bigData/passed_rain_temperature.json")
    print("end analysing passed temperature")
    return avg_temperature_list[0:10]

def get_avg_time_per_difficulty(self, df, easy_threshold, medium_threshold, hard_threshold):
    """
    Calculate the average cooking time per difficulty level.

    :param df: DataFrame to be processed
    :param easy_threshold: Integer easy threshold
    :param medium_threshold: Integer medium threshold
    :param hard_threshold: Integer hard threshold
    :return: transformed DataFrame
    """
    df_filtered_min = df \
        .withColumn('cookTime_in_Min', tominutes(col('cookTime')).cast(IntegerType())) \
        .withColumn('prepTime_in_Min', tominutes(col('prepTime')).cast(IntegerType()))
    df_filtered_with_time = df_filtered_min \
        .withColumn("total_time",
                    (col("cookTime_in_Min") + col("prepTime_in_Min")).cast(IntegerType())) \
        .withColumn("difficulty",
                    when(col("total_time") < easy_threshold, 'easy')
                    .when((col("total_time") >= easy_threshold) &
                          (col("total_time") <= medium_threshold), 'medium')
                    .when(col("total_time") > hard_threshold, 'hard')
                    .cast(StringType()))
    result_df = df_filtered_with_time.groupBy(col('difficulty')) \
        .agg(avg(col('total_time')).cast(DecimalType(38, 2)).alias("avg_total_cooking_time"))
    return result_df

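# The tominutes helper used above is not included in this snippet. A minimal sketch,
# assuming cookTime/prepTime hold ISO-8601 durations such as "PT1H30M" (the regex and
# the null handling are assumptions, not the original implementation):
import re
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

@udf(returnType=IntegerType())
def tominutes(duration):
    # convert an ISO-8601 duration string to whole minutes
    if duration is None:
        return None
    match = re.fullmatch(r"PT(?:(\d+)H)?(?:(\d+)M)?", duration)
    if not match:
        return None
    hours = int(match.group(1) or 0)
    minutes = int(match.group(2) or 0)
    return hours * 60 + minutes
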
def _confidence_degree_agg(flat_table: FlatTable, **kwargs) -> PDDataFrame:
    if kwargs.get("group_by_cols", None):
        group_by_cols = kwargs.get("group_by_cols")
    elif flat_table.name == "MCO":
        group_by_cols = frozenset(["ETA_NUM", "RSA_NUM"])
    elif flat_table.name == "MCO_CE":
        group_by_cols = frozenset(["ETA_NUM", "SEQ_NUM"])
    else:
        group_by_cols = None
    col = when(max("count") != 0, min("count") / max("count") * 100).otherwise(0)
    df = (reduce(
        lambda a, b: a.union(b),
        [
            _union_not_null_cols_count(
                flat_table,
                flat_table.single_tables[single_table_name],
                nick_name,
                col_name,
                group_by_cols,
            )
            for single_table_name, cols in _CNAM_COLS_MAPPING[flat_table.name].items()
            for nick_name, col_name in cols.items()
        ],
    ).groupBy("ColName").agg(col.cast(DecimalType(32, 2)).alias("ConfidenceDegree")))
    return df.toPandas()

def test_data_type_ops(self):
    _mock_spark_type = DataType()
    _mock_dtype = ExtensionDtype()
    _mappings = (
        (CategoricalDtype(), _mock_spark_type, CategoricalOps),
        (_mock_dtype, DecimalType(), DecimalOps),
        (_mock_dtype, FractionalType(), FractionalOps),
        (_mock_dtype, IntegralType(), IntegralOps),
        (_mock_dtype, StringType(), StringOps),
        (_mock_dtype, BooleanType(), BooleanOps),
        (_mock_dtype, TimestampType(), DatetimeOps),
        (_mock_dtype, TimestampNTZType(), DatetimeNTZOps),
        (_mock_dtype, DateType(), DateOps),
        (_mock_dtype, DayTimeIntervalType(), TimedeltaOps),
        (_mock_dtype, BinaryType(), BinaryOps),
        (_mock_dtype, ArrayType(StringType()), ArrayOps),
        (_mock_dtype, MapType(StringType(), IntegralType()), MapOps),
        (_mock_dtype, StructType(), StructOps),
        (_mock_dtype, NullType(), NullOps),
        (_mock_dtype, UserDefinedType(), UDTOps),
    )
    for _dtype, _spark_type, _ops in _mappings:
        self.assertIsInstance(DataTypeOps(_dtype, _spark_type), _ops)

    _unknow_spark_type = _mock_spark_type
    self.assertRaises(TypeError, DataTypeOps, BooleanType(), _unknow_spark_type)

def CambiaColumnas_A_Double(DF, ListaColumnas):
    # Cast the selected columns to DecimalType(15, 3).
    # Note: each cast must be applied to the accumulated DataFrame; applying it to the
    # original DF on every iteration would leave only the last column converted.
    DF_Nuevo = DF
    for Columna in ListaColumnas:
        DF_Nuevo = DF_Nuevo.withColumn(Columna, DF_Nuevo[Columna].cast(DecimalType(15, 3)))
    # print("Successfully changed the columns to decimal\n")
    return DF_Nuevo

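# A hedged usage sketch of the function above (the sqlContext handle, sample data and
# column names are illustrative only):
df = sqlContext.createDataFrame(
    [("A", "1.250", "2.500"), ("B", "3.125", "4.750")],
    ["N_GES", "InvNorm", "Content"])
df_decimal = CambiaColumnas_A_Double(df, ["InvNorm", "Content"])
df_decimal.printSchema()  # InvNorm and Content are now decimal(15,3)
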
def getPercentage(DataSet, Column):
    Total = DataSet.count()
    Stats = getFreq(DataSet, Column) \
        .withColumn(PERCENT_STAT,
                    F.expr("(" + FREQ_STAT + " * 100)/" + str(Total))
                    .cast(DecimalType(10, 2)))
    return Stats

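# getFreq, FREQ_STAT and PERCENT_STAT are defined elsewhere in that module. A minimal
# sketch of what they might look like, assuming FREQ_STAT is a per-value frequency
# count (the names and the groupBy are assumptions, not the original code):
from pyspark.sql import functions as F

FREQ_STAT = "frequency"
PERCENT_STAT = "percentage"

def getFreq(DataSet, Column):
    # count occurrences of each distinct value in the column
    return DataSet.groupBy(Column).agg(F.count("*").alias(FREQ_STAT))
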
class DataType(Enum):
    """Holds constants for data types within Butterfree."""

    TIMESTAMP = (TimestampType(), "timestamp", "TIMESTAMP")
    BINARY = (BinaryType(), "boolean", "BINARY")
    BOOLEAN = (BooleanType(), "boolean", "BOOLEAN")
    DATE = (DateType(), "timestamp", "DATE")
    DECIMAL = (DecimalType(), "decimal", "DECIMAL")
    DOUBLE = (DoubleType(), "double", "DOUBLE")
    FLOAT = (FloatType(), "float", "FLOAT")
    INTEGER = (IntegerType(), "int", "INT")
    BIGINT = (LongType(), "bigint", "BIGINT")
    STRING = (StringType(), "text", "STRING")
    ARRAY_BIGINT = (ArrayType(LongType()), "frozen<list<bigint>>", "ARRAY<BIGINT>")
    ARRAY_STRING = (ArrayType(StringType()), "frozen<list<text>>", "ARRAY<STRING>")
    ARRAY_FLOAT = (ArrayType(FloatType()), "frozen<list<float>>", "ARRAY<FLOAT>")

    def __init__(self, spark: PySparkDataType, cassandra: str, spark_sql: str) -> None:
        self.spark = spark
        self.cassandra = cassandra
        self.spark_sql = spark_sql

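# A small usage illustration of the enum above: each member exposes its Spark,
# Cassandra and Spark SQL representations (assumes DecimalType is imported from
# pyspark.sql.types as in the surrounding snippets):
assert DataType.DECIMAL.spark == DecimalType()
assert DataType.DECIMAL.cassandra == "decimal"
assert DataType.DECIMAL.spark_sql == "DECIMAL"
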
def get_column_spec(
    self,
    source_df: Optional[DataFrame],
    current_column: Optional[Column],
    parent_columns: Optional[List[Column]],
) -> Column:
    if (
        source_df is not None
        and isinstance(self.value, AutoMapperDataTypeColumn)
        and "decimal" not in dict(source_df.dtypes)[self.value.value]
    ):
        # parse the amount here
        column_spec = self.value.get_column_spec(
            source_df=source_df,
            current_column=current_column,
            parent_columns=parent_columns,
        ).cast(DecimalType(precision=self.precision, scale=self.scale))
        return column_spec
    else:
        # Already a decimal
        column_spec = self.value.get_column_spec(
            source_df=source_df,
            current_column=current_column,
            parent_columns=parent_columns,
        )
        return column_spec

def AnadeColumnaSpark(SQLContext, DFInput, NombreColumna, ListaAInsertar, cambioADouble):
    # Append a named column to a Spark DataFrame from a Python list.

    # Build a one-column DataFrame from the list to insert
    DFListaAInsertar = SQLContext.createDataFrame([(l,) for l in ListaAInsertar],
                                                  [NombreColumna])

    # add a 'sequential' index and join both dataframes to get the final result
    DFInput = DFInput.withColumn(
        "row_idx",
        row_number().over(Window.orderBy(monotonically_increasing_id())))
    DFListaAInsertar = DFListaAInsertar.withColumn(
        "row_idx",
        row_number().over(Window.orderBy(monotonically_increasing_id())))

    DFSalida = DFInput.join(
        DFListaAInsertar,
        DFInput.row_idx == DFListaAInsertar.row_idx).drop("row_idx")

    if cambioADouble:
        DFSalida = DFSalida.withColumn(
            NombreColumna, DFSalida[NombreColumna].cast(DecimalType(15, 3)))

    return DFSalida

def test_returns_correct_schema(self):
    actual_schema = self.test_processor.all_movies.schema
    expected_schema = StructType([
        StructField('title', StringType(), True),
        StructField('production_companies', ArrayType(StringType(), True), True),
        StructField('release_date', DateType(), True),
        StructField('rating', DecimalType(10, 6), True),
        StructField('revenue_budget_ratio', DecimalType(8, 2), True),
        StructField('budget', DecimalType(15, 4), True),
        StructField('revenue', DecimalType(15, 4), True),
        StructField('year', IntegerType(), True),
        StructField('movie_id', StringType(), False)
    ])
    self.assertEqual(actual_schema, expected_schema)

def test_make_named_tuple():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_scalar', np.string_, (), ScalarCodec(StringType()), True),
        UnischemaField('int32_scalar', np.int32, (), ScalarCodec(ShortType()), False),
        UnischemaField('uint8_scalar', np.uint8, (), ScalarCodec(ShortType()), False),
        UnischemaField('int32_matrix', np.float32, (10, 20, 3), NdarrayCodec(), True),
        UnischemaField('decimal_scalar', Decimal, (10, 20, 3), ScalarCodec(DecimalType(10, 9)), False),
    ])

    TestSchema.make_namedtuple(string_scalar='abc', int32_scalar=10, uint8_scalar=20,
                               int32_matrix=np.int32((10, 20, 3)),
                               decimal_scalar=Decimal(123) / Decimal(10))

    TestSchema.make_namedtuple(string_scalar=None, int32_scalar=10, uint8_scalar=20,
                               int32_matrix=None,
                               decimal_scalar=Decimal(123) / Decimal(10))

def setUpClass(cls):
    from datetime import date, datetime
    from decimal import Decimal
    super(ArrowTests, cls).setUpClass()
    cls.warnings_lock = threading.Lock()

    # Synchronize default timezone between Python and Java
    cls.tz_prev = os.environ.get("TZ", None)  # save current tz if set
    tz = "America/Los_Angeles"
    os.environ["TZ"] = tz
    time.tzset()

    cls.spark.conf.set("spark.sql.session.timeZone", tz)

    # Test fallback
    cls.spark.conf.set("spark.sql.execution.arrow.enabled", "false")
    assert cls.spark.conf.get(
        "spark.sql.execution.arrow.pyspark.enabled") == "false"
    cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    assert cls.spark.conf.get(
        "spark.sql.execution.arrow.pyspark.enabled") == "true"

    cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")
    assert cls.spark.conf.get(
        "spark.sql.execution.arrow.pyspark.fallback.enabled") == "true"
    cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "false")
    assert cls.spark.conf.get(
        "spark.sql.execution.arrow.pyspark.fallback.enabled") == "false"

    # Enable Arrow optimization in these tests.
    cls.spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    # Disable fallback by default to easily detect the failures.
    cls.spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")

    cls.schema_wo_null = StructType([
        StructField("1_str_t", StringType(), True),
        StructField("2_int_t", IntegerType(), True),
        StructField("3_long_t", LongType(), True),
        StructField("4_float_t", FloatType(), True),
        StructField("5_double_t", DoubleType(), True),
        StructField("6_decimal_t", DecimalType(38, 18), True),
        StructField("7_date_t", DateType(), True),
        StructField("8_timestamp_t", TimestampType(), True),
        StructField("9_binary_t", BinaryType(), True)
    ])
    cls.schema = cls.schema_wo_null.add("10_null_t", NullType(), True)
    cls.data_wo_null = [
        (u"a", 1, 10, 0.2, 2.0, Decimal("2.0"), date(1969, 1, 1),
         datetime(1969, 1, 1, 1, 1, 1), bytearray(b"a")),
        (u"b", 2, 20, 0.4, 4.0, Decimal("4.0"), date(2012, 2, 2),
         datetime(2012, 2, 2, 2, 2, 2), bytearray(b"bb")),
        (u"c", 3, 30, 0.8, 6.0, Decimal("6.0"), date(2100, 3, 3),
         datetime(2100, 3, 3, 3, 3, 3), bytearray(b"ccc")),
        (u"d", 4, 40, 1.0, 8.0, Decimal("8.0"), date(2262, 4, 12),
         datetime(2262, 3, 3, 3, 3, 3), bytearray(b"dddd")),
    ]
    cls.data = [tuple(list(d) + [None]) for d in cls.data_wo_null]

def _spark_replace_decimal_fields(dataframe):
    # list all fields and types from dataframe
    df_columns = {}
    for field in dataframe.schema.fields:
        df_columns.update({field.name: field.dataType})

    new_dataframe = dataframe
    decimal_types = (DecimalType(38, 10), DecimalType(38, 0))
    for column, data_type in df_columns.items():
        # decimal type is transformed to double type
        if isinstance(data_type, DecimalType) and data_type not in decimal_types:
            new_dataframe = new_dataframe.withColumn(
                column, new_dataframe[column].cast(DoubleType()))
    return new_dataframe

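# A hedged usage sketch of the function above: a decimal(10,2) column is cast to
# double while a decimal(38,10) column is left untouched. The spark session handle
# and column names are illustrative only.
from decimal import Decimal
from pyspark.sql.types import StructType, StructField, DecimalType

schema = StructType([
    StructField("price", DecimalType(10, 2)),
    StructField("exact_amount", DecimalType(38, 10)),
])
df = spark.createDataFrame([(Decimal("9.99"), Decimal("1.0000000001"))], schema)
_spark_replace_decimal_fields(df).printSchema()
# price becomes double, exact_amount stays decimal(38,10)
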
def get_users_schema(self):
    schema = StructType([
        StructField("id", IntegerType(), False),            # _c0
        StructField("login", StringType(), True),           # _c1
        StructField("company", StringType(), True),         # _c2
        StructField("created_at", TimestampType(), True),   # _c3
        StructField("user_type", StringType(), True),       # _c4
        StructField("fake", IntegerType(), True),           # _c5
        StructField("deleted", IntegerType(), True),        # _c6
        StructField("long", DecimalType(), True),           # _c7
        StructField("lat", DecimalType(), True),            # _c8
        StructField("country_code", StringType(), True),
        StructField("state", StringType(), True),
        StructField("city", StringType(), True),            # _c9
        StructField("location", StringType(), True)
    ])
    return schema

def get_song_schema():
    """
    Create a schema to use for the song data.

    :return: StructType object
    """
    schema = StructType([
        StructField('artist_id', StringType(), True),
        StructField('artist_latitude', DecimalType(), True),
        StructField('artist_longitude', DecimalType(), True),
        StructField('artist_location', StringType(), True),
        StructField('artist_name', StringType(), True),
        StructField('duration', DoubleType(), True),
        StructField('num_songs', IntegerType(), True),
        StructField('song_id', StringType(), True),
        StructField('title', StringType(), True),
        StructField('year', IntegerType(), True)
    ])
    return schema

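# A hedged usage sketch: reading song records with the schema above. The file path,
# JSON layout and spark session handle are assumptions, not part of the original code.
song_df = spark.read.json("data/song_data/*.json", schema=get_song_schema())
song_df.select("song_id", "title", "artist_latitude", "artist_longitude").show(5)
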
def test_negative_decimal(self):
    try:
        self.spark.sql("set spark.sql.legacy.allowNegativeScaleOfDecimal=true")
        df = self.spark.createDataFrame([(1, ), (11, )], ["value"])
        ret = df.select(col("value").cast(DecimalType(1, -1))).collect()
        actual = list(map(lambda r: int(r.value), ret))
        self.assertEqual(actual, [0, 10])
    finally:
        self.spark.sql("set spark.sql.legacy.allowNegativeScaleOfDecimal=false")

def check_column_numeric(df, column):
    return df.schema[column].dataType in [
        IntegerType(), ShortType(), LongType(), FloatType(), DecimalType(), DoubleType()
    ]

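# Worth noting: DecimalType equality depends on precision and scale, so the check
# above only matches the default decimal(10,0). A quick illustration (the spark
# session handle and column names are illustrative):
from pyspark.sql.types import StructType, StructField, DecimalType, IntegerType

df = spark.createDataFrame(
    [],
    StructType([
        StructField("qty", IntegerType()),
        StructField("amount", DecimalType(12, 2)),
    ]))
check_column_numeric(df, "qty")     # True
check_column_numeric(df, "amount")  # False: decimal(12,2) != DecimalType()
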
def from_arrow_schema(cls, parquet_dataset):
    """
    Convert an Apache Arrow schema into a Unischema object.

    This is useful for datasets of only scalars which need no special
    encoding/decoding. If there is an unsupported type in the arrow schema,
    it will throw an exception.

    :param parquet_dataset: A :class:`pyarrow.parquet.ParquetDataset` whose arrow
        schema will be converted.
    :return: A :class:`Unischema` object.
    """
    meta = parquet_dataset.pieces[0].get_metadata(parquet_dataset.fs.open)
    arrow_schema = meta.schema.to_arrow_schema()
    unischema_fields = []

    for partition_name in parquet_dataset.partitions.partition_names:
        unischema_fields.append(
            UnischemaField(partition_name, np.str_, (), ScalarCodec(StringType()), False))

    for column_name in arrow_schema.names:
        arrow_field = arrow_schema.field_by_name(column_name)
        field_type = arrow_field.type
        if field_type == pyarrow.int8():
            np_type = np.int8
            codec = ScalarCodec(ByteType())
        elif field_type == pyarrow.int16():
            np_type = np.int16
            codec = ScalarCodec(ShortType())
        elif field_type == pyarrow.int32():
            np_type = np.int32
            codec = ScalarCodec(IntegerType())
        elif field_type == pyarrow.int64():
            np_type = np.int64
            codec = ScalarCodec(LongType())
        elif field_type == pyarrow.string():
            np_type = np.unicode_
            codec = ScalarCodec(StringType())
        elif field_type == pyarrow.bool_():
            np_type = np.bool_
            codec = ScalarCodec(BooleanType())
        elif field_type == pyarrow.float32():
            np_type = np.float32
            codec = ScalarCodec(FloatType())
        elif field_type == pyarrow.float64():
            np_type = np.float64
            codec = ScalarCodec(DoubleType())
        elif isinstance(field_type, pyarrow.lib.Decimal128Type):
            np_type = Decimal
            codec = ScalarCodec(DecimalType(field_type.precision, field_type.scale))
        elif field_type == pyarrow.binary():
            np_type = np.string_
            codec = ScalarCodec(StringType())
        elif isinstance(field_type, pyarrow.lib.FixedSizeBinaryType):
            np_type = np.string_
            codec = ScalarCodec(StringType())
        else:
            raise ValueError(
                'Cannot auto-create unischema due to unsupported column type {}'.format(field_type))
        unischema_fields.append(
            UnischemaField(column_name, np_type, (), codec, arrow_field.nullable))
    return Unischema('inferred_schema', unischema_fields)