Code example #1
    def process_json(self):

        # schema to parse both Q and T type records
        common_event = StructType() \
                    .add("trade_dt",DateType(),True) \
                    .add("rec_type",StringType(),True) \
                    .add("symbol",StringType(),True) \
                    .add("exchange",StringType(),True) \
                    .add("event_tm",TimestampType(),True) \
                    .add("event_seq_nb",IntegerType(),True) \
                    .add("arrival_tm",TimestampType(),True) \
                    .add("trade_pr",DecimalType(17,14),True) \
                    .add("bid_pr",DecimalType(17,14),True) \
                    .add("bid_size",IntegerType(),True) \
                    .add("ask_pr",DecimalType(17,14),True) \
                    .add("ask_size",IntegerType(),True) \
                    .add("partition",StringType(),True)
        #spark = SparkSession.builder.master('local').\
        #       appName('app').getOrCreate()

        raw = self.spark.sparkContext.textFile(self._load_path +
                                               "/json/*/NASDAQ/*.txt")
        parse_json = self.parse_json
        parsed = raw.map(lambda line: parse_json(line))
        data_json = self.spark.createDataFrame(parsed, schema=common_event)
        data_json.show(truncate=False)

        # Save the final dataframe as parquet files in partitions
        data_json.write.partitionBy("partition").mode("append").parquet(
            self._save_path)
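
The parse_json helper used above is not shown in this example. A minimal sketch of what such a parser could look like, assuming each input line is a JSON object whose keys roughly match the schema fields (the key names and the "T"/"Q" handling below are assumptions, not the original implementation):

import json
from datetime import datetime
from decimal import Decimal

def parse_json(line):
    """Turn one raw JSON line into a tuple matching common_event (hypothetical)."""
    record = json.loads(line)
    rec_type = record.get("event_type")  # assumed key name
    common = (
        datetime.strptime(record["trade_dt"], "%Y-%m-%d").date(),
        rec_type,
        record["symbol"],
        record["exchange"],
        datetime.fromisoformat(record["event_tm"]),
        int(record["event_seq_nb"]),
        datetime.fromisoformat(record["file_tm"]),  # arrival time, assumed key
    )
    if rec_type == "T":  # trade record: only trade_pr is populated
        return common + (Decimal(record["price"]), None, None, None, None, "T")
    if rec_type == "Q":  # quote record: bid/ask fields are populated
        return common + (None, Decimal(record["bid_pr"]), int(record["bid_size"]),
                         Decimal(record["ask_pr"]), int(record["ask_size"]), "Q")
    return common + (None, None, None, None, None, "B")  # anything else goes to a "bad" partition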
Code example #2
File: table_revenue.py Project: nickagel/revenue
 def schema(self):
     return StructType(fields=[
         StructField(name='date', dataType=StringType(), nullable=False),
         StructField(name='revenue', dataType=DecimalType(17,2), nullable=False),
         StructField(name='new_revenue_percentage', dataType=DecimalType(5,2), nullable=False),
         StructField(name='revenue_from_new_customers', dataType=DecimalType(17,2), nullable=False)
     ])
Code example #3
File: test_types.py Project: zwj0110/spark
 def test_decimal_type(self):
     t1 = DecimalType()
     t2 = DecimalType(10, 2)
     self.assertTrue(t2 is not t1)
     self.assertNotEqual(t1, t2)
     t3 = DecimalType(8)
     self.assertNotEqual(t2, t3)
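
For context, DecimalType() with no arguments defaults to precision 10 and scale 0, and PySpark data types compare by value rather than by identity, which is what the assertions above exercise. A quick sketch:

from pyspark.sql.types import DecimalType

assert DecimalType() == DecimalType(10, 0)            # default precision/scale
assert DecimalType(8) == DecimalType(8, 0)            # scale defaults to 0
assert DecimalType(10, 2) == DecimalType(10, 2)       # equal by value...
assert DecimalType(10, 2) is not DecimalType(10, 2)   # ...but distinct instances
print(DecimalType(10, 2).simpleString())              # 'decimal(10,2)'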
Code example #4
    def process_csv(self):

        # schema to parse both Q and T type records
        common_event = StructType() \
                    .add("trade_dt",DateType(),True) \
                    .add("rec_type",StringType(),True) \
                    .add("symbol",StringType(),True) \
                    .add("exchange",StringType(),True) \
                    .add("event_tm",TimestampType(),True) \
                    .add("event_seq_nb",IntegerType(),True) \
                    .add("arrival_tm",TimestampType(),True) \
                    .add("trade_pr",DecimalType(17,14),True) \
                    .add("bid_pr",DecimalType(17,14),True) \
                    .add("bid_size",IntegerType(),True) \
                    .add("ask_pr",DecimalType(17,14),True) \
                    .add("ask_size",IntegerType(),True) \
                    .add("partition",StringType(),True)

        spark = SparkSession.builder.master('local').\
                appName('app').getOrCreate()


        raw = spark.sparkContext.\
                textFile(self._load_path+"/csv/*/NYSE/*.txt")

        # Parse each line with the parse_csv function to get the RDD in the proper format.
        parsed = raw.map(lambda line: self.parse_csv(line))
        data_csv = spark.createDataFrame(parsed, schema=common_event)

        data_csv.show(truncate=False)

        # Save the final dataframe as parquet files in partitions
        data_csv.write.partitionBy("partition").mode("append").parquet(
            self._save_path)
Code example #5
 def revenue(spark_session):
     schema = StructType(fields=[
         StructField(name='date', dataType=StringType(), nullable=False),
         StructField(
             name='revenue', dataType=DecimalType(17, 2), nullable=False),
         StructField(name='new_revenue_percentage',
                     dataType=DecimalType(5, 2),
                     nullable=False),
         StructField(name='revenue_from_new_customers',
                     dataType=DecimalType(17, 2),
                     nullable=False)
     ])
     R = namedtuple('R', [
         'date', 'revenue', 'new_revenue_percentage',
         'revenue_from_new_customers'
     ])
     return spark_session.createDataFrame(data=[
         R(date='2020-06',
           revenue=Decimal('121.00'),
           new_revenue_percentage=Decimal('0.00'),
           revenue_from_new_customers=Decimal('0.00')),
         R(date='2020-05',
           revenue=Decimal('123.66'),
           new_revenue_percentage=Decimal('33.82'),
           revenue_from_new_customers=Decimal('41.82')),
         R(date='2020-04',
           revenue=Decimal('40.80'),
           new_revenue_percentage=Decimal('100.00'),
           revenue_from_new_customers=Decimal('40.80'))
     ],
                                          schema=schema)
Code example #6
File: test_types.py Project: zwj0110/spark
 def test_parse_datatype_string(self):
     from pyspark.sql.types import _all_atomic_types, _parse_datatype_string
     for k, t in _all_atomic_types.items():
         if t != NullType:
             self.assertEqual(t(), _parse_datatype_string(k))
     self.assertEqual(IntegerType(), _parse_datatype_string("int"))
     self.assertEqual(DecimalType(1, 1),
                      _parse_datatype_string("decimal(1  ,1)"))
     self.assertEqual(DecimalType(10, 1),
                      _parse_datatype_string("decimal( 10,1 )"))
     self.assertEqual(DecimalType(11, 1),
                      _parse_datatype_string("decimal(11,1)"))
     self.assertEqual(ArrayType(IntegerType()),
                      _parse_datatype_string("array<int >"))
     self.assertEqual(MapType(IntegerType(), DoubleType()),
                      _parse_datatype_string("map< int, double  >"))
     self.assertEqual(
         StructType([
             StructField("a", IntegerType()),
             StructField("c", DoubleType())
         ]), _parse_datatype_string("struct<a:int, c:double >"))
     self.assertEqual(
         StructType([
             StructField("a", IntegerType()),
             StructField("c", DoubleType())
         ]), _parse_datatype_string("a:int, c:double"))
     self.assertEqual(
         StructType([
             StructField("a", IntegerType()),
             StructField("c", DoubleType())
         ]), _parse_datatype_string("a INT, c DOUBLE"))
Code example #7
def computaBalances(sc, sqlContext, unionDFBalances, df_FondoCuenta,
                    DictBalances):

    #ListaDfBalances = DictBalances["df"]
    Lista_N_GES = DictBalances["N_GES"]

    dfsBalances = creaDFSparkBalances(sqlContext, Lista_N_GES, DictBalances,
                                      df_FondoCuenta)

    #
    columns_to_dropUnionBalances = ['EXIS CED', 'DIFERENCIA']
    unionDFBalances = unionDFBalances.drop(*columns_to_dropUnionBalances)

    unionDFBalances = unionDFBalances.join(dfsBalances,
                                           on='N_GES',
                                           how='inner')

    def eligeEnTabla(epigrafe, col1, col2, col3, col4):
        if epigrafe == "Inversión normal":
            return col1
        if epigrafe == "Contencioso":
            return col2
        if epigrafe == "Amortizado":
            return col3
        if epigrafe == "Impagado":
            return col4

    udfeligeEnTabla = udf(eligeEnTabla, DecimalType(15, 3))

    unionDFBalances= unionDFBalances.withColumn("CONT GES", udfeligeEnTabla( "Epígrafe" ,\
                                                                             "InvNorm"  ,\
                                                                             "Content"  ,\
                                                                             "Amortiz"  ,\
                                                                             "Impagado"  ))

    def calculaDif(exisges, partced):
        return exisges + partced

    udfcalculaDif = udf(calculaDif, DecimalType(15, 3))

    unionDFBalances= unionDFBalances.withColumn("DIFERENCIA", udfcalculaDif( "EXIS GES" ,\
                                                                             "CONT GES"  ))

    unionDFBalances = unionDFBalances.select([ "Fecha CED"          ,\
                                               "Fecha GES"          ,\
                                               "N_GES"              ,\
                                               "Nombre_abreviado"   ,\
                                               "Epígrafe"           ,\
                                               "EXIS GES"           ,\
                                               "CONT GES"           ,\
                                               "DIFERENCIA"]         )

    # Drop the rows whose Epígrafe is 'Amortizado' or 'Impagado'

    unionDFBalances = unionDFBalances.filter( unionDFBalances["Epígrafe"] != 'Amortizado' )\
                                     .filter( unionDFBalances["Epígrafe"] != 'Impagado'   )

    return unionDFBalances
Code example #8
 def load_events_schema(self):
     self.events_schema = StructType(fields=[
         StructField('type', IntegerType(), False),
         StructField('amount', DecimalType(), False),
         StructField('oldbalanceOrg', DecimalType(), False),
         StructField('newbalanceOrig', DecimalType(), False),
         StructField('oldbalanceDest', DecimalType(), False),
         StructField('newbalanceDest', DecimalType(), False)
     ])
Code example #9
    def test_treats_money_with_appropriate_precision(self):
        actual_schema = self.test_processor._clean_movies(
            self.test_movies).select('budget', 'revenue').schema

        expected_schema = StructType([
            StructField('budget', DecimalType(15, 4), True),
            StructField('revenue', DecimalType(15, 4), True),
        ])

        self.assertEqual(actual_schema, expected_schema)
Code example #10
def get_dataframe(filename='data/GBPUSD-2018-12-tick.csv'):
    # open high low close
    schema = StructType([
        StructField("symbol", StringType(), True),
        StructField("time", StringType(), True),
        StructField("bid", DecimalType(precision=10, scale=5), True),
        StructField("ask", DecimalType(precision=10, scale=5), True)
    ])
    # df = spark.read.csv('GBPUSD-2016-11-tick.csv', header='False', schema=schema)
    df = sqlContext.read.csv(filename, header='False', schema=schema)
    return df
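
A possible follow-up once the tick data is loaded, keeping the prices as decimals (the spread calculation below is illustrative and not part of the original script):

from pyspark.sql import functions as F

df = get_dataframe()
# Arithmetic on DecimalType columns stays exact; Spark widens the result's
# precision and scale according to its decimal promotion rules.
spread = df.withColumn("spread", F.col("ask") - F.col("bid"))
spread.select("symbol", "time", "bid", "ask", "spread").show(5, truncate=False)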
Code example #11
def test_convert_strings_to_types(spark_session: SparkSession):
    df = spark_session.createDataFrame([
        ("1", [1, 2, 3], "19-02-2020", "19-02-2020 00:00:00", "true", "1",
         "0.5", "123534627458685341", "123534627458685341"),
        ("2", [1, 2, 3], "19-02-2020", "19-02-2020 00:00:00", "false", "-1",
         "0.7891412462571346431", "234735684679046827457",
         "234735684679046827457"),
        ("3", [1, 2, 3], "19-02-2020", "19-02-2020 00:00:00", "True", "0",
         PI_STRING, PI_STRING, PI_STRING),
        ("4", [1, 2, 3], "19-02-2020", "19-02-2020 00:00:00", "not_true",
         "2147483648", "42", "-143763583573461346.0368479672",
         "-143763583573461346.0368479672"),
        ("5", [1, 2, 3], None, None, None, None, None, None, None),
    ], [
        "id", "array", "date", "timestamp", "boolean", "integer", "double",
        "decimal(38,0)", "decimal(24,5)"
    ])

    transformer = StringTypeTransformer()

    expected_schema = StructType([
        StructField("id", StringType()),
        StructField("array", ArrayType(LongType())),
        StructField("date", DateType()),
        StructField("timestamp", TimestampType()),
        StructField("boolean", BooleanType(), nullable=False),
        StructField("integer", IntegerType()),
        StructField("double", DoubleType()),
        StructField("decimal(38,0)", DecimalType(38, 0)),
        StructField("decimal(24,5)", DecimalType(24, 5))
    ])
    transformed = transformer.transform_dataframe(df, expected_schema)

    assert transformed.schema == expected_schema
    """
    I guess there couldn't be a dataframe with different set of columns for each row.
    Therefore, this assertion is wrong.
    """
    assert transformed.toJSON().collect() == [
        '{"id":"1","array":[1,2,3],"date":"2020-02-19","timestamp":"2020-02-19T00:00:00.000+03:00","boolean":true,'
        '"integer":1,"double":0.5,"decimal(38,0)":123534627458685341,"decimal(24,5)":123534627458685341.00000}',
        '{"id":"2","array":[1,2,3],"date":"2020-02-19","timestamp":"2020-02-19T00:00:00.000+03:00","boolean":false,'
        '"integer":-1,"double":0.7891412462571347,"decimal(38,0)":234735684679046827457}',
        '{"id":"3","array":[1,2,3],"date":"2020-02-19","timestamp":"2020-02-19T00:00:00.000+03:00","boolean":true,'
        '"integer":0,"double":3.141592653589793,"decimal(38,0)":3,"decimal(24,5)":3.14159}',
        '{"id":"4","array":[1,2,3],"date":"2020-02-19","timestamp":"2020-02-19T00:00:00.000+03:00","boolean":false,'
        '"double":42.0,"decimal(38,0)":-143763583573461346,"decimal(24,5)":-143763583573461346.03685}',
        '{"id":"5","array":[1,2,3],"boolean":false}'
    ]
Code example #12
def creaDFSparkBalances(sqlContext, Lista_N_GES, DictBalances, df_FondoCuenta):

    Datos = []
    i = 0

    # Check that the sheet tabs contain only numeric values
    Lista_N_GES = list(filter(lambda x: x.isnumeric(), Lista_N_GES))

    for N_GES in Lista_N_GES:
        N_GESnumber = int(str(N_GES))

        if str(N_GESnumber) in df_FondoCuenta["N_GES"].to_list():

            dato1 = sumaInverNormalBalance(N_GESnumber, DictBalances["df"][i],
                                           df_FondoCuenta)
            dato2 = sumaContensioso(N_GESnumber, DictBalances["df"][i],
                                    df_FondoCuenta)
        else:
            print(
                "El fondo: ", N_GES,
                "captado del fichero Balances no se encuentra en fondo balances"
            )
            dato1 = 0.0
            dato2 = 0.0
        dato3 = 0.0
        dato4 = 0.0
        dato0 = N_GESnumber

        Datos.append([dato0, dato1, dato2, dato3, dato4])
        i = i + 1

    # Build the Spark DataFrame for DictBalances

    field = [StructField( "N_GES"   , IntegerType(), True ),\
             StructField( "InvNorm" , StringType (), True ),\
             StructField( "Content" , StringType (), True ),\
             StructField( "Amortiz" , StringType (), True ),\
             StructField( "Impagado", StringType (), True ),]

    dfoutput = sqlContext.createDataFrame(Datos, StructType(field))

    dfoutput =  dfoutput.withColumn("N_GES"    , dfoutput["N_GES"    ].cast( IntegerType (    ) ) )\
                        .withColumn("InvNorm"  , dfoutput["InvNorm"  ].cast( DecimalType (15,3) ) )\
                        .withColumn("Content"  , dfoutput["Content"  ].cast( DecimalType (15,3) ) )\
                        .withColumn("Amortiz"  , dfoutput["Amortiz"  ].cast( DecimalType (15,3) ) )\
                        .withColumn("Impagado" , dfoutput["Impagado" ].cast( DecimalType (15,3) ) )

    return dfoutput
Code example #13
def passed_temperature_analyse(filename):
    print("begin to analyse passed temperature")
    spark = SparkSession.builder.master("local").appName("passed_temperature_analyse").getOrCreate()
    df = spark.read.csv(filename, header=True)
    df_temperature = df.select(  # select only the columns we need
        df['province'],
        df['city_name'],
        df['city_code'],
        df['temperature'].cast(DecimalType(scale=1)),
        F.date_format(df['time'], "yyyy-MM-dd").alias("date"),  # derive the date
        F.hour(df['time']).alias("hour")  # derive the hour
    )
    # keep only the four observation hours (02, 08, 12, 20)
    df_4point_temperature = df_temperature.filter(df_temperature['hour'].isin([2, 8, 12, 20]))
    # df_4point_temperature.printSchema()
    df_avg_temperature = df_4point_temperature.groupBy("province", "city_name", "city_code", "date") \
        .agg(F.count("temperature"), F.avg("temperature").alias("avg_temperature")) \
        .filter("count(temperature) = 4") \
        .sort(F.asc("avg_temperature")) \
        .select("province", "city_name", "city_code", "date",
                F.format_number('avg_temperature', 1).alias("avg_temperature"))
    df_avg_temperature.cache()
    avg_temperature_list = df_avg_temperature.collect()
    df_avg_temperature.coalesce(1).write.json("file:///F:/Code_All/Jupyter_Code/spark_test/result_data/bigData/passed_rain_temperature.json")
    print("end analysing passed temperature")
    return avg_temperature_list[0:10]
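
Note that DecimalType(scale=1) above keeps the default precision of 10, i.e. decimal(10,1), and casting to it rounds half up. A small illustration outside the original function:

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import DecimalType

spark = SparkSession.builder.master("local").appName("decimal_cast_demo").getOrCreate()

assert DecimalType(scale=1).simpleString() == "decimal(10,1)"

sample = spark.createDataFrame([("23.46",), ("-3.04",)], ["temperature"])
sample.select(F.col("temperature").cast(DecimalType(scale=1)).alias("temperature")).show()
# 23.46 -> 23.5, -3.04 -> -3.0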
Code example #14
    def get_avg_time_per_difficulty(self, df, easy_threshold, medium_threshold,
                                    hard_threshold):
        """
        Calculate the average cooking time per difficulty level.
        :param df: DataFrame to be processed
        :param easy_threshold: Integer easy threshold
        :param medium_threshold: Integer medium threshold
        :param hard_threshold: Integer hard threshold
        :return: transformed DataFrame
        """

        df_filtered_min = df.withColumn('cookTime_in_Min', tominutes(col('cookTime')).cast(IntegerType())) \
            .withColumn('prepTime_in_Min', tominutes(col('prepTime')).cast(IntegerType()))

        df_filtered_with_time = df_filtered_min. \
            withColumn("total_time", (col("cookTime_in_Min") + col("prepTime_in_Min")).cast(
                                                               IntegerType())). \
            withColumn("difficulty",
                       when(col("total_time") < easy_threshold, 'easy').
                       when((col("total_time") >= easy_threshold) & (col("total_time") <= medium_threshold), 'medium').
                       when(col("total_time") > hard_threshold, 'hard')
                       .cast(StringType()))

        result_df = df_filtered_with_time.groupBy(col('difficulty')). \
            agg(avg(col('total_time')).cast(DecimalType(38, 2)).alias("avg_total_cooking_time"))

        return result_df
Code example #15
def _confidence_degree_agg(flat_table: FlatTable, **kwargs) -> PDDataFrame:
    if kwargs.get("group_by_cols", None):
        group_by_cols = kwargs.get("group_by_cols")
    elif flat_table.name == "MCO":
        group_by_cols = frozenset(["ETA_NUM", "RSA_NUM"])
    elif flat_table.name == "MCO_CE":
        group_by_cols = frozenset(["ETA_NUM", "SEQ_NUM"])
    else:
        group_by_cols = None
    col = when(max("count") != 0,
               min("count") / max("count") * 100).otherwise(0)
    df = (reduce(
        lambda a, b: a.union(b),
        [
            _union_not_null_cols_count(
                flat_table,
                flat_table.single_tables[single_table_name],
                nick_name,
                col_name,
                group_by_cols,
            ) for single_table_name, cols in _CNAM_COLS_MAPPING[
                flat_table.name].items()
            for nick_name, col_name in cols.items()
        ],
    ).groupBy("ColName").agg(
        col.cast(DecimalType(32, 2)).alias("ConfidenceDegree")))
    return df.toPandas()
Code example #16
    def test_data_type_ops(self):
        _mock_spark_type = DataType()
        _mock_dtype = ExtensionDtype()
        _mappings = (
            (CategoricalDtype(), _mock_spark_type, CategoricalOps),
            (_mock_dtype, DecimalType(), DecimalOps),
            (_mock_dtype, FractionalType(), FractionalOps),
            (_mock_dtype, IntegralType(), IntegralOps),
            (_mock_dtype, StringType(), StringOps),
            (_mock_dtype, BooleanType(), BooleanOps),
            (_mock_dtype, TimestampType(), DatetimeOps),
            (_mock_dtype, TimestampNTZType(), DatetimeNTZOps),
            (_mock_dtype, DateType(), DateOps),
            (_mock_dtype, DayTimeIntervalType(), TimedeltaOps),
            (_mock_dtype, BinaryType(), BinaryOps),
            (_mock_dtype, ArrayType(StringType()), ArrayOps),
            (_mock_dtype, MapType(StringType(), IntegralType()), MapOps),
            (_mock_dtype, StructType(), StructOps),
            (_mock_dtype, NullType(), NullOps),
            (_mock_dtype, UserDefinedType(), UDTOps),
        )
        for _dtype, _spark_type, _ops in _mappings:
            self.assertIsInstance(DataTypeOps(_dtype, _spark_type), _ops)

        _unknow_spark_type = _mock_spark_type
        self.assertRaises(TypeError, DataTypeOps, BooleanType(),
                          _unknow_spark_type)
Code example #17
def CambiaColumnas_A_Double(DF, ListaColumnas):  # works
    # Cast the chosen columns to DecimalType(15, 3). The casts are accumulated
    # on DF so that every column in the list is converted, not just the last one.
    for Columna in ListaColumnas:
        DF = DF.withColumn(Columna, DF[Columna].cast(DecimalType(15, 3)))

    # print("      Finished casting columns\n")
    return DF
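
Illustrative usage (the column names here are borrowed from the balances example above, purely as an assumption):

df = CambiaColumnas_A_Double(df, ["InvNorm", "Content", "Amortiz", "Impagado"])
df.printSchema()  # the listed columns now report decimal(15,3)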
Code example #18
def getPercentage(DataSet, Column):
    Total = DataSet.count()
    Stats = getFreq(DataSet, Column) \
            .withColumn(PERCENT_STAT, \
                       F.expr("(" + FREQ_STAT + " * 100)/" + str(Total)) \
                       .cast(DecimalType(10,2)))
    return Stats
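
An equivalent formulation that avoids assembling the SQL expression as a string (a sketch, assuming the same getFreq helper and the FREQ_STAT/PERCENT_STAT constants):

def getPercentage(DataSet, Column):
    Total = DataSet.count()
    return getFreq(DataSet, Column).withColumn(
        PERCENT_STAT,
        (F.col(FREQ_STAT) * 100 / F.lit(Total)).cast(DecimalType(10, 2)))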
Code example #19
File: data_type.py Project: quintoandar/butterfree
class DataType(Enum):
    """Holds constants for data types within Butterfree."""

    TIMESTAMP = (TimestampType(), "timestamp", "TIMESTAMP")
    BINARY = (BinaryType(), "boolean", "BINARY")
    BOOLEAN = (BooleanType(), "boolean", "BOOLEAN")
    DATE = (DateType(), "timestamp", "DATE")
    DECIMAL = (DecimalType(), "decimal", "DECIMAL")
    DOUBLE = (DoubleType(), "double", "DOUBLE")
    FLOAT = (FloatType(), "float", "FLOAT")
    INTEGER = (IntegerType(), "int", "INT")
    BIGINT = (LongType(), "bigint", "BIGINT")
    STRING = (StringType(), "text", "STRING")
    ARRAY_BIGINT = (ArrayType(LongType()), "frozen<list<bigint>>",
                    "ARRAY<BIGINT>")
    ARRAY_STRING = (ArrayType(StringType()), "frozen<list<text>>",
                    "ARRAY<STRING>")
    ARRAY_FLOAT = (ArrayType(FloatType()), "frozen<list<float>>",
                   "ARRAY<FLOAT>")

    def __init__(self, spark: PySparkDataType, cassandra: str,
                 spark_sql: str) -> None:
        self.spark = spark
        self.cassandra = cassandra
        self.spark_sql = spark_sql
Code example #20
File: decimal.py Project: imranq2/SparkAutoMapper
 def get_column_spec(
     self,
     source_df: Optional[DataFrame],
     current_column: Optional[Column],
     parent_columns: Optional[List[Column]],
 ) -> Column:
     if (
         source_df is not None
         and isinstance(self.value, AutoMapperDataTypeColumn)
         and "decimal" not in dict(source_df.dtypes)[self.value.value]
     ):
         # parse the amount here
         column_spec = self.value.get_column_spec(
             source_df=source_df,
             current_column=current_column,
             parent_columns=parent_columns,
         ).cast(DecimalType(precision=self.precision, scale=self.scale))
         return column_spec
     else:
         # Already a decimal
         column_spec = self.value.get_column_spec(
             source_df=source_df,
             current_column=current_column,
             parent_columns=parent_columns,
         )
         return column_spec
Code example #21
def AnadeColumnaSpark(SQLContext, DFInput, NombreColumna, ListaAInsertar,
                      cambioADouble):

    # Add a column, with the given name, to a Spark DataFrame from a Python list

    # Create a single-column DataFrame from the list to insert
    DFListaAInsertar = SQLContext.createDataFrame([(l, )
                                                   for l in ListaAInsertar],
                                                  [NombreColumna])

    # Add a sequential index to both DataFrames and join them to get the final result
    DFInput = DFInput.withColumn(
        "row_idx",
        row_number().over(Window.orderBy(monotonically_increasing_id())))
    DFListaAInsertar = DFListaAInsertar.withColumn(
        "row_idx",
        row_number().over(Window.orderBy(monotonically_increasing_id())))

    DFSalida = DFInput.join(
        DFListaAInsertar,
        DFInput.row_idx == DFListaAInsertar.row_idx).drop("row_idx")

    if cambioADouble:
        DFSalida = DFSalida.withColumn(
            NombreColumna, DFSalida[NombreColumna].cast(DecimalType(15, 3)))

    return DFSalida
Code example #22
    def test_returns_correct_schema(self):
        actual_schema = self.test_processor.all_movies.schema
        expected_schema = StructType([
            StructField('title', StringType(), True),
            StructField('production_companies', ArrayType(StringType(), True),
                        True),
            StructField('release_date', DateType(), True),
            StructField('rating', DecimalType(10, 6), True),
            StructField('revenue_budget_ratio', DecimalType(8, 2), True),
            StructField('budget', DecimalType(15, 4), True),
            StructField('revenue', DecimalType(15, 4), True),
            StructField('year', IntegerType(), True),
            StructField('movie_id', StringType(), False)
        ])

        self.assertEqual(actual_schema, expected_schema)
Code example #23
def test_make_named_tuple():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_scalar', np.string_,
                       (), ScalarCodec(StringType()), True),
        UnischemaField('int32_scalar', np.int32,
                       (), ScalarCodec(ShortType()), False),
        UnischemaField('uint8_scalar', np.uint8,
                       (), ScalarCodec(ShortType()), False),
        UnischemaField('int32_matrix', np.float32,
                       (10, 20, 3), NdarrayCodec(), True),
        UnischemaField('decimal_scalar', Decimal,
                       (10, 20, 3), ScalarCodec(DecimalType(10, 9)), False),
    ])

    TestSchema.make_namedtuple(string_scalar='abc',
                               int32_scalar=10,
                               uint8_scalar=20,
                               int32_matrix=np.int32((10, 20, 3)),
                               decimal_scalar=Decimal(123) / Decimal(10))

    TestSchema.make_namedtuple(string_scalar=None,
                               int32_scalar=10,
                               uint8_scalar=20,
                               int32_matrix=None,
                               decimal_scalar=Decimal(123) / Decimal(10))
Code example #24
    def setUpClass(cls):
        from datetime import date, datetime
        from decimal import Decimal
        super(ArrowTests, cls).setUpClass()
        cls.warnings_lock = threading.Lock()

        # Synchronize default timezone between Python and Java
        cls.tz_prev = os.environ.get("TZ", None)  # save current tz if set
        tz = "America/Los_Angeles"
        os.environ["TZ"] = tz
        time.tzset()

        cls.spark.conf.set("spark.sql.session.timeZone", tz)

        # Test fallback
        cls.spark.conf.set("spark.sql.execution.arrow.enabled", "false")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.enabled") == "false"
        cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.enabled") == "true"

        cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled",
                           "true")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.fallback.enabled") == "true"
        cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled",
                           "false")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.fallback.enabled") == "false"

        # Enable Arrow optimization in these tests.
        cls.spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
        # Disable fallback by default to easily detect the failures.
        cls.spark.conf.set(
            "spark.sql.execution.arrow.pyspark.fallback.enabled", "false")

        cls.schema_wo_null = StructType([
            StructField("1_str_t", StringType(), True),
            StructField("2_int_t", IntegerType(), True),
            StructField("3_long_t", LongType(), True),
            StructField("4_float_t", FloatType(), True),
            StructField("5_double_t", DoubleType(), True),
            StructField("6_decimal_t", DecimalType(38, 18), True),
            StructField("7_date_t", DateType(), True),
            StructField("8_timestamp_t", TimestampType(), True),
            StructField("9_binary_t", BinaryType(), True)
        ])
        cls.schema = cls.schema_wo_null.add("10_null_t", NullType(), True)
        cls.data_wo_null = [
            (u"a", 1, 10, 0.2, 2.0, Decimal("2.0"), date(1969, 1, 1),
             datetime(1969, 1, 1, 1, 1, 1), bytearray(b"a")),
            (u"b", 2, 20, 0.4, 4.0, Decimal("4.0"), date(2012, 2, 2),
             datetime(2012, 2, 2, 2, 2, 2), bytearray(b"bb")),
            (u"c", 3, 30, 0.8, 6.0, Decimal("6.0"), date(2100, 3, 3),
             datetime(2100, 3, 3, 3, 3, 3), bytearray(b"ccc")),
            (u"d", 4, 40, 1.0, 8.0, Decimal("8.0"), date(2262, 4, 12),
             datetime(2262, 3, 3, 3, 3, 3), bytearray(b"dddd")),
        ]
        cls.data = [tuple(list(d) + [None]) for d in cls.data_wo_null]
Code example #25
def _spark_replace_decimal_fields(dataframe):
    # list all fields and types from dataframe
    df_columns = {}
    for field in dataframe.schema.fields:
        df_columns.update({field.name: field.dataType})

    new_dataframe = dataframe
    decimal_types = (DecimalType(38, 10), DecimalType(38, 0))

    for column, data_type in df_columns.items():
        # decimal type is transformed to double type
        if isinstance(data_type,
                      DecimalType) and data_type not in decimal_types:
            new_dataframe = new_dataframe.withColumn(
                column, new_dataframe[column].cast(DoubleType()))

    return new_dataframe
Code example #26
 def get_users_schema(self):
     schema = StructType([
         StructField("id", IntegerType(), False),  #_C0
         StructField("login", StringType(), True),  #_c1
         StructField("company", StringType(), True),  #_c2
         StructField("created_at", TimestampType(), True),  #_c3
         StructField("user_type", StringType(), True),  #_c4
         StructField("fake", IntegerType(), True),  #_c5
         StructField("deleted", IntegerType(), True),  #_c6
         StructField("long", DecimalType(), True),  #_c7
         StructField("lat", DecimalType(), True),  #_c8
         StructField("country_code", StringType(), True),
         StructField("state", StringType(), True),
         StructField("city", StringType(), True),  #_c9
         StructField("location", StringType(), True)
     ])
     return schema
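
Note that DecimalType() with no arguments means decimal(10,0), so the long/lat columns above would keep no fractional digits. If fractional coordinates are needed, an explicit scale would have to be given, e.g. (illustrative only, not a change to the original schema):

    StructField("long", DecimalType(9, 6), True),  # longitude fits in +-180.000000
    StructField("lat", DecimalType(9, 6), True),   # latitude fits in +-90.000000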
Code example #27
File: etl.py Project: marystory/Data-Lake-with-Spark
def get_song_schema():
    """
    Create a schema to use for the song data.
    :return: StructType object 
    """
    schema = StructType([
        StructField('artist_id', StringType(), True),
        StructField('artist_latitude', DecimalType(), True),
        StructField('artist_longitude', DecimalType(), True),
        StructField('artist_location', StringType(), True),
        StructField('artist_name', StringType(), True),
        StructField('duration', DoubleType(), True),
        StructField('num_songs', IntegerType(), True),
        StructField('song_id', StringType(), True),
        StructField('title', StringType(), True),
        StructField('year', IntegerType(), True)
    ])
    return schema
Code example #28
 def test_negative_decimal(self):
     try:
         self.spark.sql("set spark.sql.legacy.allowNegativeScaleOfDecimal=true")
         df = self.spark.createDataFrame([(1, ), (11, )], ["value"])
         ret = df.select(col("value").cast(DecimalType(1, -1))).collect()
         actual = list(map(lambda r: int(r.value), ret))
         self.assertEqual(actual, [0, 10])
     finally:
         self.spark.sql("set spark.sql.legacy.allowNegativeScaleOfDecimal=false")
Code example #29
def check_column_numeric(df, column):
    return df.schema[column].dataType in [
        IntegerType(),
        ShortType(),
        LongType(),
        FloatType(),
        DecimalType(),
        DoubleType()
    ]
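
Comparing against DecimalType() only matches decimal(10,0) columns, because DecimalType instances compare by precision and scale. A more permissive variant, sketched with the NumericType base class:

from pyspark.sql.types import NumericType

def check_column_numeric(df, column):
    # IntegerType, ShortType, LongType, FloatType, DoubleType and every
    # DecimalType(precision, scale) all derive from NumericType
    return isinstance(df.schema[column].dataType, NumericType)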
Code example #30
File: unischema.py Project: rgruener/petastorm
    def from_arrow_schema(cls, parquet_dataset):
        """
        Convert an Apache Arrow schema into a Unischema object. This is useful for datasets of only scalars,
        which need no special encoding/decoding. If the Arrow schema contains an unsupported type, an
        exception is raised.

        :param parquet_dataset: :class:`pyarrow.parquet.ParquetDataset`
        :return: A :class:`Unischema` object.
        """
        meta = parquet_dataset.pieces[0].get_metadata(parquet_dataset.fs.open)
        arrow_schema = meta.schema.to_arrow_schema()
        unischema_fields = []

        for partition_name in parquet_dataset.partitions.partition_names:
            unischema_fields.append(UnischemaField(partition_name, np.str_, (), ScalarCodec(StringType()), False))

        for column_name in arrow_schema.names:
            arrow_field = arrow_schema.field_by_name(column_name)
            field_type = arrow_field.type
            if field_type == pyarrow.int8():
                np_type = np.int8
                codec = ScalarCodec(ByteType())
            elif field_type == pyarrow.int16():
                np_type = np.int16
                codec = ScalarCodec(ShortType())
            elif field_type == pyarrow.int32():
                np_type = np.int32
                codec = ScalarCodec(IntegerType())
            elif field_type == pyarrow.int64():
                np_type = np.int64
                codec = ScalarCodec(LongType())
            elif field_type == pyarrow.string():
                np_type = np.unicode_
                codec = ScalarCodec(StringType())
            elif field_type == pyarrow.bool_():
                np_type = np.bool_
                codec = ScalarCodec(BooleanType())
            elif field_type == pyarrow.float32():
                np_type = np.float32
                codec = ScalarCodec(FloatType())
            elif field_type == pyarrow.float64():
                np_type = np.float64
                codec = ScalarCodec(DoubleType())
            elif isinstance(field_type, pyarrow.lib.Decimal128Type):
                np_type = Decimal
                codec = ScalarCodec(DecimalType(field_type.precision, field_type.scale))
            elif field_type == pyarrow.binary():
                np_type = np.string_
                codec = ScalarCodec(StringType())
            elif isinstance(field_type, pyarrow.lib.FixedSizeBinaryType):
                np_type = np.string_
                codec = ScalarCodec(StringType())
            else:
                raise ValueError('Cannot auto-create unischema due to unsupported column type {}'.format(field_type))

            unischema_fields.append(UnischemaField(column_name, np_type, (), codec, arrow_field.nullable))
        return Unischema('inferred_schema', unischema_fields)