Example #1
 def sqlType(self):
     return ArrayType(DoubleType(), False)
                                     F.udf(str2datestr)(
                                         F.col('dt')))  #.cast(DateType()))
        data_df = logs_df.select(useful_columns)
        data_df = preprocessing(data_df)
        data_df.write.csv('../data/round1_train/logs_66.csv',
                          header=True,
                          sep=',',
                          mode='overwrite')
    else:
        data_df = spark.read.csv('../data/round1_train/logs_66.csv',
                                 header=True)
        smart_columns = [
            column for column in data_df.columns if 'smart' in column
        ] + ['anomaly_sum']
        for col in smart_columns:
            data_df = data_df.withColumn(col, F.col(col).cast(DoubleType()))

        #data_df=data_df.withColumn('dt',F.col('dt').cast(DateType()))
    # feature cross

    data_df = data_df.withColumn('smart_4raw', F.col('smart_4raw') / 12)
    data_df = data_df.withColumn('smart_5raw', F.col('smart_5raw') / 16)
    data_df = data_df.withColumn('smart_191raw', F.col('smart_191raw') / 18)
    data_df = data_df.withColumn('smart_198raw', F.col('smart_198raw') / 18)
    data_df = data_df.withColumn('smart_197raw', F.col('smart_197raw') / 18)
    data_df = data_df.withColumn('smart_187raw', F.col('smart_187raw') / 15)
    cross_columns = [
        'smart_4raw', 'smart_5raw', 'smart_187raw', 'smart_191raw',
        'smart_197raw', 'smart_198raw'
    ]
    for i in range(len(cross_columns)):
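        # The snippet is cut off at this loop; the body below is an assumed
        # completion (pairwise products of the normalized SMART attributes),
        # not necessarily what the original code did.
        for j in range(i + 1, len(cross_columns)):
            left, right = cross_columns[i], cross_columns[j]
            data_df = data_df.withColumn(left + '_x_' + right,
                                         F.col(left) * F.col(right))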
Example #3
def align_diff_frames(
    resolve_func,
    this: "DataFrame",
    that: "DataFrame",
    fillna: bool = True,
    how: str = "full",
    preserve_order_column: bool = False,
) -> "DataFrame":
    """
    This method aligns two different DataFrames with a given `func`. Columns are resolved and
    handled within the given `func`.
    To use this, `compute.ops_on_diff_frames` should be True, for now.

    :param resolve_func: Takes the aligned (joined) DataFrame, the column labels of the current
        DataFrame, and the column labels of the other DataFrame. It returns an iterable of
        (Series, column label) pairs.

        >>> from pyspark.pandas.config import set_option, reset_option
        >>>
        >>> set_option("compute.ops_on_diff_frames", True)
        >>>
        >>> kdf1 = ps.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]})
        >>> kdf2 = ps.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]})
        >>>
        >>> def func(kdf, this_column_labels, that_column_labels):
        ...    kdf  # conceptually this is A + B.
        ...
        ...    # Within this function, operations between Series from A or B can be performed against `kdf`.
        ...    this_label = this_column_labels[0]  # this is ('a',) from kdf1.
        ...    that_label = that_column_labels[0]  # this is ('a',) from kdf2.
        ...    new_series = (kdf[this_label] - kdf[that_label]).rename(str(this_label))
        ...
        ...    # This new series will be placed in new DataFrame.
        ...    yield (new_series, this_label)
        >>>
        >>>
        >>> align_diff_frames(func, kdf1, kdf2).sort_index()
           a
        0  0
        1  0
        2  0
        3  0
        4  0
        5  0
        6  0
        7  0
        8  0
        >>> reset_option("compute.ops_on_diff_frames")

    :param this: a DataFrame to align
    :param that: another DataFrame to align
    :param fillna: If True, it fills missing values in non-common columns of both `this` and `that`.
        Otherwise, the non-common columns are returned as they are.
    :param how: the join method. It also affects how `resolve_func` resolves the column conflict.
        - full: `resolve_func` should resolve only common columns from the 'this' and 'that' DataFrames.
            For instance, if 'this' has columns A, B, C and 'that' has B, C, D, then `this_columns`
            and `that_columns` in this function are B, C and B, C.
        - left: `resolve_func` should resolve columns including the 'that' columns.
            For instance, if 'this' has columns A, B, C and 'that' has B, C, D, then `this_columns` is
            B, C but `that_columns` are B, C, D.
        - inner: Same as the 'full' mode, but an inner join is performed internally.
    :return: Aligned DataFrame
    """
    from pyspark.pandas.frame import DataFrame

    assert how == "full" or how == "left" or how == "inner"

    this_column_labels = this._internal.column_labels
    that_column_labels = that._internal.column_labels
    common_column_labels = set(this_column_labels).intersection(
        that_column_labels)

    # 1. Perform the join given two dataframes.
    combined = combine_frames(this,
                              that,
                              how=how,
                              preserve_order_column=preserve_order_column)

    # 2. Apply the given function to transform the columns in a batch and keep the new columns.
    combined_column_labels = combined._internal.column_labels

    that_columns_to_apply = []
    this_columns_to_apply = []
    additional_that_columns = []
    columns_to_keep = []
    column_labels_to_keep = []

    for combined_label in combined_column_labels:
        for common_label in common_column_labels:
            if combined_label == tuple(["this", *common_label]):
                this_columns_to_apply.append(combined_label)
                break
            elif combined_label == tuple(["that", *common_label]):
                that_columns_to_apply.append(combined_label)
                break
        else:
            if how == "left" and combined_label in [
                    tuple(["that", *label]) for label in that_column_labels
            ]:
                # In this case, we will drop `that_columns` in `columns_to_keep` but pass
                # them later to `func`. `func` should resolve them.
                # Note that adding this into a separate list (`additional_that_columns`)
                # is intentional so that `this_columns` and `that_columns` can be paired.
                additional_that_columns.append(combined_label)
            elif fillna:
                columns_to_keep.append(
                    F.lit(None).cast(DoubleType()).alias(str(combined_label)))
                column_labels_to_keep.append(combined_label)
            else:
                columns_to_keep.append(combined._kser_for(combined_label))
                column_labels_to_keep.append(combined_label)

    that_columns_to_apply += additional_that_columns

    # Should extract columns to apply and do it in a batch in case
    # it adds new columns for example.
    if len(this_columns_to_apply) > 0 or len(that_columns_to_apply) > 0:
        kser_set, column_labels_applied = zip(*resolve_func(
            combined, this_columns_to_apply, that_columns_to_apply))
        columns_applied = list(kser_set)
        column_labels_applied = list(column_labels_applied)
    else:
        columns_applied = []
        column_labels_applied = []

    applied = DataFrame(
        combined._internal.with_new_columns(
            columns_applied + columns_to_keep,
            column_labels=column_labels_applied + column_labels_to_keep,
        ))  # type: DataFrame

    # 3. Restore the names back and deduplicate columns.
    this_labels = OrderedDict()
    # Add columns in an order of its original frame.
    for this_label in this_column_labels:
        for new_label in applied._internal.column_labels:
            if new_label[1:] not in this_labels and this_label == new_label[1:]:
                this_labels[new_label[1:]] = new_label

    # After that, we will add the rest columns.
    other_labels = OrderedDict()
    for new_label in applied._internal.column_labels:
        if new_label[1:] not in this_labels:
            other_labels[new_label[1:]] = new_label

    kdf = applied[list(this_labels.values()) + list(other_labels.values())]
    kdf.columns = kdf.columns.droplevel()
    return kdf
    def test_supported_types(self):

        values = [
            1, 2, 3, 4, 5, 1.1, 2.2,
            Decimal(1.123), [1, 2, 2], True, 'hello',
            bytearray([0x01, 0x02])
        ]
        output_fields = [('id', IntegerType()), ('byte', ByteType()),
                         ('short', ShortType()), ('int', IntegerType()),
                         ('long', LongType()), ('float', FloatType()),
                         ('double', DoubleType()),
                         ('decim', DecimalType(10, 3)),
                         ('array', ArrayType(IntegerType())),
                         ('bool', BooleanType()), ('str', StringType()),
                         ('bin', BinaryType())]

        output_schema = StructType([StructField(*x) for x in output_fields])
        df = self.spark.createDataFrame([values], schema=output_schema)

        # Different forms of grouped map pandas UDF; the results of these are the same
        udf1 = pandas_udf(
            lambda pdf: pdf.assign(byte=pdf.byte * 2,
                                   short=pdf.short * 2,
                                   int=pdf.int * 2,
                                   long=pdf.long * 2,
                                   float=pdf.float * 2,
                                   double=pdf.double * 2,
                                   decim=pdf.decim * 2,
                                   bool=False if pdf.bool else True,
                                   str=pdf.str + 'there',
                                   array=pdf.array,
                                   bin=pdf.bin), output_schema,
            PandasUDFType.GROUPED_MAP)

        udf2 = pandas_udf(
            lambda _, pdf: pdf.assign(byte=pdf.byte * 2,
                                      short=pdf.short * 2,
                                      int=pdf.int * 2,
                                      long=pdf.long * 2,
                                      float=pdf.float * 2,
                                      double=pdf.double * 2,
                                      decim=pdf.decim * 2,
                                      bool=False if pdf.bool else True,
                                      str=pdf.str + 'there',
                                      array=pdf.array,
                                      bin=pdf.bin), output_schema,
            PandasUDFType.GROUPED_MAP)

        udf3 = pandas_udf(
            lambda key, pdf: pdf.assign(id=key[0],
                                        byte=pdf.byte * 2,
                                        short=pdf.short * 2,
                                        int=pdf.int * 2,
                                        long=pdf.long * 2,
                                        float=pdf.float * 2,
                                        double=pdf.double * 2,
                                        decim=pdf.decim * 2,
                                        bool=False if pdf.bool else True,
                                        str=pdf.str + 'there',
                                        array=pdf.array,
                                        bin=pdf.bin), output_schema,
            PandasUDFType.GROUPED_MAP)

        result1 = df.groupby('id').apply(udf1).sort('id').toPandas()
        expected1 = df.toPandas().groupby('id').apply(
            udf1.func).reset_index(drop=True)

        result2 = df.groupby('id').apply(udf2).sort('id').toPandas()
        expected2 = expected1

        result3 = df.groupby('id').apply(udf3).sort('id').toPandas()
        expected3 = expected1

        assert_frame_equal(expected1, result1)
        assert_frame_equal(expected2, result2)
        assert_frame_equal(expected3, result3)
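# A standalone sketch (not part of the test class above): since Spark 3.0 the
# same grouped-map pattern is usually written with applyInPandas instead of
# PandasUDFType.GROUPED_MAP. Assumes an existing SparkSession named `spark`.
import pandas as pd

sdf = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0)], ("id", "v"))

def double_v(pdf: pd.DataFrame) -> pd.DataFrame:
    # pdf contains all rows of one `id` group as a pandas DataFrame.
    return pdf.assign(v=pdf.v * 2)

sdf.groupby("id").applyInPandas(double_v, schema="id long, v double").show()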
    def read_data(self):

        userSchema = StructType([
                StructField('medallion', StringType()),
                StructField('pickup_time', TimestampType()),
                StructField('total_amount', DoubleType()),
                ])

        self.fare = self.spark \
            .readStream \
            .format("kafka") \
            .option("kafka.bootstrap.servers", "localhost:9092") \
            .option("subscribe", "nycfare1") \
            .option("startingOffsets", "earliest") \
            .option('failOnDataLoss','false') \
            .option("maxOffsetsPerTrigger", 1000) \
            .load()

        self.df_fare = self.fare.selectExpr("CAST(value as STRING) as json") \
                   .select(from_json("json", userSchema).alias('data'))\
                   .selectExpr(
                        "data.medallion as medallion_fare",
                        "cast (data.pickup_time as timestamp) as pickup_time_fare",
                        "cast (data.total_amount as float)",
                    )

        userSchema = StructType([
            StructField('medallion', StringType()),
            StructField('pickup_time', TimestampType()),
            StructField('dropoff_time', TimestampType()),
            StructField('passenger_count', IntegerType()),
            StructField('trip_time', IntegerType()),
            StructField('trip_distance', DoubleType()),
            StructField('pickup_loc', MapType(StringType(), DoubleType())),
            StructField('dropoff_loc', MapType(StringType(), DoubleType()))
        ])

        self.trip = self.spark \
            .readStream \
            .format("kafka") \
            .option("kafka.bootstrap.servers", "localhost:9092") \
            .option("subscribe", "nycspeed9") \
            .option("startingOffsets", "earliest") \
            .option('failOnDataLoss', 'false') \
            .option("maxOffsetsPerTrigger", 1000) \
            .load()

        self.df_trip = self.trip.selectExpr("CAST(value as STRING) as json") \
            .select(from_json("json", userSchema).alias('data')) \
            .selectExpr(
            "data.medallion as medallion_trip",
            "cast (data.pickup_time as timestamp) as pickup_time_trip",
            "cast (data.dropoff_time as timestamp)",
            "cast (data.passenger_count as integer)",
            "cast (data.trip_time as integer)",
            "cast (data.trip_distance as float)",
            "cast (data.pickup_loc.lat as float) as pickup_loc_lat",
            # "cast data.pickup_loc.lat as pickup_loc_lat"
            "cast (data.pickup_loc.lon as float) as pickup_loc_lon",
            # "cast data.pickup_loc.lon as pickup_loc_lon",
            "cast (data.dropoff_loc.lat as float) as dropoff_loc_lat",
            # "cast data.dropoff_loc.lat as dropoff_loc_lat",
            "cast (data.dropoff_loc.lon as float) as dropoff_loc_lon",
            # "cast data.dropoff_loc.lon as dropoff_loc_lon"
        )

        self.df_trip.printSchema()

        self.df = self.df_trip.join(
            self.df_fare,
            expr("""
            medallion_trip = medallion_fare AND
            pickup_time_trip >= pickup_time_fare - interval 1 hour AND
            pickup_time_trip <= pickup_time_fare + interval 1 hour
            """)
        )

        self.df \
            .writeStream \
            .outputMode("append") \
            .format("console") \
            .option('truncate', 'false') \
            .option('numRows', 20) \
            .start() \
            .awaitTermination()

        query = self.windowedCounts.writeStream \
            .outputMode("append") \
            .queryName("writing_to_es") \
            .format("org.elasticsearch.spark.sql") \
            .option("checkpointLocation", "/tmp/1") \
            .option("es.nodes", "localhost") \
            .option("es.port", "9200") \
            .option("es.resource", "nycfare2/_doc") \

        query.start().awaitTermination()
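        # Note: the stream-stream join above has no watermarks, so Spark keeps
        # unbounded join state. A hedged sketch of how watermarks would
        # typically be added before the join (an assumption about intent, not
        # part of the original code):
        #
        #     df_trip_wm = self.df_trip.withWatermark("pickup_time_trip", "2 hours")
        #     df_fare_wm = self.df_fare.withWatermark("pickup_time_fare", "2 hours")
        #     self.df = df_trip_wm.join(df_fare_wm, expr("""
        #         medallion_trip = medallion_fare AND
        #         pickup_time_trip >= pickup_time_fare - interval 1 hour AND
        #         pickup_time_trip <= pickup_time_fare + interval 1 hour"""))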
Example #6
    def test_infer_schema_from_pandas_instances(self):
        def func() -> pd.Series[int]:
            pass

        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtype, np.int64)
        self.assertEqual(inferred.spark_type, LongType())

        def func() -> pd.Series[np.float]:
            pass

        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtype, np.float64)
        self.assertEqual(inferred.spark_type, DoubleType())

        def func() -> "pd.DataFrame[np.float, str]":
            pass

        expected = StructType(
            [StructField("c0", DoubleType()),
             StructField("c1", StringType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.float64, np.unicode_])
        self.assertEqual(inferred.spark_type, expected)

        def func() -> "pandas.DataFrame[np.float]":
            pass

        expected = StructType([StructField("c0", DoubleType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.float64])
        self.assertEqual(inferred.spark_type, expected)

        def func() -> "pd.Series[int]":
            pass

        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtype, np.int64)
        self.assertEqual(inferred.spark_type, LongType())

        def func() -> pd.DataFrame[np.float, str]:
            pass

        expected = StructType(
            [StructField("c0", DoubleType()),
             StructField("c1", StringType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.float64, np.unicode_])
        self.assertEqual(inferred.spark_type, expected)

        def func() -> pd.DataFrame[np.float]:
            pass

        expected = StructType([StructField("c0", DoubleType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.float64])
        self.assertEqual(inferred.spark_type, expected)

        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})

        def func() -> pd.DataFrame[pdf.dtypes]:  # type: ignore
            pass

        expected = StructType(
            [StructField("c0", LongType()),
             StructField("c1", LongType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.int64, np.int64])
        self.assertEqual(inferred.spark_type, expected)

        pdf = pd.DataFrame({
            "a": [1, 2, 3],
            "b": pd.Categorical(["a", "b", "c"])
        })

        def func() -> pd.Series[pdf.b.dtype]:  # type: ignore
            pass

        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtype,
                         CategoricalDtype(categories=["a", "b", "c"]))
        self.assertEqual(inferred.spark_type, LongType())

        def func() -> pd.DataFrame[pdf.dtypes]:  # type: ignore
            pass

        expected = StructType(
            [StructField("c0", LongType()),
             StructField("c1", LongType())])
        inferred = infer_return_type(func)
        self.assertEqual(
            inferred.dtypes,
            [np.int64, CategoricalDtype(categories=["a", "b", "c"])])
        self.assertEqual(inferred.spark_type, expected)
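# A standalone sketch (not part of the test class above) of how such return
# type hints are used in practice with pandas-on-Spark; `apply_batch` and the
# single-column example data are assumptions for illustration.
import pyspark.pandas as ps

psdf = ps.DataFrame({"a": [1.0, 2.0, 3.0]})

def double_values(pdf) -> ps.DataFrame[float]:
    # pdf arrives as a plain pandas DataFrame chunk; the return annotation
    # fixes the output schema (the column is renamed to c0).
    return pdf * 2

doubled = psdf.pandas_on_spark.apply_batch(double_values)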
Example #7
    mnistDF = spark.createDataFrame(pd_df)
    (trainingDF, validationDF) = mnistDF.randomSplit([0.8, 0.2])
    trainingDF.show()

    # define the loss with the PyTorch API
    def lossFunc(input, target):
        return nn.CrossEntropyLoss().forward(input, target.flatten().long())

    torch_model = LeNet()
    model = TorchNet.from_pytorch(torch_model, [1, 1, 28, 28])
    criterion = TorchCriterion.from_pytorch(lossFunc, [1, 10],
                                            torch.LongTensor([5]))
    classifier = NNClassifier(model, criterion, SeqToTensor([1, 28, 28])) \
        .setBatchSize(256) \
        .setOptimMethod(Adam()) \
        .setLearningRate(0.001)\
        .setMaxEpoch(2)

    nnClassifierModel = classifier.fit(trainingDF)

    print("After training: ")
    shift = udf(lambda p: p - 1, DoubleType())
    res = nnClassifierModel.transform(validationDF) \
        .withColumn("prediction", shift(col('prediction')))
    res.show(100)

    correct = res.filter("label=prediction").count()
    overall = res.count()
    accuracy = correct * 1.0 / overall
    print("Validation accuracy = %g " % accuracy)
Example #8
# MAGIC %md
# MAGIC ### Step 1: Declare the schema.
# MAGIC
# MAGIC This is a list of field names and data types.

# COMMAND ----------

from pyspark.sql.types import DoubleType, IntegerType, StringType, StructField, StructType

csvSchema = StructType([
    StructField("ProductID", IntegerType()),
    StructField("Name", StringType()),
    StructField("ProductNumber", StringType()),
    StructField("Color", StringType()),
    StructField("StandardCost", DoubleType()),
    StructField("ListPrice", DoubleType()),
    StructField("Size", StringType()),
    StructField("Weight", StringType()),
    StructField("ProductCategoryID", IntegerType()),
    StructField("ProductModelID", IntegerType()),
    StructField("SellStartDate", StringType()),
    StructField("SellEndDate", StringType()),
    StructField("DiscountedDate", StringType()),
    StructField("ThumbNailPhoto", StringType()),
    StructField("ThumbnailPhotoFileName", StringType()),
    StructField("rowguid", StringType()),
    StructField("ModifiedDate", StringType())
])
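
# COMMAND ----------

# A hedged usage sketch: reading a CSV with the schema declared above. The
# path below is a placeholder, not taken from the original notebook.
productsDF = (spark.read
              .option("header", "true")
              .schema(csvSchema)
              .csv("/mnt/training/placeholder/products.csv"))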

# COMMAND ----------
    StructField("ID", StringType(), True),
    StructField("CaseNumber", StringType(), True),
    StructField("Date", StringType(), True),
    StructField("Block", StringType(), True),
    StructField("IUCR", StringType(), True),
    StructField("PrimaryType", StringType(), True),
    StructField("Description", StringType(), True),
    StructField("LocationDescription", StringType(), True),
    StructField("Arrest", BooleanType(), True),
    StructField("Domestic", BooleanType(), True),
    StructField("Beat", StringType(), True),
    StructField("District", StringType(), True),
    StructField("Ward", StringType(), True),
    StructField("CommunityArea", StringType(), True),
    StructField("FBICode", StringType(), True),
    StructField("XCoordinate", DoubleType(), True),
    StructField("YCoordinate", DoubleType(), True),
    StructField("Year", IntegerType(), True),
    StructField("UpdatedOn", DateType(), True),
    StructField("Latitude", DoubleType(), True),
    StructField("Longitude", DoubleType(), True),
    StructField("Location", StringType(), True)
])

#crimes = spark.read.csv("gs://chic_crime/version1/ccd_sample.csv",header = True,schema = crimes_schema)
crimes = spark.read.csv("Chicago_Crimes_2012_to_2017.csv",
                        header=True,
                        schema=crimes_schema)
print(" The crimes dataframe has {} records".format(crimes.count()))
print(crimes.select("PrimaryType").distinct().show(n=5))
Example #10
    # def close(self, error):
    #     # Close the connection. This method is optional in Python.
    #     pass


streamingDF = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "time-series") \
    .option('includeTimestamp', 'true') \
    .load()

deserialize_row_udf = udf(lambda x: deserialize_avro_column_row(x),
                          DoubleType())

deserialized_value_dataframe = streamingDF.withColumn(
    'deserialized_value', deserialize_row_udf("value"))
deserialized_value_dataframe = deserialized_value_dataframe.select(
    ['key', 'timestamp', 'deserialized_value'])

deserialized_value_dataframe = deserialized_value_dataframe.drop('value')
deserialized_value_dataframe = deserialized_value_dataframe.withColumnRenamed(
    'deserialized_value', 'value')

#     .outputMode("append")\


class ForeachWriter:
    def open(self, partition_id, epoch_id):
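        # The original snippet stops here; below is a minimal sketch of the
        # foreach-sink protocol (open/process/close) with placeholder logic,
        # not the original implementation.
        # Returning True means this partition should be processed for this epoch.
        return True

    def process(self, row):
        # Called once per row of the micro-batch partition; a real writer
        # would send `row` to an external system here.
        print(row)

    def close(self, error):
        # Called after processing the partition; `error` is None on success.
        pass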
Example #11
schemaNames = df_s.columns
labels = schemaNames[0]
feature_names = schemaNames[1:]

column_names = [
    'labels', 'lepton_pT', 'lepton_eta', 'lepton_phi',
    'missing_energy_magnitude', 'missing_energy_phi', 'jet_1_pt', 'jet_1_eta',
    'jet_1_phi', 'jet_1_b_tag', 'jet_2_pt', 'jet_2_eta', 'jet_2_phi',
    'jet_2_b-tag', 'jet_3_pt', 'jet_3_eta', 'jet_3_phi', 'jet_3_b-tag',
    'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag', 'm_jj', 'm_jjj',
    'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb'
]

# Changing labels types
df_s = df_s.withColumn(labels,
                       df_s[labels].cast(DoubleType()).cast(IntegerType()))

# Changing types of data in columns
for column in feature_names:
    df_s = df_s.withColumn(column, df_s[column].cast(DoubleType()))

print("Split Train/Test data...")
(trainingData, testData) = df_s.randomSplit([0.7, 0.3], 123)
trainingData.cache()
testData.cache()

assembler = VectorAssembler(inputCols=feature_names, outputCol='features')
trainingData = assembler.transform(trainingData).select("features", labels)
testData = assembler.transform(testData).select("features", labels)

trainingData.cache(), testData.cache()
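# The snippet ends before the modelling step; a hedged sketch of how the
# assembled features are typically used next (the classifier choice here is an
# assumption, not taken from the source):
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='features', labelCol=labels, maxIter=10)
lr_model = lr.fit(trainingData)
lr_model.transform(testData).select(labels, 'prediction').show(5)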
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

spark = SparkSession\
    .builder\
    .appName("Test_number_of_stages")\
    .getOrCreate()

schema_ratings = StructType([
    StructField("userId", IntegerType()),
    StructField("movieId", IntegerType()),
    StructField("rating", DoubleType()),
    StructField("timestamp", StringType())
])

schema_movies = StructType([
    StructField("ID_movie", IntegerType()),
    StructField("Name_movie", StringType()),
    StructField("Stype_movie", StringType())
])
# stage load data
df_ratings = spark.read.format("csv").schema(schema_ratings).load(
    "/nhatthanh/data/ml-20m/ratings.csv")
df_movies = spark.read.format("csv").schema(schema_movies).load(
    "/nhatthanh/data/ml-20m/movies.csv")

df_ratings.show()
df_movies.show()
# stage join
# ra = df_ratings.alias('ra')
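# A hedged sketch of the join stage hinted at by the comments above: joining
# ratings with movie metadata on the movie id columns declared in the two
# schemas (an assumption about intent, not the original code).
df_joined = df_ratings.join(df_movies,
                            df_ratings.movieId == df_movies.ID_movie,
                            "inner")
df_joined.show()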
Example #13
    def setUpClass(cls):
        from datetime import date, datetime
        from decimal import Decimal

        super(ArrowTests, cls).setUpClass()
        cls.warnings_lock = threading.Lock()

        # Synchronize default timezone between Python and Java
        cls.tz_prev = os.environ.get("TZ", None)  # save current tz if set
        tz = "America/Los_Angeles"
        os.environ["TZ"] = tz
        time.tzset()

        cls.spark.conf.set("spark.sql.session.timeZone", tz)

        # Test fallback
        cls.spark.conf.set("spark.sql.execution.arrow.enabled", "false")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.enabled") == "false"
        cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.enabled") == "true"

        cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled",
                           "true")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.fallback.enabled") == "true"
        cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled",
                           "false")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.fallback.enabled") == "false"

        # Enable Arrow optimization in these tests.
        cls.spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
        # Disable fallback by default to easily detect the failures.
        cls.spark.conf.set(
            "spark.sql.execution.arrow.pyspark.fallback.enabled", "false")

        cls.schema_wo_null = StructType([
            StructField("1_str_t", StringType(), True),
            StructField("2_int_t", IntegerType(), True),
            StructField("3_long_t", LongType(), True),
            StructField("4_float_t", FloatType(), True),
            StructField("5_double_t", DoubleType(), True),
            StructField("6_decimal_t", DecimalType(38, 18), True),
            StructField("7_date_t", DateType(), True),
            StructField("8_timestamp_t", TimestampType(), True),
            StructField("9_binary_t", BinaryType(), True),
        ])
        cls.schema = cls.schema_wo_null.add("10_null_t", NullType(), True)
        cls.data_wo_null = [
            (
                "a",
                1,
                10,
                0.2,
                2.0,
                Decimal("2.0"),
                date(1969, 1, 1),
                datetime(1969, 1, 1, 1, 1, 1),
                bytearray(b"a"),
            ),
            (
                "b",
                2,
                20,
                0.4,
                4.0,
                Decimal("4.0"),
                date(2012, 2, 2),
                datetime(2012, 2, 2, 2, 2, 2),
                bytearray(b"bb"),
            ),
            (
                "c",
                3,
                30,
                0.8,
                6.0,
                Decimal("6.0"),
                date(2100, 3, 3),
                datetime(2100, 3, 3, 3, 3, 3),
                bytearray(b"ccc"),
            ),
            (
                "d",
                4,
                40,
                1.0,
                8.0,
                Decimal("8.0"),
                date(2262, 4, 12),
                datetime(2262, 3, 3, 3, 3, 3),
                bytearray(b"dddd"),
            ),
        ]
        cls.data = [tuple(list(d) + [None]) for d in cls.data_wo_null]
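# A standalone usage sketch (not part of the test class above), assuming an
# existing SparkSession `spark` and DataFrame `df`: with the config below set,
# toPandas() converts via Arrow instead of collecting row by row.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
pdf = df.toPandas()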
Example #14
 def test_wrong_args(self):
     left = self.data1
     right = self.data2
     with self.assertRaisesRegex(ValueError, "Invalid function"):
         left.groupby("id").cogroup(right.groupby("id")).applyInPandas(
             lambda: 1, StructType([StructField("d", DoubleType())]))
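# For contrast with the invalid call above, a hedged sketch of a valid
# cogrouped applyInPandas call: the function receives the two group
# DataFrames (left and right) as pandas DataFrames and must return a pandas
# DataFrame matching the declared schema. `df1` and `df2` are assumed Spark
# DataFrames that both have `id` and `v` columns.
import pandas as pd
from pyspark.sql.types import StructType, StructField, DoubleType

def summed(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
    # Placeholder logic: sum the `v` columns from both sides of the cogroup.
    return pd.DataFrame({"d": [float(left.v.sum() + right.v.sum())]})

result = (df1.groupby("id")
          .cogroup(df2.groupby("id"))
          .applyInPandas(summed, StructType([StructField("d", DoubleType())])))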
Example #15
    def test_infer_schema_with_names_pandas_instances(self):
        def func() -> 'pd.DataFrame["a" : np.float, "b":str]':  # noqa: F821
            pass

        expected = StructType(
            [StructField("a", DoubleType()),
             StructField("b", StringType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.float64, np.unicode_])
        self.assertEqual(inferred.spark_type, expected)

        def func() -> "pd.DataFrame['a': np.float, 'b': int]":  # noqa: F821
            pass

        expected = StructType(
            [StructField("a", DoubleType()),
             StructField("b", LongType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.float64, np.int64])
        self.assertEqual(inferred.spark_type, expected)

        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})

        def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
            pass

        expected = StructType(
            [StructField("a", LongType()),
             StructField("b", LongType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.int64, np.int64])
        self.assertEqual(inferred.spark_type, expected)

        pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})

        def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
            pass

        expected = StructType([
            StructField("(x, a)", LongType()),
            StructField("(y, b)", LongType())
        ])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.int64, np.int64])
        self.assertEqual(inferred.spark_type, expected)

        pdf = pd.DataFrame({
            "a": [1, 2, 3],
            "b": pd.Categorical(["a", "b", "c"])
        })

        def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
            pass

        expected = StructType(
            [StructField("a", LongType()),
             StructField("b", LongType())])
        inferred = infer_return_type(func)
        self.assertEqual(
            inferred.dtypes,
            [np.int64, CategoricalDtype(categories=["a", "b", "c"])])
        self.assertEqual(inferred.spark_type, expected)
Example #16
 def _udf(f, returnType=DoubleType(), arg_type="pandas"):
     return FlintUserDefinedFunction(f, returnType, arg_type=arg_type)
Example #17
    def test_as_spark_type_koalas_dtype(self):
        type_mapper = {
            # binary
            np.character: (np.character, BinaryType()),
            np.bytes_: (np.bytes_, BinaryType()),
            np.string_: (np.bytes_, BinaryType()),
            bytes: (np.bytes_, BinaryType()),
            # integer
            np.int8: (np.int8, ByteType()),
            np.byte: (np.int8, ByteType()),
            np.int16: (np.int16, ShortType()),
            np.int32: (np.int32, IntegerType()),
            np.int64: (np.int64, LongType()),
            np.int: (np.int64, LongType()),
            int: (np.int64, LongType()),
            # floating
            np.float32: (np.float32, FloatType()),
            np.float: (np.float64, DoubleType()),
            np.float64: (np.float64, DoubleType()),
            float: (np.float64, DoubleType()),
            # string
            np.str: (np.unicode_, StringType()),
            np.unicode_: (np.unicode_, StringType()),
            str: (np.unicode_, StringType()),
            # bool
            np.bool: (np.bool, BooleanType()),
            bool: (np.bool, BooleanType()),
            # datetime
            np.datetime64: (np.datetime64, TimestampType()),
            datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
            # DateType
            datetime.date: (np.dtype("object"), DateType()),
            # DecimalType
            decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
            # ArrayType
            np.ndarray: (np.dtype("object"), ArrayType(StringType())),
            List[bytes]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.character]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.string_]: (np.dtype("object"), ArrayType(BinaryType())),
            List[bool]: (np.dtype("object"), ArrayType(BooleanType())),
            List[np.bool]: (np.dtype("object"), ArrayType(BooleanType())),
            List[datetime.date]: (np.dtype("object"), ArrayType(DateType())),
            List[np.int8]: (np.dtype("object"), ArrayType(ByteType())),
            List[np.byte]: (np.dtype("object"), ArrayType(ByteType())),
            List[decimal.Decimal]:
            (np.dtype("object"), ArrayType(DecimalType(38, 18))),
            List[float]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float32]: (np.dtype("object"), ArrayType(FloatType())),
            List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())),
            List[int]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int64]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int16]: (np.dtype("object"), ArrayType(ShortType())),
            List[str]: (np.dtype("object"), ArrayType(StringType())),
            List[np.unicode_]: (np.dtype("object"), ArrayType(StringType())),
            List[datetime.datetime]:
            (np.dtype("object"), ArrayType(TimestampType())),
            List[np.datetime64]:
            (np.dtype("object"), ArrayType(TimestampType())),
            # CategoricalDtype
            CategoricalDtype(categories=["a", "b", "c"]): (
                CategoricalDtype(categories=["a", "b", "c"]),
                LongType(),
            ),
        }

        for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
            self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
            self.assertEqual(koalas_dtype(numpy_or_python_type),
                             (dtype, spark_type))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            as_spark_type(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            as_spark_type(np.dtype("object"))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            koalas_dtype(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            koalas_dtype(np.dtype("object"))
Example #18
def udf(f=None, returnType=DoubleType(), arg_type="pandas"):
    # Modified from
    # https://github.com/apache/spark/blob/master/python/pyspark/sql/functions.py
    # to add additional supports for Flint

    '''Creates a column expression representing a user defined
    function (UDF).

    This behaves the same as :meth:`~pyspark.sql.functions.udf` when
    used with a PySpark function, such as
    :meth:`~pyspark.sql.DataFrame.withColumn`.

    This can also be used with Flint functions, such as
    :meth:`ts.flint.TimeSeriesDataFrame.summarizeCycles`.

    This can be used to define a row user-defined function or
    a columnar user-defined function:

    1. Row udf

       A row udf takes one or more scalar values for each
       row, and returns a scalar value for that row.

       A :class:`~pyspark.sql.Column` object is needed to specify
       the input, for instance, ``df['v']``.

       Example:

           >>> @udf(DoubleType())
           >>> def plus_one(v):
           ...     return v+1
           >>> col = plus_one(df['v'])

    2. Pandas Columnar udf

       A pandas columnar udf takes one or more :class:`pandas.Series` or
       :class:`pandas.DataFrame` as input, and returns either a scalar
       value or a :class:`pandas.Series` as output.

       If the user function takes :class:`pandas.Series`, a
       :class:`~pyspark.sql.Column` is needed to specify the input,
       for instance, ``df['v']``.

       If the user function takes a :class:`pandas.DataFrame`, a
       :class:`~pyspark.sql.DataFrame` is needed to specify the input,
       for instance, ``df[['v', 'w']]``.

       Default return type is DoubleType.

       Example:

       Takes :class:`pandas.Series`, returns a scalar

           >>> @udf(DoubleType())
           >>> def weighted_mean(v, w):
           ...     return numpy.average(v, weights=w)
           >>> col = weighted_mean(df['v'], df['w'])

       Takes a :class:`pandas.DataFrame`, returns a scalar

           >>> @udf(DoubleType())
           >>> def weighted_mean(df):
           ...     return numpy.average(df.v, weights=df.w)
           >>> col = weighted_mean(df[['v', 'w']])

       Takes a :class:`pandas.Series`, returns a
          :class:`pandas.Series`

           >>> @udf(DoubleType())
           >>> def percent_rank(v):
           ...     return v.rank(pct=True)
           >>> col = percent_rank(df['v'])

       Different functions take different types of udf. For instance,

       * :meth:`pyspark.sql.DataFrame.withColumn` takes a row udf
       * :meth:`ts.flint.TimeSeriesDataFrame.summarizeCycles` takes a
         columnar udf that returns a scalar value.

    3. Numpy Columnar udf

       Numpy columnar udf is similar to pandas columnar udf. The main difference is that
       a numpy udf expects the function input to be numpy data structures and types, i.e.,
       numpy.ndarray or numpy.float64. When a named input is expected, the input to
       the udf would be a Python ordered dict from str to numpy.ndarray or a numpy
       primitive type.

       Numpy columnar udf is faster than pandas columnar udf, particularly in summarizeWindows,
       where the overhead of creating pandas.Series and pandas.DataFrame for each window can be
       large. Therefore, users should prefer numpy columnar udfs with summarizeWindows.

       Examples:

           >>> @udf(DoubleType(), arg_type='numpy')
           >>> def mean_udf(v):
           ...     # v is numpy.ndarray
           ...     return v.mean()
           >>> col = mean_udf(df['v'])

       .. seealso::

          :meth:`ts.flint.TimeSeriesDataFrame.summarizeCycles`
          :meth:`ts.flint.TimeSeriesDataFrame.addColumnsForCycles`
          :meth:`ts.flint.TimeSeriesDataFrame.summarizeIntervals`
          :meth:`ts.flint.TimeSeriesDataFrame.summarizeWindows`

    '''
    def _udf(f, returnType=DoubleType(), arg_type="pandas"):
        return FlintUserDefinedFunction(f, returnType, arg_type=arg_type)

    # decorator @udf, @udf(), @udf(dataType()) or @udf((dataType(), dataType()))
    if f is None or isinstance(f, (str, tuple, DataType)):
        # If DataType has been passed as a positional argument
        # for decorator use it as a returnType
        return_type = f or returnType
        return_type = _wrap_data_types(return_type)
        return functools.partial(_udf, returnType=return_type, arg_type=arg_type)
    else:
        return_type = _wrap_data_types(returnType)
        return _udf(f=f, returnType=return_type, arg_type=arg_type)
Example #19
# structure from tweet
dtypes = StructType([
    StructField("created_at", TimestampType(), True),
    StructField("tweet_id", StringType(), False),
    StructField("tweet", StringType(), False),
    StructField("likes", DecimalType(38, 0), False),
    StructField("retweet_count", DecimalType(38, 0), False),
    StructField("source", StringType(), True),
    StructField("user_id", DecimalType(38, 0), False),
    StructField("user_name", StringType(), True),
    StructField("user_screen_name", StringType(), False),
    StructField("user_description", StringType(), True),
    StructField("user_join_date", TimestampType(), True),
    StructField("user_followers_count", DecimalType(38, 0), False),
    StructField("user_location", StringType(), True),
    StructField("lat", DoubleType(), True),
    StructField("long", DoubleType(), True),
    StructField("city", StringType(), True),
    StructField("country", StringType(), True),
    StructField("continent", StringType(), True),
    StructField("state", StringType(), True),
    StructField("state_code", StringType(), True),
    StructField("collected_at", TimestampType(), False)
])

if __name__ == "__main__":
    filePaths = [
        "Resources/hashtag_donaldtrump.csv", "Resources/hashtag_joebiden.csv"
    ]
    schemes = [dtypes, dtypes]
    spark = createSpark("tweet creator")
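    # The snippet is cut off here; a hedged sketch of the likely next step
    # (an assumption about intent, and assuming createSpark returns a
    # SparkSession): read each CSV with the schema declared above and combine
    # the two candidate datasets.
    frames = [
        spark.read.csv(path, schema=schema, header=True)
        for path, schema in zip(filePaths, schemes)
    ]
    tweets_df = frames[0].unionByName(frames[1])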
#----------------------------------------------------------------------------
## Main functionality
if __name__ == "__main__":

    main_config_file_filter = None
    errorCount = 0

    workflowStartTime = datetime.datetime.now()
    if len(sys.argv) > 1:
        main_config_file = sys.argv[1]
    if len(sys.argv) > 2:
        main_config_file_filter = sys.argv[2]

    spark.udf.register('udfConvertInt', convertInt, IntegerType())
    spark.udf.register('udfConvertDouble', convertDouble, DoubleType())
    spark.udf.register('udfConvertDatetime', convertDatetime, TimestampType())

    mainConfig = spark.read.load(main_config_file,
                                 format="csv",
                                 delimiter="|",
                                 header=True)

    #Operation|LoadType|threads|Server|Database|t|WhereClause|DeltaColumn|UniqueIdentifiers|PartitionColumn|TargetLocationRaw|TargetLocationCooked|TargetLocationTableSchema|HiveDatabase|HiveTable|Comments

    if (main_config_file_filter is not None):
        mainConfig = mainConfig.filter(main_config_file_filter)

    for row in mainConfig.collect():
        try:
            print(
Example #21
            return False

        return True

    begin_re_pubmed = re.compile("^====")

    def is_text_pubmed(line):
        line = line.strip()
        if not line or begin_re_pubmed.match(line):
            return False

        return True

    schema = StructType([
        StructField("fullText", StringType(), True),
        StructField("category", DoubleType(), False)
    ])

    def load_article_wiki(category_name, category_id):
        text_file = spark.sparkContext.textFile("{}/*".format(category_name))
        return text_file.filter(is_text_wiki).map(
            lambda l: (l, float(category_id))).toDF(schema)

    def load_article_pubmed(category_name, category_id):
        text_file = spark.sparkContext.textFile("{}/*".format(category_name))
        return text_file.filter(is_text_pubmed).map(
            lambda l: (l, float(category_id))).toDF(schema)

    bio_articles = load_article_pubmed("pubmed-AF-combine", 0)
    other_articles = load_article_wiki("enwiki", 1)
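    # A hedged sketch of the natural next step (not in the original snippet):
    # combine the two labelled corpora and split them for training/evaluation.
    all_articles = bio_articles.union(other_articles)
    train_df, test_df = all_articles.randomSplit([0.8, 0.2], seed=42)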
Example #22
def noaa_s3_to_postgres(year):
    # first, handle the station info file
    station_schema = StructType([StructField('stationid', StringType(), True),\
                         StructField('latitude', DoubleType(), True),\
                         StructField('longitude', DoubleType(), True),\
                         StructField('elevation', DoubleType(), True),\
                         StructField('state', StringType(), True)])
    spark = SparkSession.builder.appName("Spark").config(
        "spark.driver.extraClassPath",
        "/home/ubuntu").config('spark.executor.extraClassPath',
                               '/home/ubuntu').getOrCreate()
    station_data = spark.read.csv(
        "s3a://yearly-weather-data/ghcnd-stations.csv",
        header=False,
        schema=station_schema)
    # now, deal with the annual file
    noaa_data_schema = StructType([StructField('stationid', StringType(), True),\
                                  StructField('obsdate', StringType(), True),\
                                  StructField('element', StringType(), True),\
                                  StructField('dataval', StringType(), True),\
                                  StructField('mflag', StringType(), True),\
                                  StructField('qflag', StringType(), True),\
                                  StructField('sflag', StringType(), True),\
                                  StructField('obstime', StringType(), True)])
    # first, load file from S3
    file_path = "s3a://yearly-weather-data/{}.csv".format(year)
    noaa_data = spark.read.csv(file_path,
                               header=False,
                               schema=noaa_data_schema)
    # clean data
    noaa_data = noaa_data.filter(noaa_data['element'].contains('TMAX'))
    noaa_data = noaa_data.filter(noaa_data['stationid'].contains('US'))
    noaa_data = noaa_data.filter(noaa_data['qflag'].isNull())
    # add month column
    noaa_data = noaa_data.withColumn(
        'month',
        substring('obsdate', 5, 2).cast(IntegerType()))
    # join to station data
    noaa_data = noaa_data.join(station_data, 'stationid',
                               'inner').drop('elevation', 'state', 'qflag',
                                             'obstime')
    # group data by month, longitude, latitude. then do average
    monthly_noaa_data = noaa_data.groupBy(
        'month', 'longitude',
        'latitude').agg(avg(col('dataval')).alias('dataval'))
    # now load noaa data to PostgreSQL
    newConnection = get_connection_by_config('database.ini',
                                             'postgresql_conn_data')
    cursor = newConnection.cursor()
    noaa_table = '''
                    DROP TABLE IF EXISTS noaa_{0}_avg;
                    CREATE TABLE noaa_{0}_avg (
                        month INTEGER, 
                        dataval INTEGER, 
                        latitude REAL,
                        longitude REAL
                    );
                    '''.format(year)
    cursor.execute(noaa_table)
    # add selected cols from dataframe to database
    insert_command = '''
        INSERT INTO noaa_{}_avg(month, dataval, latitude, longitude) VALUES %s
                    '''.format(year)
    noaa_arr = get_noaa_simple_array(monthly_noaa_data)
    execute_values(cursor, insert_command, noaa_arr, page_size=500)
    # add column with postgis point
    collComand = 'ALTER TABLE noaa_{}_avg ADD COLUMN geogcol geography(Point, 4326);'.format(
        year)
    cursor.execute(collComand)
    updateComand = 'UPDATE noaa_{}_avg SET geogcol = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326);'.format(
        year)
    cursor.execute(updateComand)
    # add index
    indexComand = 'CREATE INDEX noaa_{0}_geog_index ON noaa_{0}_avg (geogcol) ;'.format(
        year)
    cursor.execute(indexComand)
    # commit changes to database, close connection
    newConnection.commit()
    cursor.close()
    newConnection.close()
    print('Finished processing NOAA data year ' + year)
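# A hedged usage sketch (not in the original module): run the ETL above for a
# few years in sequence. The year values are placeholders.
if __name__ == "__main__":
    for year in ["2015", "2016", "2017"]:
        noaa_s3_to_postgres(year)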
from zoo.pipeline.api.keras.layers import Dense, Input, Flatten
from zoo.pipeline.api.keras.models import *
from zoo.pipeline.api.net import *
from zoo.pipeline.nnframes import *

sc = init_nncontext("ImageTransferLearningExample")

model_path = "hdfs:///user/leelau/zoo/demo/bigdl_inception-v1_imagenet_0.4.0.model"
image_path = "hdfs:///user/leelau/zoo/demo/*/*"
imageDF = NNImageReader.readImages(image_path, sc)

getName = udf(
    lambda row: re.search(r'(cat|dog)\.([\d]*)\.jpg', row[0], re.IGNORECASE).
    group(0), StringType())
getLabel = udf(lambda name: 1.0
               if name.startswith('cat') else 2.0, DoubleType())

labelDF = imageDF.withColumn("name", getName(col("image"))) \
        .withColumn("label", getLabel(col('name')))
(trainingDF, validationDF) = labelDF.randomSplit([0.9, 0.1])
labelDF.select("name", "label").show(10)

# Fine-tune a pre-trained model
# We fine-tune a pre-trained model by removing the last few layers, freezing the first few layers, and adding some new layers.
transformer = ChainedPreprocessing([
    RowToImageFeature(),
    ImageResize(256, 256),
    ImageCenterCrop(224, 224),
    ImageChannelNormalize(123.0, 117.0, 104.0),
    ImageMatToTensor(),
    ImageFeatureToTensor()
Example #24
def epa_s3_to_postgres(pollutant_name, year):
    pollutant_name_to_code = {'ozone': 44201, 'pm25': 88101, 'no2': 42602}
    epa_data_schema = StructType([StructField('statecode', StringType(), True),\
                                  StructField('countycode', StringType(), True),\
                                  StructField('sitecode', StringType(), True),\
                                  StructField('parameter_code', IntegerType(), True),\
                                  StructField('poc', IntegerType(), True),\
                                  StructField('latitude', DoubleType(), True),\
                                  StructField('longitude', DoubleType(), True),\
                                  StructField('datum', StringType(), True),\
                                  StructField('pollutantname', StringType(), True),\
                                  StructField('sample_duration', StringType(), True),\
                                  StructField('pollutant_standard', StringType(), True),\
                                  StructField('obsdate', StringType(), True), \
                                  StructField('unit', StringType(), True),\
                                  StructField('event_type', StringType(), True),\
                                  StructField('observation_count', IntegerType(), True),\
                                  StructField('observation_percent', DoubleType(), True),\
                                  StructField('dataval', DoubleType(), True),\
                                  StructField('first_max_value', DoubleType(), True),\
                                  StructField('first_max_hour', IntegerType(), True),\
                                  StructField('aqi', IntegerType(), True),\
                                  StructField('method_code', StringType(), True),\
                                  StructField('method_name', StringType(), True),\
                                  StructField('local_site_name', StringType(), True),\
                                  StructField('address', StringType(), True),\
                                  StructField('state_name', StringType(), True),\
                                  StructField('county_name', StringType(), True),\
                                  StructField('city_name', StringType(), True),\
                                  StructField('cbsa_name', StringType(), True),\
                                  StructField('date_of_last_change', DateType(), True)])

    spark = SparkSession.builder.appName("Spark").config(
        "spark.driver.extraClassPath",
        "/home/ubuntu").config('spark.executor.extraClassPath',
                               '/home/ubuntu').getOrCreate()
    # first, load file from S3
    file_path = 's3a://epa-aq-data/daily_{0}_{1}.csv'.format(
        pollutant_name_to_code[pollutant_name], year)
    epa_data = spark.read.csv(file_path, header=True, schema=epa_data_schema)
    print('got the epa spark data frame!')
    # add month, do averaging by month
    epa_data = epa_data.withColumn(
        'month',
        substring('obsdate', 6, 2).cast(IntegerType()))
    monthly_epa_data = epa_data.groupBy('month', 'longitude', 'latitude').agg(
        avg(col('dataval')).alias('dataval'))
    # now load epa data to PostgreSQL
    newConnection = get_connection_by_config('database.ini',
                                             'postgresql_conn_data')
    cursor = newConnection.cursor()
    pollutantTable = '''
                    DROP TABLE IF EXISTS {0}_{1}_avg;
                    CREATE TABLE {0}_{1}_avg (
                        month INTEGER, 
                        dataval REAL, 
                        latitude REAL,
                        longitude REAL 
                    );
                    '''.format(pollutant_name, year)
    cursor.execute(pollutantTable)
    print("made pollutant table in epageo database")
    # add selected cols from dataframe to database
    insert_command = '''
        INSERT INTO {0}_{1}_avg(month, dataval, latitude, longitude) VALUES %s
                    '''.format(pollutant_name, year)
    epa_arr = get_epa_simple_array(monthly_epa_data)
    execute_values(cursor, insert_command, epa_arr, page_size=500)
    # add column with postgis point
    collComand = 'ALTER TABLE {0}_{1}_avg ADD COLUMN geogcol geography(Point, 4326);'.format(
        pollutant_name, year)
    cursor.execute(collComand)
    updateComand = 'UPDATE {0}_{1}_avg SET geogcol = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326);'.format(
        pollutant_name, year)
    cursor.execute(updateComand)
    # add index
    indexComand = 'CREATE INDEX {0}_{1}_geog_index ON {0}_{1}_avg (geogcol) ;'.format(
        pollutant_name, year)
    cursor.execute(indexComand)
    # commit changes to database, close connection
    newConnection.commit()
    cursor.close()
    newConnection.close()
    print('Finished processing epa data pollutant {0} year {1}'.format(
        pollutant_name, year))
Example #25
if __name__ == "__main__":
    sc = SparkContext()
    spark = SparkSession.builder.master("local").appName("Word Count").config(
        "spark.some.config.option", "some-value").getOrCreate()

    num_trees = 50
    max_depth = 25

    df_train, df_test = get_dataframe(NUM_FEA)

    # Random Forest Classification
    ##########################################################################
    rf = RandomForestClassifier(numTrees=num_trees, maxDepth=max_depth)
    model = rf.fit(
        df_train.withColumn("label", df_train["label"].cast(DoubleType())))

    pred = model.transform(df_test)
    pred = pred.withColumn("prediction", pred["prediction"].cast("int"))

    y_test = pred.select(
        "docid",
        "prediction").rdd.map(tuple).sortByKey().map(lambda x: x[1]).collect()

    # Accuracy
    # rdd_ytest = sc.textFile('gs://uga-dsp/project1/files/y_small_test.txt')
    # accuracy = cal_accuracy(rdd_ytest.collect(), y_test)
    # print('Testing Accuracy: %.2f %%' % (accuracy*100))
    # print('**********************************************')

    # Output file
print("Fitting for Submittal")
predictions = pipeline_model.transform(df)
predictions.select("MachineIdentifier", "probability",
                   "prediction").show(truncate=False)

print("Creating CSV for Submittal")


# Silly workaround for extracting an element from a dense or sparse vector. Probability column is a vector, with probs for each label
# https://stackoverflow.com/questions/39555864/how-to-access-element-of-a-vectorudt-column-in-a-spark-dataframe
def vector_item_(vector_column, index):
    try:
        return float(vector_column[index])
    except ValueError:
        return None


vector_item = F.udf(vector_item_, DoubleType())

df_submit = predictions.withColumn("Label_0",
                                   vector_item("probability", F.lit(0)))
df_submit = df_submit.withColumn("Label_1",
                                 vector_item("probability", F.lit(1)))
df_submit = df_submit.withColumn("HasDetections", df_submit.Label_1)
df_submit = df_submit.select("MachineIdentifier", "HasDetections")

# Yet another workaround to write to a CSV file
df_submit.coalesce(1).toPandas().to_csv(csv_path, header=True, index=False)

print("Total rows written to file: {0}".format(df_submit.count()))
Beispiel #27
nflSubset = nflDF[columns]
nflSubset = nflSubset.where((nflSubset['PlayType'] == 'Pass')
                            | (nflSubset['PlayType'] == 'Run'))

nflClean = nflSubset.dropna()

# COMMAND ----------

intColumns = [
    "down", "TimeSecs", "PlayTimeDiff", "yrdln", "yrdline100", "PosTeamScore",
    "DefTeamScore", "AbsScoreDiff"
]

for col in intColumns:
    nflClean = nflClean.withColumn(col, nflClean[col].cast(DoubleType()))

nflClean = nflClean.na.fill(0)

# COMMAND ----------

labelIndexer = StringIndexer(inputCol="PlayType",
                             outputCol="indexedLabel").fit(nflClean)

# Convert all categorical variables into factor-style indexes.
# All string values must be mapped to a numerical format; unlike R, you cannot create string "factor" levels directly.
PosTeamIndexer = StringIndexer(inputCol="posteam", outputCol="indexedPosTeam")
DefTeamIndexer = StringIndexer(inputCol="DefensiveTeam",
                               outputCol="indexedDefTeam")
HomeTeamIndexer = StringIndexer(inputCol="HomeTeam",
                                outputCol="indexedHomeTeam")
spark.sql("""
  CREATE TABLE customer_counts
  USING DELTA
  LOCATION '{}'
""".format(CustomerCountsPath))

# COMMAND ----------

from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType

inputSchema = StructType([
    StructField("InvoiceNo", IntegerType(), True),
    StructField("StockCode", StringType(), True),
    StructField("Description", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("InvoiceDate", StringType(), True),
    StructField("UnitPrice", DoubleType(), True),
    StructField("CustomerID", IntegerType(), True),
    StructField("Country", StringType(), True)
])

# COMMAND ----------

newDataPath = "/mnt/training/online_retail/outdoor-products/outdoor-products-small.csv"

spark.sql("DROP TABLE IF EXISTS new_customer_counts")
newDataDF = (spark.read.option("header",
                               "true").schema(inputSchema).csv(newDataPath))

(newDataDF.groupBy("CustomerID", "Country").count().withColumnRenamed(
    "count", "total_orders").write.saveAsTable("new_customer_counts"))
Beispiel #29
print('QUANTIDADE DE REGISTROS: {}'.format(df_clima.count()))
df_clima.printSchema()

# Read the Ra parameters CSV
print('EFETUANDO LEITURA DO CSV DE PARAMETROS DE Ra')
df_parametro_ra = spark.read.csv(
    'C:/projeto/TCC-PUPUNHA/datasets/parametroRa.csv', header=True, sep=';')
print(df_parametro_ra.columns)
print('QUANTIDADE DE REGISTROS: {}'.format(df_parametro_ra.count()))
df_parametro_ra.printSchema()

fields_list = ['prcp', 'temp', 'tmax', 'tmin']
print('ALTERANDO AS VARIAVEIS {} DO DATAFRAME DF_CLIMA PARA DOUBLE'.format(
    fields_list))
for name in fields_list:
    df_clima = df_clima.withColumn(name, df_clima[name].cast(DoubleType()))

print('ALTERANDO AS VARIAVEIS {} DO DATAFRAME DF_PARAMETRO_RA PARA DOUBLE'.
      format(df_parametro_ra.columns))
for name in df_parametro_ra.columns:
    df_parametro_ra = df_parametro_ra.withColumn(
        name, df_parametro_ra[name].cast(DoubleType()))

cidades_list = [
    'Pariquera-Açu', 'Barra do Turvo', 'Itariri', 'Cananéia',
    'Pedro de Toledo', 'Iporanga', 'Eldorado', 'Miracatu', 'Cajati',
    'Sete Barras', 'Juquiá', 'Jacupiranga', 'Ilha Comprida', 'Registro',
    'Iguape'
]
# select the cities present in cidades_list
print('SELECIONANDO CIDADES PRESENTES NO VALE DO RIBEIRA')
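# A minimal sketch of that selection step; the city column name 'municipio' is
# a guess, since the climate DataFrame's column names are not shown here:
# df_clima = df_clima.where(df_clima['municipio'].isin(cidades_list))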
Beispiel #30
 def __init__(self):
     """
     Each element in attributes stands for a piece of data that HoloClean
      needs to create:
         id : unique id for the dataset; it is used when
          registering and retrieving data
         Init : initial data loaded into the database from a file
          that the user provides
         C_clean : table with index of clean cells
         C_dk : table of indices that we don't know if they
          are noisy or clean
         C_dk_temp : table of indices that we don't know if they
          are noisy or clean based on the dcs
         C_dk_temp_null : table of indices that we don't know if they
          are noisy or clean based on the null cells
         T1_attributes:  attributes of the first tuple in dc grounding
         T2_attributes:  attributes of the second tuple in dc grounding
         Possible_values: table of all possible values for the
          do not know cells
         Observed_Possible_values_clean : table with the observed
          values for the clean cells
         Observed_Possible_values_dk : table with the observed
          values for the do not know cells
         C_clean_flat: table for the clean cells that are
          flatted on three columns (index, attribute, and value)
         C_dk_flat: table for the dk cells that are flatted
          on three columns (index, attribute, and value)
         Kij_lookup: table with the cardinality of the
          domain for each cell
         Init_join: self join of init table
         Map_schema: table with the schema of the Init table
         Init_flat_join: self join of C_clean_flat table
         Init_flat_join_dk: self join of C_dk_flat table
         Feature_id_map: table that maps each feature to a number
         Sources: table that maps each source to a number
         Sources_temp: temporary table for saving the sources
         Attribute_temp: temporary table for saving the attributes
         Dimensions_clean: table with the dimensions for the
          X tensor for training
         Dimensions_dk: table with the dimensions for the
          X tensor for learning
         Inferred_values: table with the inferred values
         Repaired_dataset: dataset table after we apply
          repairs to initial data
         Correct: table with the correct values for our dataset
         Correct_flat: table with the correct data that
          are flatted on three columns (index, attribute, and value)
         Feature: table with feature value for each random variable
          and assigned value
     """
     # holds casting information to cast from pyspark datatype to python types
     self.type_dict = {
         IntegerType().simpleString(): int,
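          # unicode is the Python 2 text type; on Python 3 this entry would map to str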
         StringType().simpleString(): unicode,
         DoubleType().simpleString(): float,
         LongType().simpleString(): int,
         FloatType().simpleString(): float
     }
     self.sql_type_dict = {
         IntegerType().simpleString(): 'INTEGER',
         StringType().simpleString(): 'VARCHAR(255)',
         DoubleType().simpleString(): 'DOUBLE PRECISION',
         LongType().simpleString(): 'BIGINT',
         FloatType().simpleString(): 'REAL'
     }
     self.attribute = {}
     self.schema = ""
     self.dataset_tables_specific_name = []
     self.dataset_id = self._id_generator()
     self.attributes = {
         'id': [],
         'Init': [],
         'C_clean': [],
         'C_dk': [],
         'C_dk_temp': [],
         'C_dk_temp_null': [],
         'T1_attributes': [],
         'T2_attributes': [],
         'Possible_values':
             StructType([
                 StructField("vid", IntegerType(), True),
                 StructField("tid", IntegerType(), False),
                 StructField("attr_name", StringType(), False),
                 StructField("attr_val", StringType(), False),
                 StructField("observed", IntegerType(), False),
                 StructField("domain_id", IntegerType(), True)
             ]),
         'Observed_Possible_values_clean': [],
         'Observed_Possible_values_dk': [],
         'C_clean_flat':
             StructType([
                 StructField("tid", IntegerType(), False),
                 StructField("attribute", StringType(), False),
                 StructField("value", StringType(), True)
             ]),
         'C_dk_flat':
             StructType([
                 StructField("tid", IntegerType(), False),
                 StructField("attribute", StringType(), False),
                 StructField("value", StringType(), True)
             ]),
         'Kij_lookup':
             StructType([
                 StructField("vid", IntegerType(), True),
                 StructField("tid", IntegerType(), False),
                 StructField("attr_name", StringType(), False),
                 StructField("k_ij", IntegerType(), False),
             ]),
         'Init_join': [],
         'Map_schema':
             StructType([
                 StructField("attr_id", IntegerType(), False),
                 StructField("attribute", StringType(), True)
             ]),
         'Init_flat_join_dk': [],
         'Init_flat_join': [],
         'Feature_id_map': StructType([
                 StructField("feature_ind", IntegerType(), True),
                 StructField("attribute", StringType(), False),
                 StructField("value", StringType(), False),
                 StructField("Type", StringType(), False),
             ]),
         'Sources': [],
         'Sources_temp': [],
         'Attribute_temp': [],
         'Dimensions_clean': [],
         'Dimensions_dk': [],
         'Inferred_values': StructType([
                 StructField("probability", DoubleType(), False),
                 StructField("vid", IntegerType(), False),
                 StructField("attr_name", StringType(), False),
                 StructField("attr_val", StringType(), False),
                 StructField("tid", IntegerType(), False),
                 StructField("domain_id", IntegerType(), False)
             ]),
         'Repaired_dataset': [],
         'Correct': [],
         'Correct_flat': [],
         'Feature':
             StructType([
                 StructField("vid", IntegerType(), False),
                 StructField("assigned_val", IntegerType(), False),
                 StructField("feature", IntegerType(), False),
                 StructField("count", IntegerType(), False)
             ])}
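

# A minimal sketch (assumed usage, not taken from the class above) of how the
# sql_type_dict mapping can turn a pyspark schema into a SQL DDL statement.
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

sql_type_dict = {
    IntegerType().simpleString(): 'INTEGER',
    StringType().simpleString(): 'VARCHAR(255)',
}

flat_schema = StructType([
    StructField("tid", IntegerType(), False),
    StructField("attribute", StringType(), False),
    StructField("value", StringType(), True),
])

columns = ", ".join(
    "{} {}".format(field.name, sql_type_dict[field.dataType.simpleString()])
    for field in flat_schema.fields)
print("CREATE TABLE C_clean_flat ({});".format(columns))
# prints: CREATE TABLE C_clean_flat (tid INTEGER, attribute VARCHAR(255), value VARCHAR(255));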