Example #1
                model_path = "hdfs://VM10-1-0-14:9000/classifier/"+model_name+str(i)
				updated_model = model_name+str(i)
				data_path = model_path+"/data/part-r*"
				metadata_path = model_path+"/metadata/part-00000"
				if(patherror(data_path) == False and patherror(metadata_path) == False):
					break
			
			#load model classifier
            model = NaiveBayesModel.load(sc, model_path)

			start = time.time()
            reviews_label = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)
			
			Words = Row('label', 'words')
			words = reviews.map(lambda r: Words(*r))
			words_df = spark.createDataFrame(words)
			
			#review tokenization
			token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+", inputCol="words", outputCol="token", toLowercase=True)
			token_filtered = token.transform(words_df)
			
			#stopwords elimination
			remover = StopWordsRemover(inputCol="token", outputCol="stopwords", caseSensitive=False)
			stopwords_filtered = remover.transform(token_filtered)

			prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])	

			#tf-idf calculation
			tf = HashingTF(numFeatures=numFeatures).transform(prep_filtered.map(porter_stem, preservesPartitioning=True))
			idf = IDF().fit(tf)
			tfidf = idf.transform(tf)
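
# The fragment above relies on helpers that are not shown here. A minimal
# sketch of porter_stem, assuming NLTK's PorterStemmer and a list of tokens
# per record:
from nltk.stem.porter import PorterStemmer

_stemmer = PorterStemmer()

def porter_stem(tokens):
    # Stem each filtered token before hashing into term-frequency vectors.
    return [_stemmer.stem(t) for t in tokens]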
Example #2
File: coco.py Project: eto-ai/rikai
def convert(
    spark: SparkSession,
    dataset_root: str,
    limit: int = 0,
    asset_dir: Optional[str] = None,
) -> DataFrame:
    """Convert a Coco Dataset into Rikai dataset.

    This function expects the COCO datasets are stored in directory with the
    following structure:

    - dataset
        - annotations
          - captions_train2017.json
          - instances_train2017.json
          - ...
        - train2017
        - val2017
        - test2017

    Parameters
    ----------
    spark : SparkSession
        A live spark session
    dataset_root : str
        The directory of dataset
    limit : int, optional
        The number of images of each split to be converted.
    asset_dir : str, optional
        The asset directory in which to store images; this can be an S3 directory.

    Returns
    -------
    DataFrame
        Returns a Spark DataFrame
    """
    train_json = os.path.join(dataset_root, "annotations",
                              "instances_train2017.json")
    val_json = os.path.join(dataset_root, "annotations",
                            "instances_val2017.json")

    categories = load_categories(train_json)

    examples = []
    for split, anno_file in zip(["train", "val"], [train_json, val_json]):
        coco = COCO(annotation_file=anno_file)
        # COCO has native dependencies, so we do not distribute them
        # to the workers.
        image_ids = coco.imgs
        if limit > 0:
            image_ids = islice(image_ids, limit)
        for image_id in image_ids:
            ann_id = coco.getAnnIds(imgIds=image_id)
            annotations = coco.loadAnns(ann_id)
            annos = []
            for ann in annotations:
                bbox = Box2d.from_top_left(*ann["bbox"])
                annos.append({
                    "category_id": ann["category_id"],
                    "category_text": categories[ann["category_id"]]["name"],
                    "bbox": bbox,
                    "area": float(ann["area"]),
                })
            image_payload = coco.loadImgs(ids=image_id)[0]
            example = {
                "image_id": image_id,
                "annotations": annos,
                "image": Image(
                    os.path.abspath(
                        os.path.join(
                            dataset_root,
                            "{}2017".format(split),
                            image_payload["file_name"],
                        )
                    )
                ),
                "split": split,
            }
            examples.append(example)

    schema = StructType([
        StructField("image_id", LongType(), False),
        StructField(
            "annotations",
            ArrayType(
                StructType([
                    StructField("category_id", IntegerType()),
                    StructField("category_text", StringType()),
                    StructField("area", FloatType()),
                    StructField("bbox", Box2dType()),
                ])),
            False,
        ),
        StructField("image", ImageType(), False),
        StructField("split", StringType(), False),
    ])
    df = spark.createDataFrame(examples, schema=schema)
    if asset_dir:
        asset_dir = asset_dir if asset_dir.endswith("/") else asset_dir + "/"
        print("ASSET DIR: ", asset_dir)
        df = df.withColumn("image", image_copy(col("image"), lit(asset_dir)))
    return df
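
# A usage sketch for convert(); it assumes a live SparkSession named `spark`,
# and the dataset_root/asset_dir values below are placeholders, not part of
# the original file.
coco_df = convert(spark, dataset_root="/data/coco", limit=100,
                  asset_dir="s3://my-bucket/rikai-assets/")
coco_df.printSchema()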
Example #3
class DeltaTableTests(PySparkTestCase):
    def setUp(self):
        super(DeltaTableTests, self).setUp()
        self.sqlContext = SQLContext(self.sc)
        self.spark = SparkSession(self.sc)
        self.tempPath = tempfile.mkdtemp()
        self.tempFile = os.path.join(self.tempPath, "tempFile")

    def tearDown(self):
        self.spark.stop()
        shutil.rmtree(self.tempPath)
        super(DeltaTableTests, self).tearDown()

    def test_forPath(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(dt, [('a', 1), ('b', 2), ('c', 3)])

    def test_alias_and_toDF(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(
            dt.alias("myTable").select('myTable.key', 'myTable.value'),
            [('a', 1), ('b', 2), ('c', 3)])

    def test_history(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        self.__overwriteDeltaTable([('a', 3), ('b', 2), ('c', 1)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        operations = dt.history().select('operation')
        self.__checkAnswer(
            operations, [Row("WRITE"), Row("WRITE")],
            StructType([StructField("operation", StringType(), True)]))

        lastMode = dt.history(1).select('operationParameters.mode')
        self.__checkAnswer(
            lastMode, [Row("Overwrite")],
            StructType(
                [StructField("operationParameters.mode", StringType(), True)]))

    def test_vacuum(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        self.__createFile('abc.txt', 'abcde')
        self.__createFile('bac.txt', 'abcdf')
        self.assertEqual(True, self.__checkFileExists('abc.txt'))
        dt.vacuum()  # will not delete files as default retention is used.

        self.assertEqual(True, self.__checkFileExists('bac.txt'))
        retentionConf = "spark.databricks.delta.retentionDurationCheck.enabled"
        self.spark.conf.set(retentionConf, "false")
        dt.vacuum(0.0)
        self.spark.conf.set(retentionConf, "true")
        self.assertEqual(False, self.__checkFileExists('bac.txt'))
        self.assertEqual(False, self.__checkFileExists('abc.txt'))

    def test_convertToDelta(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)],
                                        ["key", "value"])
        df.write.format("parquet").save(self.tempFile)
        self.tempFile2 = self.tempFile + "_"
        dt = DeltaTable.convertToDelta(self.spark,
                                       "parquet.`" + self.tempFile + "`")
        self.__checkAnswer(
            self.spark.read.format("delta").load(self.tempFile), [('a', 1),
                                                                  ('b', 2),
                                                                  ('c', 3)])

        # test if convertToDelta with partition columns works
        df.write.partitionBy("value").format("parquet").save(self.tempFile2)
        schema = StructType()
        schema.add("value", IntegerType(), True)
        dt = DeltaTable.convertToDelta(self.spark,
                                       "parquet.`" + self.tempFile2 + "`",
                                       schema)
        self.__checkAnswer(
            self.spark.read.format("delta").load(self.tempFile2), [('a', 1),
                                                                   ('b', 2),
                                                                   ('c', 3)])

    def __checkAnswer(self, df, expectedAnswer, schema=["key", "value"]):
        if not expectedAnswer:
            self.assertEqual(df.count(), 0)
            return
        expectedDF = self.spark.createDataFrame(expectedAnswer, schema)
        self.assertEqual(df.count(), expectedDF.count())
        self.assertEqual(len(df.columns), len(expectedDF.columns))
        self.assertEqual([], df.subtract(expectedDF).take(1))
        self.assertEqual([], expectedDF.subtract(df).take(1))

    def __writeDeltaTable(self, datalist):
        df = self.spark.createDataFrame(datalist, ["key", "value"])
        df.write.format("delta").save(self.tempFile)

    def __overwriteDeltaTable(self, datalist):
        df = self.spark.createDataFrame(datalist, ["key", "value"])
        df.write.format("delta").mode("overwrite").save(self.tempFile)

    def __createFile(self, fileName, content):
        with open(os.path.join(self.tempFile, fileName), 'w') as f:
            f.write(content)

    def __checkFileExists(self, fileName):
        return os.path.exists(os.path.join(self.tempFile, fileName))
Example #4
def getKeyValue(line):
    line = [int(ele) for ele in line]

    return (line[0], line[1:])


#item_user_mat=sc.textFile("/Users/sohinimitra/Documents/itemusermat").map(lambda x: x.split(" ")).map(lambda x: [[x[0],x[1:]] for y in x])
item_user_mat = sc.textFile("/Users/sohinimitra/Documents/itemusermat").map(
    lambda x: x.split(" "))
item_user_mat = item_user_mat.map(getKeyValue)

ratings = item_user_mat.map(lambda x: x[0]).zipWithIndex().map(lambda x:
                                                               (x[1], x[0]))

data = [(Vectors.dense(x[1]), ) for x in item_user_mat.collect()]
item_user_mat_df = spark.createDataFrame(data, ["features"])

kmeans = KMeans(k=10, seed=1)
model = kmeans.fit(item_user_mat_df)

transformed = model.transform(item_user_mat_df).select("features",
                                                       "prediction")
transformed_with_index = transformed.rdd.zipWithIndex()
rows = transformed_with_index.collect()
prediction_with_index = sc.parallelize(rows).map(lambda x:
                                                 (x[1], x[0].prediction))

ratingsPrediction = ratings.join(prediction_with_index).map(lambda x: x[1])

movie = sc.textFile("/Users/sohinimitra/Documents/movies.dat").map(
    lambda x: x.split("::")).map(lambda x: (int(x[0]), (x[1], x[2])))
Example #5
spark = SparkSession(sc)

#Load dataset file as RDD
rdd = sc.textFile("/user/spark/airfoil.txt")
rdd = rdd.map(lambda x: x.split('\t'))
rdd = rdd.map(lambda x: [
    float(x[0]),
    float(x[1]),
    float(x[2]),
    float(x[3]),
    float(x[4]),
    float(x[5])
])

#Create dataframe for ML model
df = spark.createDataFrame(
    rdd, ["frequency", "angle", "chord", "velocity", "suction", "pressure"])
data = df.rdd.map(lambda x: (DenseVector(x[:-1]), x[-1]))
df = spark.createDataFrame(data, ["features", "label"])

#Feature scaling
standardScaler = StandardScaler(inputCol="features",
                                outputCol="features_scaled")
scaler = standardScaler.fit(df)
scaled_df = scaler.transform(df)

#Split data into training and test
train_data, test_data = scaled_df.randomSplit([.7, .3], seed=1234)
train_data = train_data.select("features_scaled", "label")
test_data = test_data.select("features_scaled", "label")
train_data = train_data.withColumnRenamed("features_scaled", "features")
test_data = test_data.withColumnRenamed("features_scaled", "features")
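
# A possible next step (not part of the original snippet): fit a linear
# regression on the scaled features and report the test RMSE.
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

lr = LinearRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(train_data)
predictions = lr_model.transform(test_data)
rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                           metricName="rmse").evaluate(predictions)
print("Test RMSE:", rmse)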
Example #6
# In[116]:

from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col


def toCSV(_, records):
    # Assumes each record has the shape ((product, year), (total, companies, top_percent)).
    for x in records:
        (product, year), (total, companies, top_percent) = x
        if x[0][1] == 0:
            product = '"{}"'.format(product)
        yield ','.join(
            (product, str(year), str(total), str(companies), str(top_percent)))


rdd = sc.parallelize(countsPerNeighborhood)
df = spark.createDataFrame(rdd)
window_spec = Window.partitionBy(df[1][1]).orderBy(df[1][0].desc())

df = df.select('*', rank().over(window_spec).alias('rank')).filter(col('rank') <= 3)

# In[122]:

from pyspark.sql import functions as f
from pyspark.sql import types as t


def newCols(x):
    return names[x]


finaldf = f.udf(newCols, t.StringType())
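
# `names` is not defined in this fragment; f.udf wraps newCols into a UDF that
# looks values up in that external mapping. A hypothetical usage sketch:
# names = {1: "first", 2: "second", 3: "third"}   # placeholder mapping
# df = df.withColumn("rank_name", finaldf(col("rank")))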
Example #7
def test_multiple_join(
    spark: SparkSession,
    composite_entity_schema: StructType,
    customer_feature_schema: StructType,
    driver_feature_schema: StructType,
):

    entity_data = [
        (1001, 8001, datetime(year=2020, month=9, day=2)),
        (1001, 8002, datetime(year=2020, month=9, day=2)),
        (2001, 8002, datetime(year=2020, month=9, day=3)),
    ]
    entity_df = spark.createDataFrame(
        spark.sparkContext.parallelize(entity_data), composite_entity_schema)

    customer_table_data = [
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            100.0,
        ),
        (
            2001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            200.0,
        ),
    ]
    customer_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(customer_table_data),
        customer_feature_schema)
    customer_table = FeatureTable(
        name="transactions",
        features=[Field("daily_transactions", "double")],
        entities=[Field("customer_id", "int32")],
        max_age=86400,
    )
    customer_table_df = filter_feature_table_by_time_range(
        customer_table_df,
        customer_table,
        "event_timestamp",
        entity_df,
        "event_timestamp",
    )

    driver_table_data = [
        (
            8001,
            datetime(year=2020, month=8, day=31),
            datetime(year=2020, month=8, day=31),
            200,
        ),
        (
            8001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            300,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            600,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=2),
            500,
        ),
    ]
    driver_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(driver_table_data),
        driver_feature_schema)

    driver_table = FeatureTable(
        name="bookings",
        features=[Field("completed_bookings", "int32")],
        entities=[Field("driver_id", "int32")],
        max_age=7 * 86400,
    )
    driver_table_df = filter_feature_table_by_time_range(
        driver_table_df,
        driver_table,
        "event_timestamp",
        entity_df,
        "event_timestamp",
    )
    joined_df = join_entity_to_feature_tables(
        entity_df,
        "event_timestamp",
        [customer_table_df, driver_table_df],
        [customer_table, driver_table],
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__daily_transactions", FloatType()),
        StructField("bookings__completed_bookings", IntegerType()),
    ])

    expected_joined_data = [
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=2),
            100.0,
            300,
        ),
        (
            1001,
            8002,
            datetime(year=2020, month=9, day=2),
            100.0,
            500,
        ),
        (
            2001,
            8002,
            datetime(year=2020, month=9, day=3),
            None,
            500,
        ),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data),
        expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
Example #8
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql import SQLContext, SparkSession

sc = SparkContext("local", "Features - OneHotEncoder")
spark = SparkSession(sc)
        
df = spark.createDataFrame([
    (0, "a"),
    (1, "b"),
    (2, "c"),
    (3, "a"),
    (4, "a"),
    (5, "c")
], ["id", "category"])

stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = stringIndexer.fit(df)
indexed = model.transform(df)

encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
#encoder.setDropLast(False)
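# Note: this snippet assumes Spark 2.x, where OneHotEncoder is a Transformer.
# From Spark 3.0 onward OneHotEncoder is an Estimator, so the line below would
# become: encoded = encoder.fit(indexed).transform(indexed)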
encoded = encoder.transform(indexed)
encoded.show()

spark.stop()
Example #9
def test_join_with_composite_entity(
    spark: SparkSession,
    composite_entity_schema: StructType,
    rating_feature_schema: StructType,
):
    entity_data = [
        (1001, 8001, datetime(year=2020, month=9, day=1)),
        (1001, 8002, datetime(year=2020, month=9, day=3)),
        (1001, 8003, datetime(year=2020, month=9, day=1)),
        (2001, 8001, datetime(year=2020, month=9, day=2)),
    ]
    entity_df = spark.createDataFrame(
        spark.sparkContext.parallelize(entity_data), composite_entity_schema)

    feature_table_data = [
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            3.0,
            5.0,
        ),
        (
            1001,
            8002,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            4.0,
            3.0,
        ),
        (
            2001,
            8001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            4.0,
            4.5,
        ),
    ]
    feature_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(feature_table_data),
        rating_feature_schema,
    )
    feature_table = FeatureTable(
        name="ratings",
        features=[
            Field("customer_rating", "double"),
            Field("driver_rating", "double")
        ],
        entities=[Field("customer_id", "int32"),
                  Field("driver_id", "int32")],
        max_age=86400,
    )
    feature_table_df = filter_feature_table_by_time_range(
        feature_table_df,
        feature_table,
        "event_timestamp",
        entity_df,
        "event_timestamp",
    )
    joined_df = as_of_join(
        entity_df,
        "event_timestamp",
        feature_table_df,
        feature_table,
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("ratings__customer_rating", FloatType()),
        StructField("ratings__driver_rating", FloatType()),
    ])
    expected_joined_data = [
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=1),
            3.0,
            5.0,
        ),
        (1001, 8002, datetime(year=2020, month=9, day=3), None, None),
        (1001, 8003, datetime(year=2020, month=9, day=1), None, None),
        (
            2001,
            8001,
            datetime(year=2020, month=9, day=2),
            4.0,
            4.5,
        ),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data),
        expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
Example #10
def test_select_subset_of_columns_as_entity_primary_keys(
    spark: SparkSession,
    composite_entity_schema: StructType,
    customer_feature_schema: StructType,
):
    entity_data = [
        (1001, 8001, datetime(year=2020, month=9, day=2)),
        (2001, 8002, datetime(year=2020, month=9, day=2)),
    ]
    entity_df = spark.createDataFrame(
        spark.sparkContext.parallelize(entity_data), composite_entity_schema)

    feature_table_data = [
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=2),
            100.0,
        ),
        (
            2001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            400.0,
        ),
    ]
    feature_table_df = spark.createDataFrame(
        spark.sparkContext.parallelize(feature_table_data),
        customer_feature_schema)
    feature_table = FeatureTable(
        name="transactions",
        features=[Field("daily_transactions", "double")],
        entities=[Field("customer_id", "int32")],
        max_age=86400,
    )
    feature_table_df = filter_feature_table_by_time_range(
        feature_table_df,
        feature_table,
        "event_timestamp",
        entity_df,
        "event_timestamp",
    )
    joined_df = as_of_join(
        entity_df,
        "event_timestamp",
        feature_table_df,
        feature_table,
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__daily_transactions", FloatType()),
    ])
    expected_joined_data = [
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=2),
            100.0,
        ),
        (
            2001,
            8002,
            datetime(year=2020, month=9, day=2),
            400.0,
        ),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data),
        expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
Example #11
                             encoding='gb18030')
        yjjd = yjjd.join(app, yjjd.realAppID == app.app_id,
                         how='inner').select('sn', 'realAppID', 'ch_name',
                                             'group_ch')
        yjjd = yjjd.distinct().dropna()
        yjjd_count = yjjd.count()
        shipin = yjjd.filter(yjjd.group_ch == '影音试听').count()
        gouwu = yjjd.filter(yjjd.group_ch == '网络购物').count()
        youxi = yjjd.filter(yjjd.group_ch == '网络游戏').count()
        ddd = [{
            'sn': '%s' % n,
            'qingxu': 0 if yjjd_count == 0 else
                      (yjjd_count - shipin - gouwu - youxi) / yjjd_count
        }]
        ddd = spark.createDataFrame(ddd).select(
            'sn',
            bround('qingxu', 2).alias('qingxu'))
        ddd = ddd.withColumnRenamed('sn', 'sn1')
        df = df.join(ddd, df.sn == ddd.sn1, how='inner')
        df = df.select('time', 'sn', 'count', 'avg_ht', 'avg_bi', 'std_ht',
                       'std_bi', 'qingxu')
        df = df.repartition(1)
        df.write.csv('/user/maxnet/ian/corp_index_1/%s_%s' % (n, i),
                     header=True,
                     compression='gzip',
                     mode='overwrite')
    except:
        pass
Example #12
def test_auto_mapper_concat_multiple_items_structs_different_elements_with_schema(
    spark_session: SparkSession, ) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, None, "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df: DataFrame = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    schema: StructType = StructType([
        StructField("id", StringType(), True),
        StructField("c", StringType(), True),
        StructField("b", StringType(), True),
    ])

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=AutoMapperList(
        [
            AutoMapperDataTypeComplexBase(id_=A.column("first_name"),
                                          b=A.column("last_name")),
        ],
        children_schema=schema,
    ).concat(
        AutoMapperList(
            [
                AutoMapperDataTypeComplexBase(id_=A.column("first_name"),
                                              c=A.column("last_name")),
            ],
            children_schema=schema,
        )))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    struct1 = struct(
        col("b.first_name").alias("id"),
        lit(None).alias("c"),
        col("b.last_name").alias("b"),
    )
    struct2 = struct(
        col("b.first_name").alias("id"),
        col("b.last_name").alias("c"),
        lit(None).alias("b"),
    )
    array1 = when(
        array(struct1).isNotNull(),
        filter(coalesce(array(struct1), array()), lambda x: x.isNotNull()),
    )
    array2 = when(
        array(struct2).isNotNull(),
        filter(coalesce(array(struct2), array()), lambda x: x.isNotNull()),
    )
    assert_compare_expressions(sql_expressions["dst2"],
                               concat(array1, array2).alias("dst2"))
    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            [0] == "Imran")
    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            [2] == "Qureshi")
    assert (result_df.where("member_id == 2").select("dst2").collect()[0][0][0]
            [0] == "Michael")
    assert (
        result_df.where("member_id == 2").select("dst2").collect()[0][0][0][1]
        is None)
Example #13
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark import SQLContext
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import col, udf, lag, date_add, explode, lit, concat, unix_timestamp, sum, abs
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

sc = SparkContext(appName="MyFirstApp4_Task_task2")
spark = SparkSession(sc)


df_node18 = spark.read.format("parquet").load(path="hdfs://namenode:9000/example4/test.parquet")
model_node21 = CrossValidatorModel.load("hdfs://namenode:9000/example4/model_2/")
model_node19 = PipelineModel.load("hdfs://namenode:9000/example4/model_1/")
df_node20 = model_node19.transform(df_node18)
df_node22 = model_node21.transform(df_node20)

evaluator_node23 = MulticlassClassificationEvaluator(labelCol="indexedSurvived", predictionCol="prediction", metricName="accuracy")
score_node23 = evaluator_node23.evaluate(df_node22)
df_node23 = spark.createDataFrame([(score_node23,)], ["score"])

df_node23.write.format("csv").save(path="hdfs://namenode:9000/example4/EvalResult3.csv")
Example #14
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StringIndexer

sqlContext = SQLContext(sc)
df = spark.read.csv('file:////home/ubuntu/ys-180326/Dataset75.csv',
                    header=True)
data = df.rdd.map(list)
print(data.first())

score = data.map(lambda s: 1.0
                 if s[1].isdigit() and float(s[1]) == 1.0 else 0.0)
comment = data.map(lambda s: s[3])
split_neg_data2 = score.zip(comment)
tranform_data = split_neg_data2.map(
    lambda p: (p[0], p[1]))  #.toDF()#.withColumnRenamed('_1','label')
#tranform_data.show()
#sentenceData = spark.createDataFrame([(0, "I heard about Spark and I love Spark"),(0, "I wish Java could use case classes"),(1, "Logistic regression models are neat")]).toDF("label", "sentence")

sentenceData = spark.createDataFrame(tranform_data, ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# compute TF-IDF
hashingTF = HashingTF(inputCol="words",
                      outputCol="rawFeatures",
                      numFeatures=3000)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("label", "features").show()
forData = StringIndexer().setInputCol("label").setOutputCol("indexed").fit(
    rescaledData).transform(rescaledData)
(trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0)
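
# A hedged continuation (not in the original): train and evaluate a classifier
# on the TF-IDF features; NaiveBayes is chosen here only as an example.
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

nb = NaiveBayes(featuresCol="features", labelCol="indexed")
nb_model = nb.fit(trainingData)
predictions = nb_model.transform(testData)
accuracy = MulticlassClassificationEvaluator(
    labelCol="indexed", predictionCol="prediction",
    metricName="accuracy").evaluate(predictions)
print("accuracy:", accuracy)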
Example #15
def test_historical_feature_retrieval_with_mapping(spark: SparkSession):
    test_data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")
    entity_source = {
        "file": {
            "format": {
                "json_class": "CSVFormat"
            },
            "path":
            f"file://{path.join(test_data_dir,  'column_mapping_test_entity.csv')}",
            "event_timestamp_column": "event_timestamp",
            "field_mapping": {
                "customer_id": "id"
            },
            "options": {
                "inferSchema": "true",
                "header": "true"
            },
        }
    }
    booking_source = {
        "file": {
            "format": {
                "json_class": "CSVFormat"
            },
            "path":
            f"file://{path.join(test_data_dir,  'column_mapping_test_feature.csv')}",
            "event_timestamp_column": "datetime",
            "created_timestamp_column": "created_datetime",
            "options": {
                "inferSchema": "true",
                "header": "true"
            },
        }
    }
    booking_table = {
        "name": "bookings",
        "entities": [{
            "name": "customer_id",
            "type": "int32"
        }],
        "features": [{
            "name": "total_bookings",
            "type": "int32"
        }],
        "max_age": 86400,
    }

    joined_df = retrieve_historical_features(
        spark,
        entity_source,
        [booking_source],
        [booking_table],
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("bookings__total_bookings", IntegerType()),
    ])

    expected_joined_data = [
        (1001, datetime(year=2020, month=9, day=2), 200),
        (1001, datetime(year=2020, month=9, day=3), 200),
        (2001, datetime(year=2020, month=9, day=4), 600),
        (2001, datetime(year=2020, month=9, day=4), 600),
        (3001, datetime(year=2020, month=9, day=4), 700),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data),
        expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
Example #16
                    default="predictions")

args = parser.parse_args()
print("args:", args)

print("{0} ===== Start".format(datetime.now().isoformat()))

if args.format == "tfr":
    df = dfutil.loadTFRecords(sc, args.images)
elif args.format == "csv":
    images = sc.textFile(
        args.images).map(lambda ln: [int(x) for x in ln.split(',')])
    labels = sc.textFile(
        args.labels).map(lambda ln: [int(float(x)) for x in ln.split(',')])
    dataRDD = images.zip(labels)
    df = spark.createDataFrame(dataRDD, ['image', 'label'])
else:
    raise Exception("Unsupported format: {}".format(args.format))

# Pipeline API

if args.train:
    # train a model using Spark Estimator fitted to a DataFrame
    print("{0} ===== Estimator.fit()".format(datetime.now().isoformat()))
    # dummy tf args (from imagenet/inception example)
    tf_args = {
        'initial_learning_rate': 0.045,
        'num_epochs_per_decay': 2.0,
        'learning_rate_decay_factor': 0.94
    }
    estimator = TFEstimator(mnist_dist_pipeline.map_fun, args, export_fn=mnist_dist_pipeline.export_fun) \
Example #17
def test_large_historical_feature_retrieval(spark: SparkSession,
                                            large_entity_csv_file: str,
                                            large_feature_csv_file: str):
    nr_rows = 1000
    start_datetime = datetime(year=2020, month=8, day=31)
    expected_join_data = [(1000 + i, start_datetime + timedelta(days=i),
                           i * 10) for i in range(nr_rows)]
    expected_join_data_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("feature__total_bookings", IntegerType()),
    ])

    expected_join_data_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_join_data),
        expected_join_data_schema)

    entity_source = {
        "file": {
            "format": {
                "json_class": "CSVFormat"
            },
            "path": f"file://{large_entity_csv_file}",
            "event_timestamp_column": "event_timestamp",
            "field_mapping": {
                "customer_id": "id"
            },
            "options": {
                "inferSchema": "true",
                "header": "true"
            },
        }
    }
    feature_source = {
        "file": {
            "format": {
                "json_class": "CSVFormat"
            },
            "path": f"file://{large_feature_csv_file}",
            "event_timestamp_column": "event_timestamp",
            "created_timestamp_column": "created_timestamp",
            "options": {
                "inferSchema": "true",
                "header": "true"
            },
        }
    }
    feature_table = {
        "name": "feature",
        "entities": [{
            "name": "customer_id",
            "type": "int32"
        }],
        "features": [{
            "name": "total_bookings",
            "type": "int32"
        }],
        "max_age": 86400,
    }

    joined_df = retrieve_historical_features(spark, entity_source,
                                             [feature_source], [feature_table])
    assert_dataframe_equal(joined_df, expected_join_data_df)
Example #18
            LRModel.save("GoogleStockModel")
            return print("\nModel saved successfully!")
        elif X.lower() == "n":
            return print("\nModel not saved!")
    except:
        print("Invalid Input! Try Again!")


try:
    print('\nStarting PySpark...')
    pdDataFrame = dataCleaner.df
    sc = SparkContext()
    sparkSession = SparkSession(sc)

    print('\nConverting Pandas DataFrame to PySpark DataFrame:')
    stockData = sparkSession.createDataFrame(pdDataFrame)
    print(stockData)

    print('\nPrinting Schema of PySpark DataFrame:')
    print(stockData.printSchema())

    print("\nPerforming Descriptive Analytics Operations:")
    print(stockData.describe().toPandas().transpose())

    print("\nSeperating the Open, High and Low:")
    featureAssembler = VectorAssembler(inputCols=["Open", "High", "Low"], outputCol="Features")
    output = featureAssembler.transform(stockData)
    print(output.show())

    print("\nChecking the Vectorized Feature:")
    print(output.select("Features").show())
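
    # A possible continuation (not part of the original snippet): fit a linear
    # regression on the assembled features. The "Close" label column is an
    # assumption about the cleaned Google stock data.
    from pyspark.ml.regression import LinearRegression

    finalData = output.select("Features", "Close")
    trainData, testData = finalData.randomSplit([0.7, 0.3])
    lrModel = LinearRegression(featuresCol="Features", labelCol="Close").fit(trainData)
    print("\nRMSE on test data:", lrModel.evaluate(testData).rootMeanSquaredError)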
Example #19
def test_implicit_type_conversion(spark: SparkSession, ):
    test_data_dir = path.join(pathlib.Path(__file__).parent.absolute(), "data")
    entity_source = {
        "file": {
            "format": {
                "json_class": "CSVFormat"
            },
            "path":
            f"file://{path.join(test_data_dir,  'single_customer.csv')}",
            "event_timestamp_column": "event_timestamp",
            "options": {
                "inferSchema": "true",
                "header": "true"
            },
        }
    }
    transaction_source = {
        "file": {
            "format": {
                "json_class": "CSVFormat"
            },
            "path": f"file://{path.join(test_data_dir,  'transactions.csv')}",
            "event_timestamp_column": "event_timestamp",
            "created_timestamp_column": "created_timestamp",
            "options": {
                "inferSchema": "true",
                "header": "true"
            },
        }
    }
    transaction_table = {
        "name": "transactions",
        "entities": [{
            "name": "customer_id",
            "type": "int32"
        }],
        "features": [{
            "name": "daily_transactions",
            "type": "float"
        }],
        "max_age": 86400,
    }

    joined_df = retrieve_historical_features(
        spark,
        entity_source,
        [transaction_source],
        [transaction_table],
    )

    expected_joined_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__daily_transactions", FloatType()),
    ])

    expected_joined_data = [
        (
            1001,
            datetime(year=2020, month=9, day=2),
            100.0,
        ),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_data),
        expected_joined_schema)

    assert_dataframe_equal(joined_df, expected_joined_df)
Example #20
def test_automapper_nested_array_filter_with_parent_column(
    spark_session: SparkSession,
) -> None:
    schema = StructType(
        [
            StructField("row_id", dataType=IntegerType(), nullable=False),
            StructField(
                "location",
                dataType=ArrayType(
                    StructType(
                        [
                            StructField("name", StringType(), True),
                        ]
                    )
                ),
            ),
            StructField(
                "schedule",
                dataType=ArrayType(
                    StructType(
                        [
                            StructField("name", StringType(), True),
                            StructField(
                                "actor",
                                ArrayType(
                                    StructType(
                                        [StructField("reference", StringType(), True)]
                                    ),
                                    True,
                                ),
                            ),
                        ]
                    )
                ),
            ),
            StructField(
                "single_level",
                dataType=ArrayType(
                    StructType(
                        [
                            StructField("reference", StringType(), True),
                        ]
                    )
                ),
            ),
        ]
    )
    spark_session.createDataFrame(
        [
            (
                1,
                [{"name": "location-100"}, {"name": "location-200"}],
                [
                    {
                        "name": "schedule-1",
                        "actor": [
                            {"reference": "location-100"},
                            {"reference": "practitioner-role-100"},
                        ],
                    },
                    {
                        "name": "schedule-2",
                        "actor": [
                            {"reference": "location-200"},
                            {"reference": "practitioner-role-200"},
                        ],
                    },
                ],
                [{"reference": "location-100"}, {"reference": "location-200"}],
            )
        ],
        schema,
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    mapper = AutoMapper(
        view="schedule", source_view="patients", keys=["row_id"]
    ).columns(
        location=A.column("location").select(
            AutoMapperElasticSearchLocation(
                name=A.field("name"),
                scheduling=A.nested_array_filter(
                    array_field=A.column("schedule"),
                    inner_array_field=A.field("actor"),
                    match_property="reference",
                    match_value=A.field("{parent}.name"),
                ).select_one(AutoMapperElasticSearchSchedule(name=A.field("name"))),
            )
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    print("------COLUMN SPECS------")
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    assert_compare_expressions(
        sql_expressions["location"],
        transform(
            col("b.location"),
            lambda l: (
                struct(
                    l["name"].alias("name"),
                    transform(
                        filter(
                            col("b.schedule"),
                            lambda s: exists(
                                s["actor"],
                                lambda a: a["reference"] == l["name"],  # type: ignore
                            ),
                        ),
                        lambda s: struct(s["name"].alias("name")),
                    )[0].alias("scheduling"),
                )
            ),
        ).alias("___location"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    # result_df.printSchema()
    # result_df.show(truncate=False)
    location_row = result_df.collect()[0].location
    for index, location in enumerate(location_row):
        location_name = location.name
        location_scheduling = location.scheduling
        assert location_name == f"location-{index + 1}00"
        assert len(location_scheduling) == 1
        assert location_scheduling.name == f"schedule-{index + 1}"
Example #21
def empty_integer_df(spark_session: SparkSession):
    return spark_session.createDataFrame([],
                                         schema=single_integer_column_schema)
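
# single_integer_column_schema is defined elsewhere in the test module; a
# plausible definition (the field name "value" is an assumption) would be:
from pyspark.sql.types import IntegerType, StructField, StructType

single_integer_column_schema = StructType([StructField("value", IntegerType())])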
Example #22
  parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")

  args = parser.parse_args()
  print("args:", args)

  if args.format == 'tfr':
    # load TFRecords as a DataFrame
    df = dfutil.loadTFRecords(sc, args.images_labels)
  else:  # args.format == 'csv':
    # create RDD of input data
    def parse(ln):
      vec = [int(x) for x in ln.split(',')]
      return (vec[1:], vec[0])

    images_labels = sc.textFile(args.images_labels).map(parse)
    df = spark.createDataFrame(images_labels, ['image', 'label'])

  df.show()

  if args.mode == 'train':
    estimator = TFEstimator(main_fun, args) \
        .setInputMapping({'image': 'image', 'label': 'label'}) \
        .setModelDir(args.model_dir) \
        .setExportDir(args.export_dir) \
        .setClusterSize(args.cluster_size) \
        .setTensorboard(args.tensorboard) \
        .setEpochs(args.epochs) \
        .setBatchSize(args.batch_size) \
        .setGraceSecs(60)
    model = estimator.fit(df)
  else:  # args.mode == 'inference':
Example #23
    user_index = create_user_index(ui_mat_rdd)
    doc_index = create_doc_index(ui_mat_rdd)
    b_uidx = sc.broadcast(user_index)
    b_didx = sc.broadcast(doc_index)

    # Python 3: tuple-unpacking lambdas are not supported, so index the tuples directly.
    ui_mat_rdd = ui_mat_rdd.map(
        lambda r: (b_uidx.value[r[0]], b_didx.value[r[1]], r[2]))
    num_users = ui_mat_rdd.map(lambda r: r[0]) \
        .distinct() \
        .count()
    num_movies = ui_mat_rdd.map(lambda r: r[1]) \
        .distinct() \
        .count()
    print('users:', num_users, 'products:', num_movies)

    df = spark.createDataFrame(ui_mat_rdd, ['userId', 'movieId', 'value'])

    ui_mat_rdd.unpersist()

    print('Splitting data set...')
    df = df.orderBy(F.rand())

    train_df, test_df = df.randomSplit([0.9, 0.1], seed=45)
    train_df, val_df = train_df.randomSplit([0.95, 0.05], seed=45)

    train_df = train_df.withColumn('flag', F.lit(0))
    val_df = val_df.withColumn('flag', F.lit(1))
    val_df = val_df.union(train_df)
    test_df = test_df.withColumn('flag', F.lit(2))
    test_df = test_df.union(train_df)
    test_df = test_df.union(val_df)
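
    # A hedged next step (not in the original): the userId/movieId/value layout
    # suggests collaborative filtering; a minimal ALS sketch:
    from pyspark.ml.recommendation import ALS

    als = ALS(userCol='userId', itemCol='movieId', ratingCol='value',
              coldStartStrategy='drop', seed=45)
    als_model = als.fit(train_df.select('userId', 'movieId', 'value'))
    predictions = als_model.transform(test_df.select('userId', 'movieId', 'value'))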
Example #24
    spark_session = SparkSession(sc)
    connector = pshc.PSHC(sc, sqlContext)

    catelog = {
        "table": {
            "namespace": "default",
            "name": "hb_text"
        },
        "rowkey": "id",
        "columns": {
            "id": {
                "cf": "rowkey",
                "col": "key",
                "type": "string"
            },
            "text_file": {
                "cf": "data",
                "col": "text_file",
                "type": "string"
            },
        }
    }

    df = connector.get_df_from_hbase(catelog, repartition_num=1000).rdd.cache()
    print('======load file count=====', df.count())
    result_rdd = df.mapPartitions(lambda x: save_to_hive(x))
    result_df = spark_session.createDataFrame(result_rdd,
                                              ['id', 'text', 'index'])
    result_df.write.saveAsTable('abc.hb_text_oss_file', mode='overwrite')
    #print('======count=====', spark_session.table('abc.hb_text_oss_file').count())
Example #25
def get_embryo_data(spark: SparkSession):
    r = requests.get("")
    df = spark.createDataFrame([json.loads(line) for line in r.iter_lines()])
    return df
Example #26
def get_table_snapshot(spark: SparkSession,
                       s3_bucket: str,
                       table: str,
                       source: dict,
                       processing_date: str,
                       date_partition: bool = False) -> DataFrame:
    """
    :param spark: existing Spark session
    :param s3_bucket: s3 bucket name
    :param table: name of the table to be created in the catalog
    :param source: dictionary with cdc source settings
    :param processing_date: string with date to generate snapshot
    :param date_partition: specify if the output should be a date partition
    :returns DataFrame:
    """

    processing_date = parse_date(processing_date).date()
    if processing_date < source["cdc_start_date"]:
        raise ValueError(
            "processing_date must be after the source cdc_start_date")

    last_date = None
    table_exists = catalog_table_exists(spark, table)
    if table_exists:
        if date_partition:
            last_date = parse_date(
                get_table_partitions(spark, table)[-1].split("=")[1]).date()
        else:
            last_date = parse_date(get_current_version(spark, table)).date()
        if processing_date < last_date:
            raise ValueError("processing_date must be after last_partition")

    # Define a new schema with DMS specific columns from the source schema
    updates_schema = T.StructType().add("Op",
                                        "string").add("cdc_timestamp",
                                                      "string")
    for column in source["schema"]:
        updates_schema.add(column)

    # Clean the temporary updates directory
    spark.createDataFrame([], updates_schema) \
        .withColumn("file_number", F.lit(0)) \
        .withColumn("increasing_id", F.monotonically_increasing_id()) \
        .write.mode("overwrite").parquet(f"s3://{s3_bucket}/tmp/{table}_updates/")

    # Index update files into a temp folder to avoid losing the
    # order to the records when Spark partitions the DataFrame
    def index_update_files(file_number: int,
                           file_path: str,
                           is_full_load: bool = False):
        df = spark.read.schema(updates_schema).parquet(file_path) \
            .withColumn("increasing_id", F.monotonically_increasing_id()) \
            .withColumn("file_number", F.lit(file_number))
        if is_full_load:
            df = df.withColumn(
                "cdc_timestamp",
                F.lit(f"{str(processing_date)} 00:00:00.000000"))
        df.write.mode("append").parquet(
            f"s3://{s3_bucket}/tmp/{table}_updates/")

    # Index the initial full load files if the processing_date is equal to the CDC start date,
    # or the table does not exist.
    # Note: the cdc_timestamp is reset to avoid having CDC updates files with earlier timestamps
    if processing_date == source["cdc_start_date"] or not table_exists:
        full_load_files = get_cdc_files(processing_date,
                                        source["path"],
                                        full_load=True)
        run_multi_threaded_map(
            mapped_function=lambda args: index_update_files(*args),
            args_mapped_function=[
                (number, path, True)
                for number, path in enumerate(full_load_files)
            ],
            thread_number=8)

    # Index the CDC update files between the last partition
    # created or the CDC start date and the processing_date.
    updates_files = get_cdc_files(processing_date, source["path"], last_date
                                  or source["cdc_start_date"])
    run_multi_threaded_map(
        mapped_function=lambda args: index_update_files(*args),
        args_mapped_function=[(number, path)
                              for number, path in enumerate(updates_files)],
        thread_number=64)

    # Generate a new snapshot from the indexed files
    window = Window.partitionBy(*source["primary_keys"]) \
        .orderBy("cdc_timestamp", "file_number", "increasing_id")
    index_window = Window.partitionBy(*source["primary_keys"]) \
        .orderBy(F.col("ordered_index").desc())
    new_snapshot = spark.read.parquet(f"s3://{s3_bucket}/tmp/{table}_updates/") \
        .withColumn("ordered_index", F.row_number().over(window)) \
        .filter(F.to_date(F.col("cdc_timestamp")) <= str(processing_date)) \
        .filter(F.to_date(F.col("cdc_timestamp")) >= str(last_date or source["cdc_start_date"])) \
        .withColumn("row_number", F.row_number().over(index_window)) \
        .filter(F.col("row_number") == 1)
    if date_partition:
        new_snapshot = new_snapshot.withColumn("dt", F.to_date(F.col(source['partition_date']))) \
            .filter(F.col("dt") == str(processing_date))

    # Generate an old_snapshot from the partitions that have to be updated
    # or get an empty DataFrame if the table does not exist
    final_snapshot_schema = T.StructType()
    for column in source["schema"]:
        final_snapshot_schema.add(column)
    if date_partition:
        final_snapshot_schema.add("dt", "date")
    old_snapshot = spark.createDataFrame([], final_snapshot_schema)
    if table_exists:
        if date_partition:
            old_snapshot = spark.read.table(table).filter(
                F.col("dt") == str(processing_date))
        else:
            last_table_location = f"s3://{s3_bucket}/{table}/version={str(last_date)}/"
            old_snapshot = load_dataframe(spark, last_table_location,
                                          source["schema"])

    # Merge both old and new snapshots to create a new snapshot
    # of the partition to be overwritten into the final table
    conditions = [
        old_snapshot[name] == new_snapshot[name]
        for name in source["primary_keys"]
    ]
    fields = map(
        lambda field: F.coalesce(new_snapshot[field.name], old_snapshot[
            field.name]).alias(field.name), final_snapshot_schema.fields)
    final_snapshot = old_snapshot.join(new_snapshot, conditions, how="outer") \
        .filter(new_snapshot.Op.isNull() | (new_snapshot.Op != 'D')) \
        .select(*fields)
    if date_partition:
        final_snapshot = final_snapshot.repartition("dt")
    return final_snapshot
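
# A usage sketch for get_table_snapshot(); it assumes a live SparkSession named
# `spark`, the source dict keys mirror the fields the function reads, and every
# value below is a placeholder.
cdc_source = {
    "cdc_start_date": parse_date("2021-01-01").date(),
    "path": "s3://my-bucket/cdc/orders/",
    "schema": [T.StructField("order_id", T.LongType()),
               T.StructField("status", T.StringType()),
               T.StructField("updated_at", T.TimestampType())],
    "primary_keys": ["order_id"],
    "partition_date": "updated_at",
}
snapshot_df = get_table_snapshot(spark, s3_bucket="my-bucket", table="orders",
                                 source=cdc_source, processing_date="2021-06-01",
                                 date_partition=True)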
Example #27
class SCDHTest(testBase):

    def setUp(self):
        StockCustReturnByPrdInd.logLevel = 'debug'
        self.scdh = StockCustReturnByPrdInd(None)
        os.environ['SPARK_HOME'] = "/usr/local/Cellar/apache-spark/2.2.0/libexec"
        sys.path.append("/usr/local/Cellar/apache-spark/2.2.0/libexec/python")
        conf = SparkConf().setMaster("local").setAppName("hello")
        self.spark = SparkSession(SparkContext(conf=conf))

    def tearDown(self):
        self.spark.stop()

    def test_local_spark(self):
        doc = self.spark.createDataFrame([['a', 'b', 'c'], ['b', 'd', 'd']])
        print(doc.show())
        print("successful!")

    def test_get_base_data(self):
        self.scdh._get_base_data("2017-03-16", "2017-03-18", 1, 5)

    def test_init_data(self):
        self.scdh.init_data()

    def test_daily_compute(self):
        self.scdh.daily_compute("2017-03-16", "2017-03-16")

    def test_check_1(self):
        sql = """
            SELECT * from adatatest.stock_cust_daily_return
            where short_return_rate>1 or long_return_rate>1 or total_return_rate>1
        """
        self.spark.sql(sql)

    def test_travel_row(self):
        # """
        # stock_cust_return_by_prd_ind.prd_ind	unknown
        # stock_cust_return_by_prd_ind.return	-44623.789999999964
        # stock_cust_return_by_prd_ind.return_rate	-0.006018969744297111
        # stock_cust_return_by_prd_ind.trade_id	12466
        # stock_cust_return_by_prd_ind.return_ratio	0.4610100448676952
        # stock_cust_return_by_prd_ind.return_rank	2
        # stock_cust_return_by_prd_ind.return_rate_rank	1
        # stock_cust_return_by_prd_ind.busi_date	2017-03-23
        # stock_cust_return_by_prd_ind.compute	7
        # """
        # spark.sql("""
        #   select  trade_id,prd_ind,collect_list(detail_item) detail_list from (
        #     select trade_id,trim(prd_ind) prd_ind,
        #            (str_to_map(concat(
        #                 'pre_mkt_val:',pre_mkt_val,
        #                 ',now_mkt_val:',now_mkt_val,
        #                 ',pos_cash_flow:',pos_cash_flow,
        #                 ',neg_cash_flow:',pos_cash_flow,
        #                 ',exception_label:',exception_label,
        #                 ',trd_type:',trd_type,
        #                 ',return:',return,
        #                 ',busi_date:',busi_date),",",":")) detail_item
        #     from adatatest.stock_cust_daily_holding
        #     where  busi_date<='2017-03-23' and trade_id='12466' and prd_ind='unknown'
        #   ) a
        #   GROUP  by trade_id,prd_ind
        # """)
        r = Row(trade_id=u'12466', prd_ind=u'unknown', detail_list=[
            {u'return': u'-13008.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'1263402.0', u'now_mkt_val': u'1250394.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'6344.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'135176.0', u'now_mkt_val': u'141520.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-12803.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'1308384.0', u'now_mkt_val': u'1295581.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-4.229999999999563', u'trd_type': u'long_related',
             u'pos_cash_flow': u'16940.23', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'16936.0', u'busi_date': u'2017-03-23',
             u'neg_cash_flow': u'16940.23'},
            {u'return': u'1612.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'208052.0', u'now_mkt_val': u'209664.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'35466.53', u'now_mkt_val': u'18526.3',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'4730.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'679400.0', u'now_mkt_val': u'684130.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-1662.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'271183.0', u'now_mkt_val': u'269521.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-693.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'-130207.0', u'now_mkt_val': u'-130900.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-21138.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'1284540.0', u'now_mkt_val': u'1263402.0',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'2079.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'-132286.0', u'now_mkt_val': u'-130207.0',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'6771.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'128405.0', u'now_mkt_val': u'135176.0',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'306163.19', u'now_mkt_val': u'35466.53',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'-12470.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'691870.0', u'now_mkt_val': u'679400.0',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'-122.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'128527.0', u'now_mkt_val': u'128405.0',
             u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'11.429999999999836', u'trd_type': u'long_related',
             u'pos_cash_flow': u'2273.57', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'2285.0', u'busi_date': u'2017-03-21', u'neg_cash_flow': u'2273.57'},
            {u'return': u'539.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'-132825.0', u'now_mkt_val': u'-132286.0',
             u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'8673.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'1327382.0', u'now_mkt_val': u'1336055.0',
             u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'15399.439999999944', u'trd_type': u'long_related',
             u'pos_cash_flow': u'1274560.56', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'1289960.0', u'busi_date': u'2017-03-20',
             u'neg_cash_flow': u'1274560.56'},
            {u'return': u'197.7399999999907', u'trd_type': u'short_related',
             u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'-132825.0', u'busi_date': u'2017-03-20', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'3497135.97', u'now_mkt_val': u'1510820.28',
             u'busi_date': u'2017-03-20', u'neg_cash_flow': u'0.0'},
            {u'return': u'12845.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'-1333311.0', u'now_mkt_val': u'-1320466.0',
             u'busi_date': u'2017-03-17', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'3497135.97', u'now_mkt_val': u'3497135.97',
             u'busi_date': u'2017-03-17', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'2177000.0', u'now_mkt_val': u'3497135.97',
             u'busi_date': u'2017-03-16', u'neg_cash_flow': u'0.0'},
            {u'return': u'-17.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'2247.0', u'now_mkt_val': u'2230.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'15414.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'-1333311.0', u'now_mkt_val': u'-1317897.0',
             u'busi_date': u'2017-03-23', u'neg_cash_flow': u'0.0'},
            {u'return': u'-38.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'2285.0', u'now_mkt_val': u'2247.0',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'5138.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'-1338449.0', u'now_mkt_val': u'-1333311.0',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'-27671.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'1336055.0', u'now_mkt_val': u'1308384.0',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'-2808.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'210860.0', u'now_mkt_val': u'208052.0',
             u'busi_date': u'2017-03-22', u'neg_cash_flow': u'0.0'},
            {u'return': u'486.3400000000256', u'trd_type': u'long_related',
             u'pos_cash_flow': u'270696.66', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'271183.0', u'busi_date': u'2017-03-22',
             u'neg_cash_flow': u'270696.66'},
            {u'return': u'-2753.609999999986', u'trd_type': u'long_related',
             u'pos_cash_flow': u'694623.61', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'691870.0', u'busi_date': u'2017-03-21',
             u'neg_cash_flow': u'694623.61'},
            {u'return': u'-5420.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'1289960.0', u'now_mkt_val': u'1284540.0',
             u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'-2569.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'-1335880.0', u'now_mkt_val': u'-1338449.0',
             u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'0.0', u'trd_type': u'long_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'1510820.28', u'now_mkt_val': u'306163.19',
             u'busi_date': u'2017-03-21', u'neg_cash_flow': u'0.0'},
            {u'return': u'1299.6199999999953', u'trd_type': u'long_related',
             u'pos_cash_flow': u'209560.38', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'210860.0', u'busi_date': u'2017-03-21',
             u'neg_cash_flow': u'209560.38'},
            {u'return': u'-15414.0', u'trd_type': u'short_related', u'pos_cash_flow': u'0.0',
             u'exception_label': u'0', u'pre_mkt_val': u'-1320466.0', u'now_mkt_val': u'-1335880.0',
             u'busi_date': u'2017-03-20', u'neg_cash_flow': u'0.0'},
            {u'return': u'5451.600000000093', u'trd_type': u'long_related',
             u'pos_cash_flow': u'1321930.4', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'1327382.0', u'busi_date': u'2017-03-20',
             u'neg_cash_flow': u'1321930.4'},
            {u'return': u'150.9100000000035', u'trd_type': u'long_related',
             u'pos_cash_flow': u'128376.09', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'128527.0', u'busi_date': u'2017-03-20',
             u'neg_cash_flow': u'128376.09'},
            {u'return': u'-13175.030000000028', u'trd_type': u'short_related',
             u'pos_cash_flow': u'0.0', u'exception_label': u'0', u'pre_mkt_val': u'0.0',
             u'now_mkt_val': u'-1333311.0', u'busi_date': u'2017-03-16', u'neg_cash_flow': u'0.0'}])
        r2 = _travel_row(r, '2017-03-23')
        self.assertEqual(int(r2.get("return")), -44623)
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit


def metrics(session: SparkSession, dataframe: pyspark.sql.DataFrame, actual: str,
            predicted: str) -> pyspark.sql.DataFrame:
    '''
    Calculates evaluation metrics from predicted results.

    :param session: Active SparkSession used to build the result DataFrame
    :param dataframe: pyspark.sql.DataFrame with the observed and predicted values
    :param actual: Name of the column with observed target values
    :param predicted: Name of the column with predicted values
    :return: One-row pyspark.sql.DataFrame with Accuracy, Sensitivity, Specificity,
             Precision, Recall and F1
    '''

    # Rows of the crosstab hold the actual classes; columns hold the predicted classes
    dataframe = dataframe.withColumn(actual, col(actual).cast('integer'))
    dataframe = dataframe.withColumn(predicted, col(predicted).cast('integer'))
    cm = dataframe.crosstab(actual, predicted)
    cm = cm.sort(cm.columns[0], ascending=True)

    # Add a missing column in case only one class was predicted
    if '0' not in cm.columns:
        cm = cm.withColumn('0', lit(0))
    if '1' not in cm.columns:
        cm = cm.withColumn('1', lit(0))

    # Subset the values from the confusion matrix
    zero = cm.filter(cm[cm.columns[0]] == 0.0)
    first_0 = zero.take(1)

    one = cm.filter(cm[cm.columns[0]] == 1.0)
    first_1 = one.take(1)

    # Index by column name so the result does not depend on the column order
    # returned by crosstab (or on the columns added above)
    tn = first_0[0]['0']
    fp = first_0[0]['1']
    fn = first_1[0]['0']
    tp = first_1[0]['1']

    # Calculate metrics from the values in the confusion matrix
    if (tp == 0):
        acc = float((tp + tn) / (tp + tn + fp + fn))
        sen = 0
        spe = float((tn) / (tn + fp))
        prec = 0
        rec = 0
        f1 = 0
    elif (tn == 0):
        acc = float((tp + tn) / (tp + tn + fp + fn))
        sen = float((tp) / (tp + fn))
        spe = 0
        prec = float((tp) / (tp + fp))
        rec = float((tp) / (tp + fn))
        f1 = 2 * float((prec * rec) / (prec + rec))
    else:
        acc = float((tp + tn) / (tp + tn + fp + fn))
        sen = float((tp) / (tp + fn))
        spe = float((tn) / (tn + fp))
        prec = float((tp) / (tp + fp))
        rec = float((tp) / (tp + fn))
        f1 = 2 * float((prec * rec) / (prec + rec))

    # Print results
    print('Confusion Matrix and Statistics: \n')
    cm.show()

    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', dataframe.count(), '\n')

    print('Accuracy: {0:.2f}'.format(acc))
    print('Sensitivity: {0:.2f}'.format(sen))
    print('Specificity: {0:.2f}'.format(spe))
    print('Precision: {0:.2f}'.format(prec))
    print('Recall: {0:.2f}'.format(rec))
    print('F1-score: {0:.2f}'.format(f1))

    # Create a Spark DataFrame with the results
    results = [(acc, sen, spe, prec, rec, f1)]
    df = session.createDataFrame(results, ['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Recall', 'F1'])
    return df
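# A minimal usage sketch for metrics() above, assuming an active SparkSession
# bound to `spark`; the toy rows and the column names 'label' and 'prediction'
# are illustrative only and not part of the original snippet.
preds = spark.createDataFrame(
    [(1, 1), (0, 0), (1, 0), (0, 1), (1, 1)],
    ['label', 'prediction'])
metrics_df = metrics(spark, preds, actual='label', predicted='prediction')
metrics_df.show()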
)
Q8.show(50)

print(
    "-------------------------------Q9 ANSWER----------------------------------"
)
data2 = data.withColumnRenamed('user id', 'user-id')

# Create a DataFrame of 5-year age buckets, e.g. [Row(rang=5, ind='1-5'), Row(rang=10, ind='6-10'), ...]
rng = [(i * 5, '{}-{}'.format(i * 5 - 4, i * 5)) for i in range(1, 17)]

RG = spark.createDataFrame(rng, ['rang', 'ind'])
RG.createOrReplaceTempView('rnge')

Q9prep = (data2
          .withColumn("rating", data2["rating"].cast(IntegerType()))
          .join(item, data2["item id"] == item["movie id"]))
Q9prep2 = (Q9prep
           .withColumnRenamed("movie id", "movie_id")
           .withColumnRenamed("item id", "item_id")
           .join(user, user["user id"] == Q9prep["user-id"])
           .withColumnRenamed("user id", "user_id")
           .withColumn("age", user["age"].cast(IntegerType()))
           .withColumnRenamed("Children's", "children")
           .withColumnRenamed("Film-Noir", "filmnoir")
           .withColumnRenamed("Sci-Fi", "scifi")
           .drop("user-id"))
Q9prep2.createOrReplaceTempView("q9t")
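# A hedged sketch (not necessarily the original Q9 query) showing how the 'rnge'
# buckets could be joined against the 'q9t' view to aggregate ratings per age
# band; the aggregation and aliases below are illustrative only.
spark.sql("""
    select r.ind as age_group, count(*) as n_ratings, avg(q.rating) as avg_rating
    from q9t q
    join rnge r
      on q.age > r.rang - 5 and q.age <= r.rang
    group by r.ind
    order by min(r.rang)
""").show()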
class IonCentroidsGenerator(object):
    """ Generator of theoretical isotope peaks for all molecules in a database.

    Args
    ----------
    sc : pyspark.SparkContext
    moldb_name : str
    isocalc: IsocalcWrapper
    """
    def __init__(self, sc, moldb_name, isocalc):
        self._sc = sc
        self._moldb_name = moldb_name
        self._isocalc = isocalc
        self._sm_config = SMConfig.get_conf()
        self._parquet_chunks_n = 64
        self._iso_gen_part_n = 512

        self._spark_session = SparkSession(self._sc)
        self._ion_centroids_path = '{}/{}/{}/{}'.format(self._sm_config['isotope_storage']['path'],
                                                        self._moldb_name,
                                                        self._isocalc.sigma,
                                                        self._isocalc.charge)
        self.ion_df = None
        self.ion_centroids_df = None

    def exists(self):
        """ Check if ion centroids saved to parquet
        """
        if self._ion_centroids_path.startswith('s3a://'):
            cred_dict = dict(aws_access_key_id=self._sm_config['aws']['aws_access_key_id'],
                             aws_secret_access_key=self._sm_config['aws']['aws_secret_access_key'])
            bucket, key = split_s3_path(self._ion_centroids_path)
            s3 = boto3.client('s3', **cred_dict)
            try:
                s3.head_object(Bucket=bucket, Key=key + '/ions/_SUCCESS')
            except ClientError:
                return False
            else:
                return True
        else:
            return Path(self._ion_centroids_path + '/ions/_SUCCESS').exists()

    def generate(self, isocalc, sfs, adducts):
        """ Generate isotopic peaks

        Args
        ---
        isocalc: IsocalcWrapper
            Cannot be a class field as Spark doesn't allow to pass 'self' to functions
        adducts: list
        """
        logger.info('Generating molecular isotopic peaks')

        def calc_centroids(args):
            ion_i, sf, adduct = args
            mzs, ints = isocalc.ion_centroids(sf, adduct)
            if mzs is not None:
                return zip(repeat(ion_i),
                           range(0, len(mzs)),
                           map(float, mzs),
                           map(float, ints))
            else:
                return []

        ion_df = pd.DataFrame([(i, sf, adduct) for i, (sf, adduct) in
                               enumerate(sorted(product(sfs, adducts)))],
                              columns=['ion_i', 'sf', 'adduct']).set_index('ion_i')

        ion_centroids_rdd = (self._sc.parallelize(ion_df.reset_index().values,
                                                  numSlices=self._iso_gen_part_n)
                             .flatMap(calc_centroids))
        self.ion_centroids_df = (pd.DataFrame(data=ion_centroids_rdd.collect(),
                                              columns=['ion_i', 'peak_i', 'mz', 'int'])
                                 .sort_values(by='mz')
                                 .set_index('ion_i'))

        self.ion_df = ion_df.loc[self.ion_centroids_df.index.unique()]

        # Use when pandas DataFrames get way too big
        # ion_centroids_df = self._spark_session.createDataFrame(data=ion_centroids_rdd,
        #                                                        schema=self.ion_centroids_df_fields)
        # self.ion_centroids_df = (ion_centroids_df
        #                          .sort(ion_centroids_df.mz.asc())
        #                          .coalesce(self._parquet_chunks_n))

    def save(self):
        """ Save isotopic peaks
        """
        logger.info('Saving peaks')

        centr_spark_df = self._spark_session.createDataFrame(self.ion_centroids_df.reset_index())
        centr_spark_df.write.parquet(self._ion_centroids_path + '/ion_centroids', mode='overwrite')
        ion_spark_df = self._spark_session.createDataFrame(self.ion_df.reset_index())
        ion_spark_df.write.parquet(self._ion_centroids_path + '/ions', mode='overwrite')

    def restore(self):
        logger.info('Restoring peaks')

        self.ion_df = self._spark_session.read.parquet(
            self._ion_centroids_path + '/ions').toPandas().set_index('ion_i')
        self.ion_centroids_df = self._spark_session.read.parquet(
            self._ion_centroids_path + '/ion_centroids').toPandas().set_index('ion_i')

    def sf_adduct_centroids_df(self):
        return self.ion_df.join(self.ion_centroids_df).set_index(['sf', 'adduct'])

    def centroids_subset(self, ions):
        """ Restore isotopic peaks dataframe only for the 'ions'

        Args
        ---
        ions: list of tuples

        Returns
        ---
        : pandas.DataFrame
        """
        assert self.ion_df is not None

        ion_map = self.ion_df.reset_index().set_index(['sf', 'adduct']).ion_i
        ion_ids = ion_map.loc[ions].values
        return self.ion_centroids_df.loc[ion_ids].sort_values(by='mz')

    def generate_if_not_exist(self, isocalc, sfs, adducts):
        if not self.exists():
            self.generate(isocalc=isocalc, sfs=sfs, adducts=adducts)
            self.save()
        else:
            self.restore()

    def ions(self, adducts):
        return (self.ion_df[self.ion_df.adduct.isin(adducts)]
                .sort_values(by=['sf', 'adduct'])
                .to_records(index=False))
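# A minimal usage sketch for IonCentroidsGenerator, assuming an existing
# SparkContext `sc`, an IsocalcWrapper instance `isocalc`, and lists `sfs`
# (sum formulas) and `adducts` already defined elsewhere; the keyword values
# shown here are assumptions, not taken from the original snippet.
centr_gen = IonCentroidsGenerator(sc=sc, moldb_name='HMDB', isocalc=isocalc)
centr_gen.generate_if_not_exist(isocalc=isocalc, sfs=sfs, adducts=adducts)
peaks_df = centr_gen.centroids_subset(ions=[(sfs[0], adducts[0])])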
Ejemplo n.º 31
0
    # Extract the scalar count from each single-row count DataFrame
    cpsv_grava_counted = cpsv_grava.select('count').first()['count']

    cpsv_hielo_counted = cpsv_hielo.select('count').first()['count']

    cpsv_seca_counted = cpsv_seca.select('count').first()['count']
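    # The `schema` referenced below is defined earlier in the original script and
    # is not shown in this excerpt; the following is an assumed reconstruction with
    # a hypothetical first column name (only "Number of accidents" is known from
    # the orderBy call further down).
    from pyspark.sql.types import StructType, StructField, StringType, IntegerType
    schema = StructType([
        StructField("Condition", StringType(), True),
        StructField("Number of accidents", IntegerType(), True),
    ])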

    df_result = spark.createDataFrame(
        [("Condiciones Meteorologicas: Granizo", cpfa_granizo_counted),
         ("Condiciones Meteorologicas: Hielo", cpfa_hielo_counted),
         ("Condiciones Meteorologicas: Niebla", cpfa_nieve_counted),
         ("Condiciones Meteorologicas: Seco y Despejado", cpfa_seco_counted),
         ("Condiciones Meteorologicas: Nieve", cpfa_nieve_counted),
         ("Condiciones de la Via: Mojada", cpsv_mojada_counted),
         ("Condiciones de la Via: Derrape por aceite", cpsv_aceite_counted),
         ("Condiciones de la Via: Derrape por barro", cpsv_barro_counted),
         ("Condiciones de la Via: Via con grava", cpsv_grava_counted),
         ("Condiciones de la Via: Derrape por hielo", cpsv_hielo_counted),
         ("Condiciones de la Via: Siniestro en via seca y despejada",
          cpsv_seca_counted)], schema)
    df_result.orderBy(df_result["Number of accidents"].desc()).show(
        df_result.count(), False)