Example no. 1
#cluster_seeds = ['199.60.17.32'] #for loading to cluster, in any case
cluster_seeds = ['127.0.0.1']
spark = SparkSession.builder.appName('Data going to Cassandra').config('spark.cassandra.connection.host', ','.join(cluster_seeds)).getOrCreate()
assert spark.version >= '2.4'
spark.sparkContext.setLogLevel('WARN')
sc = spark.sparkContext
spark.conf.set("spark.sql.session.timeZone", "UTC")

craigslist_schema = types.StructType([
    types.StructField('posted',types.TimestampType()),
    types.StructField('region',types.StringType()),
    types.StructField('postingid',types.StringType()),
    types.StructField('image',types.StringType()),
    types.StructField('url',types.StringType()),
    types.StructField('labels',types.ArrayType(types.StringType())),
    types.StructField('beds',types.FloatType()),
    types.StructField('baths',types.FloatType()),
    types.StructField('city',types.StringType()),
    types.StructField('latitude',types.FloatType()),
    types.StructField('longitude',types.FloatType()),
    types.StructField('title', types.StringType()),
    types.StructField('price',types.FloatType()),
])

def transform(input_json):
    # labels - convert to lower and store as list
    label_arr = []
    for key in input_json['labels']:
        label_arr.append(key.lower())
    input_json['labels'] = label_arr
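    # Hedged continuation (the original snippet is truncated here): return the
    # mutated record so the transform can be applied in a map().
    return input_json


# Hedged usage sketch, not from the original: raw_rdd is a hypothetical RDD of
# parsed JSON dicts matching craigslist_schema, and the table/keyspace names
# are made up for illustration.
listings = spark.createDataFrame(raw_rdd.map(transform), schema=craigslist_schema)
listings.write.format('org.apache.spark.sql.cassandra') \
    .options(table='listings', keyspace='craigslist') \
    .mode('append') \
    .save()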
Example no. 2
assert sys.version_info >= (3, 5) # make sure we have Python 3.5+
assert spark.version >= '2.3' # make sure we have Spark 2.3+

comments_schema = types.StructType([
    types.StructField('archived', types.BooleanType()),
    types.StructField('author', types.StringType()),
    types.StructField('author_flair_css_class', types.StringType()),
    types.StructField('author_flair_text', types.StringType()),
    types.StructField('body', types.StringType()),
    types.StructField('controversiality', types.LongType()),
    types.StructField('created_utc', types.StringType()),
    types.StructField('distinguished', types.StringType()),
    types.StructField('downs', types.LongType()),
    types.StructField('edited', types.StringType()),
    types.StructField('gilded', types.LongType()),
    types.StructField('id', types.StringType()),
    types.StructField('link_id', types.StringType()),
    types.StructField('name', types.StringType()),
    types.StructField('parent_id', types.StringType()),
    types.StructField('retrieved_on', types.LongType()),
    types.StructField('score', types.LongType()),
    types.StructField('score_hidden', types.BooleanType()),
    types.StructField('subreddit', types.StringType()),
    types.StructField('subreddit_id', types.StringType()),
    types.StructField('ups', types.LongType()),
    #types.StructField('year', types.IntegerType()),
    #types.StructField('month', types.IntegerType()),
])


def main(in_directory, out_directory):
Example no. 3
assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.1'  # make sure we have Spark 2.1+

schema = types.StructType([  # commented-out fields won't be read
    #types.StructField('archived', types.BooleanType(), False),
    #types.StructField('author', types.StringType(), False),
    #types.StructField('author_flair_css_class', types.StringType(), False),
    #types.StructField('author_flair_text', types.StringType(), False),
    #types.StructField('body', types.StringType(), False),
    #types.StructField('controversiality', types.LongType(), False),
    #types.StructField('created_utc', types.StringType(), False),
    #types.StructField('distinguished', types.StringType(), False),
    #types.StructField('downs', types.LongType(), False),
    #types.StructField('edited', types.StringType(), False),
    #types.StructField('gilded', types.LongType(), False),
    #types.StructField('id', types.StringType(), False),
    #types.StructField('link_id', types.StringType(), False),
    #types.StructField('name', types.StringType(), False),
    #types.StructField('parent_id', types.StringType(), True),
    #types.StructField('retrieved_on', types.LongType(), False),
    types.StructField('score', types.LongType(), False),
    #types.StructField('score_hidden', types.BooleanType(), False),
    types.StructField('subreddit', types.StringType(), False),
    #types.StructField('subreddit_id', types.StringType(), False),
    #types.StructField('ups', types.LongType(), False),
])


def main(in_directory, out_directory):
    comments = spark.read.json(in_directory, schema=schema)
Example no. 4
import pyspark.sql.functions as psf
import pyspark.sql.types as pst

from streaming.spark import get_spark_context, enable_auto_compact, DELTA_FORMAT

schema = (pst.StructType()
          .add("match_id", pst.IntegerType())
          .add("price", pst.DoubleType())
          .add("ts", pst.TimestampType())
          .add("score", pst.ArrayType(pst.IntegerType())))

spark = get_spark_context("consumePrices")
# enable_auto_compact(spark)

(spark.readStream.format("kafka").option(
    "kafka.bootstrap.servers",
    "localhost:9092").option("subscribe", "prices").option(
        "startingOffsets", "earliest").load().selectExpr(
            "CAST(key AS STRING)", "CAST(value AS STRING)").withColumn(
                "value", psf.from_json("value", schema)).selectExpr(
                    "value.match_id AS match_id",
                    "value.price AS price",
                    "value.ts AS ts",
                    "value.score[0] AS home_score",
                    "value.score[1] AS away_score",
                ).withColumn("exec_date",
                             psf.to_date("ts")).writeStream.format(
                                 "delta").outputMode("append").option(
                                     "checkpointLocation",
                                     "./_checkpoints/streaming")
 # .option("mergeSchema", "true")
 .partitionBy("exec_date")
 # .trigger(processingTime='5 minute')
Example no. 5
# MAGIC Print the contents of `README.txt`.

# COMMAND ----------

print(dbutils.fs.head('dbfs:/movielens/README.txt'))

# COMMAND ----------

# MAGIC %md
# MAGIC Load movies from `movies.csv`.

# COMMAND ----------

MovieType = T.StructType([
    T.StructField('movieId', T.IntegerType()),
    T.StructField('title', T.StringType()),
    T.StructField('genres', T.StringType()),
])

movies = (spark.read
          .option('header', True)
          .csv(movielensLocation + 'movies.csv', schema=MovieType))

# COMMAND ----------

display(movies)

# COMMAND ----------

# MAGIC %md
# MAGIC Set `genres` to missing when equal to '(no genres listed)', otherwise split into a string array.
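
# COMMAND ----------

# Hedged sketch of the transformation described above (not part of the original
# notebook); it assumes `from pyspark.sql import functions as F`.
movies = movies.withColumn(
    'genres',
    F.when(F.col('genres') == '(no genres listed)', F.lit(None))
     .otherwise(F.split(F.col('genres'), r'\|')))
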
from pyspark.sql import SparkSession, functions, types
from io import *
import csv
import pandas as pd
from urllib.request import *
import getCodeSets as codesets
spark = SparkSession.builder.appName('Load Mortage Data').getOrCreate()

mortage_schema = types.StructType([
    types.StructField('date', types.StringType(), True),
    types.StructField('1y_fixed_posted', types.StringType(), True),
    types.StructField('2y_bond', types.StringType(), True),
    types.StructField('3y_bond', types.StringType(), True),
    types.StructField('3y_fixed_posted', types.StringType(), True),
    types.StructField('5y_bond', types.StringType(), True),
    types.StructField('5y_fixed_posted', types.StringType(), True),
    types.StructField('7y_bond', types.StringType(), True),
    types.StructField('10y_bond', types.StringType(), True),
    types.StructField('bank', types.StringType(), True),
    types.StructField('overnight', types.StringType(), True),
    types.StructField('overnight_target', types.StringType(), True),
    types.StructField('prime', types.StringType(), True),
])


def loadMortageInfo():
    mortage = spark.read.csv(
        "Other_sources/mortgage rate since 1935.csv",
        schema=mortage_schema).createOrReplaceTempView("mortage")
    transf_year_month = spark.sql(
        "SELECT *, substr(m.date, 1, instr(m.date, '-') +2) as year_month FROM mortage m "
from pyspark.sql.window import Window
from pyspark.sql import Row
from pyspark.sql.functions import pandas_udf, PandasUDFType
import pandas as pd
import numpy as np
from numba import vectorize, jit, njit, prange, cuda
from numba import float64 as numba_float64

DataPoint = collections.namedtuple(
    "DataPoint", ["id", "grp", "subgrp", "A", "B", "C", "D", "E", "F"])
DataPointSchema = DataTypes.StructType([
    DataTypes.StructField('id', DataTypes.LongType(), False),
    DataTypes.StructField('grp', DataTypes.LongType(), False),
    DataTypes.StructField('subgrp', DataTypes.LongType(), False),
    DataTypes.StructField('A', DataTypes.LongType(), False),
    DataTypes.StructField('B', DataTypes.LongType(), False),
    DataTypes.StructField('C', DataTypes.DoubleType(), False),
    DataTypes.StructField('D', DataTypes.DoubleType(), False),
    DataTypes.StructField('E', DataTypes.DoubleType(), False),
    DataTypes.StructField('F', DataTypes.DoubleType(), False)
])


def generateData(numGrp1=3, numGrp2=3, repetition=1000):
    return [
        DataPoint(id=i,
                  grp=(i // numGrp2) % numGrp1,
                  subgrp=i % numGrp2,
                  A=random.randint(1, repetition),
                  B=random.randint(1, repetition),
                  C=random.uniform(1, 10),
Example no. 8
import sys
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
import psycopg2
from pyspark.sql import SparkSession, functions as sf, types
spark = SparkSession.builder.appName('Cycle Data Load').config(
    'spark.driver.extraClassPath', 'postgresql-42.2.8.jar').getOrCreate()
spark.sparkContext.setLogLevel('WARN')
assert spark.version >= '2.4'  # make sure we have Spark 2.4+

cycles = types.StructType([
    types.StructField('Rental Id', types.IntegerType()),
    types.StructField('Duration', types.IntegerType()),
    types.StructField('Bike Id', types.IntegerType()),
    types.StructField('End Date', types.StringType()),
    types.StructField('EndStation Id', types.IntegerType()),
    types.StructField('EndStation Name', types.StringType()),
    types.StructField('Start Date', types.StringType()),
    types.StructField('StartStation Id', types.IntegerType()),
    types.StructField('StartStation Name', types.StringType()),
])

remfiles2019 = spark.read.option(
    "header", "true").schema(cycles).csv("data/cycling/rem2019/*.csv")
allfiles2016 = spark.read.option(
    "header", "true").schema(cycles).csv("data/cycling/2016TripDataZip/*.csv")
allfiles2015 = spark.read.option(
    "header", "true").schema(cycles).csv("data/cycling/2015TripDataZip/*.csv")
allfiles2014 = spark.read.option(
    "header",
    "true").schema(cycles).csv("data/cycling/cyclehireusagestats-2014/*.csv")
allfiles2013 = spark.read.option(
Example no. 9
import sys
from pyspark.sql import SparkSession, functions, types
# from pyspark.sql.functions import col

spark = SparkSession.builder.appName('wikipedia popular').getOrCreate()
spark.sparkContext.setLogLevel('WARN')

assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
assert spark.version >= '2.3'  # make sure we have Spark 2.3+

schema1 = types.StructType([
    types.StructField('lang', types.StringType()),
    types.StructField('content', types.StringType()),
    types.StructField('times', types.IntegerType()),
    types.StructField('bytes', types.IntegerType()),
])


def input_file_name(in_directory):
    return spark.read.csv(in_directory, schema=schema1,
                          sep=' ').withColumn('filename',
                                              functions.input_file_name())


# UDF: user-defined function.
# The slice [11:22] extracts the date-and-hour portion of the pagecounts
# filename (the 11 characters right after the 'pagecounts-' prefix).
def path_to_hour(path):
    filename = path.split('/')[-1]
    return filename[11:22]
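

# Hedged usage sketch (not part of the original snippet): wrap path_to_hour in
# a UDF and derive an 'hour' column from the filename column added above;
# 'pagecounts' is a hypothetical input directory.
path_to_hour_udf = functions.udf(path_to_hour, returnType=types.StringType())
pages = input_file_name('pagecounts').withColumn('hour', path_to_hour_udf('filename'))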

Example no. 10
from pyspark import SparkContext

from pyspark.sql import SparkSession, types

sparkSess = SparkSession.builder.appName('badges').getOrCreate()
sc = sparkSess.sparkContext
bdschema = types.StructType([
    types.StructField('id', types.IntegerType()),
    types.StructField('name', types.StringType()),
    types.StructField('date', types.StringType()),
    types.StructField('user_id', types.IntegerType()),
    types.StructField('class', types.IntegerType()),
    types.StructField('tag_based', types.BooleanType())
])

sbad = sparkSess.read.format("s3selectCSV").schema(bdschema).options(
    header="true").load("s3://bigdata-4/badges.csv").select("id", "name")

sbad.write.mode("append").parquet("s3://bigdata-4/badges/")
Example no. 11
 def __init__(self, tpe):
     # Seems we cannot specify field names. I currently gave some default names
     # `c0, c1, ... cn`.
     self.tpe = types.StructType([
         types.StructField("c%s" % i, tpe[i]) for i in range(len(tpe))
     ])  # type: types.StructType
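     # Hedged illustration (not in the original source): with
     # tpe = [types.DoubleType(), types.StringType()], self.tpe becomes
     # StructType([StructField('c0', DoubleType()), StructField('c1', StringType())]).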
Example no. 12
def get_dataset(
    dataset_type,
    data,
    schemas=None,
    profiler=ColumnsExistProfiler,
    caching=True,
    table_name=None,
    sqlite_db_path=None,
):
    """Utility to create datasets for json-formatted tests.
    """
    df = pd.DataFrame(data)
    if dataset_type == "PandasDataset":
        if schemas and "pandas" in schemas:
            schema = schemas["pandas"]
            pandas_schema = {}
            for (key, value) in schema.items():
                # Note: these are just names used in our internal schemas to build datasets *for internal tests*.
                # Further, changes in how pandas creates datetimes mean that, to support pandas pre-0.25,
                # we need to explicitly specify when we want a timezone.

                # We will use timestamp for timezone-aware (UTC only) dates in our tests
                if value.lower() in ["timestamp", "datetime64[ns, tz]"]:
                    df[key] = pd.to_datetime(df[key], utc=True)
                    continue
                elif value.lower() in [
                        "datetime", "datetime64", "datetime64[ns]"
                ]:
                    df[key] = pd.to_datetime(df[key])
                    continue
                try:
                    type_ = np.dtype(value)
                except TypeError:
                    type_ = getattr(pd.core.dtypes.dtypes, value)
                    # If this raises AttributeError it's okay: it means someone built a bad test
                pandas_schema[key] = type_
            # pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()}
            df = df.astype(pandas_schema)
        return PandasDataset(df, profiler=profiler, caching=caching)

    elif dataset_type == "sqlite":
        if not create_engine:
            return None

        if sqlite_db_path is not None:
            engine = create_engine(f"sqlite:////{sqlite_db_path}")
        else:
            engine = create_engine("sqlite://")
        conn = engine.connect()
        # Add the data to the database as a new table

        sql_dtypes = {}
        if (schemas and "sqlite" in schemas
                and isinstance(engine.dialect, sqlitetypes.dialect)):
            schema = schemas["sqlite"]
            sql_dtypes = {
                col: SQLITE_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "postgresql":
        if not create_engine:
            return None

        # Create a new database
        engine = create_engine("postgresql://postgres@localhost/test_ci")
        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and "postgresql" in schemas
                and isinstance(engine.dialect, postgresqltypes.dialect)):
            schema = schemas["postgresql"]
            sql_dtypes = {
                col: POSTGRESQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "mysql":
        if not create_engine:
            return None

        engine = create_engine("mysql+pymysql://root@localhost/test_ci")
        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and "mysql" in schemas
                and isinstance(engine.dialect, mysqltypes.dialect)):
            schema = schemas["mysql"]
            sql_dtypes = {
                col: MYSQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "mssql":
        if not create_engine:
            return None

        engine = create_engine(
            "mssql+pyodbc://sa:ReallyStrongPwd1234%^&*@localhost:1433/test_ci?driver=ODBC Driver 17 for SQL Server&charset=utf8&autocommit=true",
            # echo=True,
        )

        # If "autocommit" is not desired to be on by default, then use the following pattern when explicit "autocommit"
        # is desired (e.g., for temporary tables, "autocommit" is off by default, so the override option may be useful).
        # engine.execute(sa.text(sql_query_string).execution_options(autocommit=True))

        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and dataset_type in schemas
                and isinstance(engine.dialect, mssqltypes.dialect)):
            schema = schemas[dataset_type]
            sql_dtypes = {
                col: MSSQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "SparkDFDataset":
        from pyspark.sql import SparkSession
        import pyspark.sql.types as sparktypes

        SPARK_TYPES = {
            "StringType": sparktypes.StringType,
            "IntegerType": sparktypes.IntegerType,
            "LongType": sparktypes.LongType,
            "DateType": sparktypes.DateType,
            "TimestampType": sparktypes.TimestampType,
            "FloatType": sparktypes.FloatType,
            "DoubleType": sparktypes.DoubleType,
            "BooleanType": sparktypes.BooleanType,
            "DataType": sparktypes.DataType,
            "NullType": sparktypes.NullType,
        }

        spark = SparkSession.builder.getOrCreate()
        # We need to allow null values in some column types that do not support them natively, so we skip
        # use of df in this case.
        data_reshaped = list(
            zip(*[v for _, v in data.items()]))  # create a list of rows
        if schemas and "spark" in schemas:
            schema = schemas["spark"]
            # The typed-schema construction below sometimes makes Spark throw a TypeError;
            # the except branch falls back to string columns and casts them afterwards.
            try:
                spark_schema = sparktypes.StructType([
                    sparktypes.StructField(column,
                                           SPARK_TYPES[schema[column]](), True)
                    for column in schema
                ])
                # We create these every time, which is painful for testing.
                # However, nuances around null treatment, as well as the desire
                # for real datetime support in tests, make this necessary.
                data = copy.deepcopy(data)
                if "ts" in data:
                    print(data)
                    print(schema)
                for col in schema:
                    type_ = schema[col]
                    if type_ in ["IntegerType", "LongType"]:
                        # Ints cannot be None...but None can be valid in Spark (as Null)
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(int(val))
                        data[col] = vals
                    elif type_ in ["FloatType", "DoubleType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(float(val))
                        data[col] = vals
                    elif type_ in ["DateType", "TimestampType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(parse(val))
                        data[col] = vals
                # Do this again, now that we have done type conversion using the provided schema
                data_reshaped = list(
                    zip(*[v
                          for _, v in data.items()]))  # create a list of rows
                spark_df = spark.createDataFrame(data_reshaped,
                                                 schema=spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType([
                    sparktypes.StructField(column, sparktypes.StringType())
                    for column in schema
                ])
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(
                        c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType([
                sparktypes.StructField(column, sparktypes.StringType())
                for column in columns
            ])
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, profiler=profiler, caching=caching)

    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
Example no. 13
pathFile = "D:/douyinData/train_1w.txt"
# pathFile="D:/douyinData/final_track2_train.txt"
rawRdd_train = sc.textFile(pathFile).map(lambda line: line.split('\t'))
print('finished reading the rdd, start to build the action log rdd:')
actionLogRdd_train = rawRdd_train.map(lambda x: (
    int(x[0]), int(x[1]), int(x[2]), int(x[3]), int(x[4]), int(x[5]),
    int(x[6]), int(x[7]), int(x[8]), int(x[9]), int(x[10]), int(x[11])))
sqlContext = SQLContext(sc)
labels = [('uid', typ.IntegerType()), ('user_city', typ.IntegerType()),
          ('item_id', typ.IntegerType()), ('author_id', typ.IntegerType()),
          ('item_city', typ.IntegerType()), ('channel', typ.IntegerType()),
          ('finish', typ.IntegerType()), ('like', typ.IntegerType()),
          ('music_id', typ.IntegerType()), ('device', typ.IntegerType()),
          ('time', typ.LongType()), ('duration_time', typ.IntegerType())]
actionLogSchema = typ.StructType(
    [typ.StructField(e[0], e[1], True) for e in labels])

df_train = sqlContext.createDataFrame(actionLogRdd_train, actionLogSchema)

feature_group = ['uid', 'author_id']
df_tmp = df_train.select(feature_group)
df1 = df_tmp.groupby(feature_group).count()
df1.show(5)
df2 = df_tmp.groupby(feature_group[0]).count().withColumnRenamed(
    'count', feature_group[0] + '_count')
df2.show(5)

df1 = df1.join(df2, feature_group[0], 'left')
df1.show(5)
df1 = df1.withColumn(
    feature_group[1] + '_' + feature_group[0] + "_condition_ratio",
Example no. 14
 def spark_type(self) -> types.StructType:
     return types.StructType([field.struct_field for field in self.fields])
Example no. 15
                    .getOrCreate()

spark_session.sparkContext.addFile('parse_tool.py')

from parse_tool import parse_logs, parse_geoinfo

# User logs collection
user_logs = spark_session.sparkContext.textFile("/data/access_logs/big_log/")

parsed_logs = user_logs.map(parse_logs)\
                       .map(lambda parse_res : [
                         parse_res[0],
                         parse_res[0] + parse_res[7]
                       ])

schema = tp.StructType().add("ip", tp.StringType())\
                        .add("user_id", tp.StringType())

user_log_df = spark_session.createDataFrame(parsed_logs, schema)


# Geo info collection
geoip = spark_session.sparkContext.textFile("/data/access_logs/geoiplookup/")

parsed_geoip = geoip.map(parse_geoinfo)\
                    .map(lambda parse_res : [
                         parse_res[0],
                         parse_res[1]
                       ])

schema = tp.StructType().add("ip", tp.StringType())\
                        .add("location", tp.StringType())
import math
import re
import sys
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+

from pyspark.sql import SparkSession, functions, types, Row

spark = SparkSession.builder.appName('example').getOrCreate()
sc = spark.sparkContext

log_schema = types.StructType([
    types.StructField('hostname', types.StringType(), False),
    types.StructField('num_bytes', types.IntegerType(), False),
])


def get_row(line):
    line_dissemble = re.compile(
        r'^(\S+) - - \[(\S+) [+-]\d+\] \"[A-Z]+ (\S+) HTTP/\d\.\d\" \d+ (\d+)$'
    )
    match = re.search(line_dissemble, line)
    if match:
        host = match.group(1)
        num_bytes = int(match.group(4))
        return Row(host, num_bytes)
    return None


def create_row_rdd(in_directory):
storage_account_name = ""
storage_account_access_key = ""
container = ""

# Set configuration to allow access to the blob storage inside the storage account
file_location = f"wasbs://{container}@{storage_account_name}.blob.core.windows.net/"
file_type = "csv"
spark.conf.set(
  "fs.azure.account.key."+storage_account_name+".blob.core.windows.net",
  storage_account_access_key)

# Define schema and retrieve data from the blob storage
schema = t.StructType() \
        .add("time", t.StringType(), True) \
        .add("open", t.DoubleType(), True) \
        .add("close", t.DoubleType(), True) \
        .add("high", t.DoubleType(), True) \
        .add("low", t.DoubleType(), True) \
        .add("volume", t.DoubleType(), True) \
        .add("input_file", t.StringType(), True)
df = spark.read.format(file_type).options(header="true",inferSchema="true").schema(schema).load(file_location).withColumn("input_file", input_file_name())

# Get and split the file name to create a column with the coin pair corresponding to each row
split_col = split(df['input_file'], '/')
df = df.withColumn('coin_pair', split(split_col.getItem(3), r'\.').getItem(0))
df = df.drop("input_file")

# We have a timestamp and we want a date
df = df.withColumn('Date', from_unixtime((col('time')/1000)))

# Aggregate data to have a daily result, ready to insert into the database
df = df.groupBy("coin_pair",window("Date","1 day")) \
Example no. 18
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('first Spark app').getOrCreate()
spark.sparkContext.setLogLevel('WARN')

assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
assert spark.version >= '2.3'  # make sure we have Spark 2.3+

schema = types.StructType([
    types.StructField('id', types.IntegerType()),
    types.StructField('x', types.FloatType()),
    types.StructField('y', types.FloatType()),
    types.StructField('z', types.FloatType()),
])


def main(in_directory, out_directory):
    # Read the data from the JSON files
    xyz = spark.read.json(in_directory, schema=schema)
    #xyz.show(); return

    # Create a DataFrame with what we need: x, y, and id%10, which we'll aggregate by.
    with_bins = xyz.select(
        xyz['x'],
        xyz['y'],
        (xyz['id'] % 10).alias('bin'),
    )
    #with_bins.show(); return
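
    # Hedged continuation (not part of the original snippet): one plausible
    # aggregation over the bins, using the functions module imported above.
    grouped = with_bins.groupBy('bin').agg(
        functions.avg('x').alias('avg_x'),
        functions.avg('y').alias('avg_y'),
    )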
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql import types as sparksqltypes

sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()
sqlContext = SQLContext(sc)

schema = sparksqltypes.StructType([
    sparksqltypes.StructField("PassengerId", sparksqltypes.DoubleType(), True),
    sparksqltypes.StructField("Survived", sparksqltypes.DoubleType(), True),
    sparksqltypes.StructField("Pclass", sparksqltypes.DoubleType(), True),
    sparksqltypes.StructField("Name", sparksqltypes.StringType(), True),
    sparksqltypes.StructField("Sex", sparksqltypes.StringType(), True),
    sparksqltypes.StructField("Age", sparksqltypes.DoubleType(), True),
    sparksqltypes.StructField("SibSp", sparksqltypes.DoubleType(), True),
    sparksqltypes.StructField("Parch", sparksqltypes.DoubleType(), True),
    sparksqltypes.StructField("Ticket", sparksqltypes.StringType(), True),
    sparksqltypes.StructField("Fare", sparksqltypes.DoubleType(), True),
    sparksqltypes.StructField("Cabin", sparksqltypes.StringType(), True),
    sparksqltypes.StructField("Embarked", sparksqltypes.StringType(), True),
])

titanic = spark.read.csv('file:///C:/Users/Thibaut/Documents/ML/titanic_pyspark/titanic.csv',schema, header=True)

# ----------------------------------------------------------------------------------------------------------

def my_compute_function(titanic):
    
    # first step = feature engineering
    titanic = feature_engineering(titanic)
log_data_path = os.path.join(config["DATA"]["input_path"], "log_data/*.json")
output_path = config["DATA"]["output_path"]
tables = ["songs", "artists", "users", "time", "songplays"]

# Schema for log_data and song_data
schema = {
    "log_data": T.StructType() \
                    .add("artist", T.StringType())
                    .add("auth", T.StringType())
                    .add("firstName", T.StringType())
                    .add("gender", T.StringType())
                    .add("itemInSession", T.IntegerType())
                    .add("lastName", T.StringType())
                    .add("length", T.FloatType())
                    .add("level", T.StringType())
                    .add("location", T.StringType())
                    .add("method", T.StringType())
                    .add("page", T.StringType())
                    .add("registration", T.FloatType())
                    .add("sessionId", T.IntegerType())
                    .add("song", T.StringType())
                    .add("status", T.IntegerType())
                    .add("ts", T.StringType())
                    .add("userAgent", T.StringType())
                    .add("userId", T.StringType()),
    "song_data": T.StructType() \
                    .add("artist_id", T.StringType())
                    .add("artist_latitude", T.FloatType())
                    .add("artist_location", T.StringType())
                    .add("artist_longitude", T.FloatType())
                    .add("artist_name", T.StringType())
Example no. 21
from pyspark.sql import SparkSession, types

spark = SparkSession.builder.appName('Train Data Analysis').config(
    'spark.driver.extraClassPath', 'postgresql-42.2.8.jar').getOrCreate()
assert spark.version >= '2.4'  # make sure we have Spark 2.4+
spark.sparkContext.setLogLevel('WARN')
sc = spark.sparkContext

schema_counts = types.StructType([
    types.StructField('locationcode', types.IntegerType()),
    types.StructField('station', types.StringType()),
    types.StructField('borough', types.StringType()),
    types.StructField('note', types.StringType()),
    types.StructField('entryweekday', types.LongType()),
    types.StructField('entrysaturday', types.LongType()),
    types.StructField('entrysunday', types.LongType()),
    types.StructField('exitweekday', types.LongType()),
    types.StructField('exitsaturday', types.LongType()),
    types.StructField('exitsunday', types.LongType()),
    types.StructField('entryexitinmillion', types.DoubleType()),
])


def main():
    counts()


def counts():
    file_counts_17 = '/home/anuj/Desktop/732project/data/counts/2017entryexit.csv'
    data_counts_17 = spark.read.csv(file_counts_17,
Example no. 22
import sys
import re
import numpy as np
from pyspark.sql import SparkSession, functions, types, Row

spark = SparkSession.builder.appName('read txt').getOrCreate()

assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.1'  # make sure we have Spark 2.1+

schema = types.StructType([
    types.StructField('r', types.IntegerType(), False),
    types.StructField('g', types.IntegerType(), False),
    types.StructField('b', types.IntegerType(), False),
])


def some_function(path):
    return (path[0:11])


def split_func(string):
    return [int(x) for x in string.split(",")]


path_to_hour = functions.udf(some_function, returnType=types.StringType())


def main(in_directory, out_directory):
    ###
    sc = spark.sparkContext
    ('INFANT_ASSIST_VENTI_6HRS', typ.StringType()),
    ('INFANT_NICU_ADMISSION', typ.StringType()),
    ('INFANT_SURFACANT', typ.StringType()),
    ('INFANT_ANTIBIOTICS', typ.StringType()),
    ('INFANT_SEIZURES', typ.StringType()),
    ('INFANT_NO_ABNORMALITIES', typ.StringType()),
    ('INFANT_ANCEPHALY', typ.StringType()),
    ('INFANT_MENINGOMYELOCELE', typ.StringType()),
    ('INFANT_LIMB_REDUCTION', typ.StringType()),
    ('INFANT_DOWN_SYNDROME', typ.StringType()),
    ('INFANT_SUSPECTED_CHROMOSOMAL_DISORDER', typ.StringType()),
    ('INFANT_NO_CONGENITAL_ANOMALIES_CHECKED', typ.StringType()),
    ('INFANT_BREASTFED', typ.StringType())
]

schema = typ.StructType([typ.StructField(e[0], e[1], False) for e in labels])

# spark = SparkSession.builder.config('spark.debug.maxToStringFields', '100').config('spark.io.compression.codec', 'snappy').appName("test").getOrCreate();
# births = spark.read.csv(datafile, header=True, schema=schema, encoding='ISO-8859-1')  # with the schema bound

spark = SparkSession.builder.config(
    'spark.debug.maxToStringFields',
    '100').config('spark.io.compression.codec',
                  'snappy').appName("test").getOrCreate()

# Alternative read method:
# births=spark.read.format("csv").\
#     option("header","true")\
#     .load("births_train.csv")

births = spark.read.csv(datafile, header=True, schema=schema)
Example no. 24
# COMMAND ----------

nrmDssSchema = types.StructType([
    types.StructField("country", types.StringType()),
    types.StructField("calendar_year", types.StringType()),
    types.StructField("calendar_month", types.StringType()),
    types.StructField("calendar_day", types.StringType()),
    types.StructField("distributor", types.StringType()),
    types.StructField("site", types.StringType()),
    types.StructField("outlet", types.StringType()),
    types.StructField("billing_document", types.StringType()),
    types.StructField("billing_type", types.StringType()),
    types.StructField("billing_item", types.StringType()),
    types.StructField("product", types.StringType()),
    types.StructField("promotion_id", types.StringType()),
    types.StructField("promotion_desc1", types.StringType()),
    types.StructField("promo_start_date", types.StringType()),
    types.StructField("promo_end_date", types.StringType()),
    types.StructField("promotion_type", types.StringType()),
    types.StructField("value_based_promo_disc", types.DoubleType()),
    types.StructField("header_lvl_disc", types.DoubleType()),
    types.StructField("free_qty_in_cs", types.DoubleType()),
    types.StructField("free_qty_in_pc", types.DoubleType()),
    types.StructField("free_qty_val_in_cs", types.DoubleType()),
    types.StructField("free_qty_val_in_pc", types.DoubleType()),
    types.StructField("free_qty_retail_price_pc", types.DoubleType()),
    types.StructField("free_qty_retail_price_cs", types.DoubleType())
])

nrmRawDTRDISDF = spark.createDataFrame(nrmRawDTRDIS, schema=nrmDssSchema)
nrmRawDTRDISDF.createOrReplaceTempView("raw_nrm_data_dtrdis")
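
# Hedged usage sketch (not in the original notebook): an example query against
# the temp view; the aggregation choice is illustrative only.
promo_summary = spark.sql("""
    SELECT country, promotion_type,
           SUM(value_based_promo_disc) AS total_value_based_promo_disc
    FROM raw_nrm_data_dtrdis
    GROUP BY country, promotion_type
""")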
Example no. 25
import sys
import re
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('reddit averages').getOrCreate()
spark.sparkContext.setLogLevel('WARN')

assert sys.version_info >= (3, 5) # make sure we have Python 3.5+
assert spark.version >= '2.3' # make sure we have Spark 2.3+


wiki_schema = types.StructType([
    types.StructField('language', types.StringType()),
    types.StructField('title', types.StringType()),
    types.StructField('request', types.IntegerType()),
    types.StructField('bytes', types.IntegerType())
])

def find_path(path):
    return re.search(r"\d{8}-\d{2}", path)[0]

def main(in_directory, out_directory):
    data = spark.read.csv(in_directory, schema=wiki_schema, sep=' ').withColumn('filename', functions.input_file_name())
    data = data.filter(data['language'] == 'en')
    data = data.filter(data['title'] != 'Main_Page')
    # drop() removes columns, not rows; filter out the 'Special:' pages instead
    data = data.filter(~data['title'].startswith('Special:'))

    path_to_hour = functions.udf(find_path, returnType=types.StringType())
    data = data.withColumn('date', path_to_hour(data['filename'])).cache()
    # data = data.cache()
    group_data = data.groupby('date').max('request')
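
    # Hedged continuation (not part of the original snippet): keep the rows that
    # match each hour's maximum request count, then sort for output.
    joined = data.join(group_data, on='date')
    most_viewed = joined.filter(joined['request'] == joined['max(request)'])
    most_viewed = most_viewed.select('date', 'title', 'request').sort('date', 'title')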
Example no. 26
from io import *
import pandas as pd
from urllib.request import *

spark = SparkSession.builder.appName('Load Crime Data').getOrCreate()

# Schema for the crime data
crime_schema = types.StructType([
    types.StructField('REF_DATE', types.StringType(), True),
    types.StructField('GEO', types.StringType(), True),
    types.StructField('DGUID', types.StringType(), True),
    types.StructField('Violations', types.StringType(), True),
    types.StructField('Statistics', types.StringType(), True),
    types.StructField('UOM', types.StringType(), True),
    types.StructField('UOM_ID', types.StringType(), True),
    types.StructField('SCALAR_FACTOR', types.StringType(), True),
    types.StructField('SCALAR_ID', types.StringType(), True),
    types.StructField('VECTOR', types.StringType(), True),
    types.StructField('COORDINATE', types.StringType(), True),
    types.StructField('VALUE', types.StringType(), True),
    types.StructField('STATUS', types.StringType(), True),
    types.StructField('SYMBOL', types.StringType(), True),
    types.StructField('TERMINATE', types.StringType(), True),
    types.StructField('DECIMALS', types.StringType(), True),
])
'''
Description: download a zip file and extract its contents in memory.
Input: str -> URL of the response.
Output: pandas DataFrame -> file contents.
'''
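
# Hedged sketch of the helper described above (the original implementation is
# not shown in this snippet); the function name and the single-CSV assumption
# are mine. It relies on the star imports above (BytesIO from io, urlopen from
# urllib.request) plus zipfile.
import zipfile


def extract_zip_to_dataframe(url):
    # Download the archive and keep it entirely in memory.
    with urlopen(url) as response:
        archive = zipfile.ZipFile(BytesIO(response.read()))
    # Read the first member of the archive into a pandas DataFrame.
    return pd.read_csv(archive.open(archive.namelist()[0]))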

import random

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import types as st

output_path = "data1.csv"

spark = SparkSession.builder.master("local").getOrCreate()
generator = random.Random()
generator.seed(2077)

schema = st.StructType([
    st.StructField("user", st.StringType(), True),
    st.StructField("value", st.IntegerType(), True),
    st.StructField("time", st.IntegerType(), True),
])

data = [(
    generator.choice(["a", "b", "c", "d"]),
    generator.randint(0, 100),
    generator.randint(0, 1000),
) for _ in range(0, 100)]

dataframe = spark.createDataFrame(data, schema)
dataframe.write.mode("overwrite").csv(output_path, header=True)
Example no. 28
spark = SparkSession.builder.appName('Grape Resilience').getOrCreate()
spark.sparkContext.setLogLevel('WARN')

assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
assert spark.version >= '2.4'  # make sure we have Spark 2.4+

data_schema = types.StructType([
    types.StructField("FullName",
                      types.StringType()),  # Winery + wine name + year
    types.StructField("Winery", types.StringType()),
    types.StructField("WineName", types.StringType()),
    types.StructField("Year", types.IntegerType()),
    types.StructField("Region", types.StringType()),
    types.StructField("RegionalVariety", types.StringType()),  # Varietal?
    types.StructField("VintageRating",
                      types.FloatType()),  # Average rating for vintage
    types.StructField("VintageRatingCount", types.IntegerType()),
    types.StructField("WineRating",
                      types.FloatType()),  # Average rating across vintages
    types.StructField("WineRatingCount", types.IntegerType()),
    types.StructField("VintagePrice", types.FloatType()),  # Same as below
    types.StructField("WinePrice", types.FloatType()),  # GBP/750ml
    types.StructField("VintageRatingPrice", types.FloatType()),  # rating/price
    types.StructField("WineRatingPrice", types.FloatType())  # rating/price
])


def main():
    data = spark.read.csv("white-wine-price-rating.csv",
                          header=True,
                          schema=data_schema)
cluster_seeds = ['199.60.17.171', '199.60.17.188']

conf = SparkConf().setAppName('example code') \
    .set('spark.cassandra.connection.host', ','.join(cluster_seeds))

spark = SparkSession.builder.appName('Big Data Project').getOrCreate()
sc = spark.sparkContext
assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.2'  # make sure we have Spark 2.2+

schema = types.StructType([
    types.StructField('state_code', types.IntegerType(), True),
    types.StructField('month', types.IntegerType(), True),
    types.StructField('year', types.IntegerType(), True),
    types.StructField('am_rh', types.DoubleType(), True)
])

train_final = spark.createDataFrame(sc.emptyRDD(), schema=schema)

for year in range(2013, 2018):
    support = spark.read.csv(
        "/home/ldua/Desktop/BigDataProject/support/daily_RH_DP_" + str(year) +
        ".csv",
        header=True)

    support_f = support.select('State Code', 'Date Local', 'Arithmetic Mean')
    split_col = functions.split(support_f['Date Local'], '-')
    support_f = support_f.withColumn('Year', split_col.getItem(0))
    support_f = support_f.withColumn('Month', split_col.getItem(1))
Example no. 30
    def data_describe(self):
        print('start to read data for rdd:')
        rawRdd_face = self.read_rdd('track2_face_attrs.txt').map(lambda line : json.loads(line))
        # rawRdd_face.cache()
        global keys
        keys=['item_id','gender','beauty','relative_position']
        rawRdd_face2=rawRdd_face.map(lambda dic:{key :jsonpath.jsonpath(dic,'$..'+key)[0] if jsonpath.jsonpath(dic,'$..'+key) else None  for key in keys})
        print(rawRdd_face2.take(10))
        # Convert to a DataFrame; if no schema is given the types are inferred automatically
        sqlContext = SQLContext(self.sc)
        labels=[
            ('item_id',typ.IntegerType()),
            ('gender',typ.IntegerType()),
            ('beauty',typ.FloatType()),
            ('relative_position',typ.ArrayType(typ.FloatType()))]
        Schema=typ.StructType([typ.StructField(e[0],e[1],True) for e in labels])
        df = sqlContext.createDataFrame(rawRdd_face2,Schema)

        attrs = self.sc.parallelize(["relative_position_" + str(i) for i in range(4)]).zipWithIndex().collect()
        print("列名:", attrs)
        for name, index in attrs:
            df = df.withColumn(name, fn.bround(df['relative_position'].getItem(index), scale=3))
        # Drop relative_position
        df_face = df.drop('relative_position')
        del df
        gc.collect()


        # print('------- saving df_face data -------')
        # file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'face_feature'
        # os.system("hadoop fs -rm -r {}".format(file_path))  # os.system(command): command is the shell command to run
        # df_face.rdd.map(tuple).saveAsPickleFile(file_path)
        # print('finished saving the data')

        print('start to read action data, only for uid and item_id:')
        rawRdd_train = self.read_rdd('final_track2_train.txt').map(lambda line : line.split('\t'))
        rawRdd_test = self.read_rdd('final_track2_test_no_anwser.txt').map(lambda line : line.split('\t'))
        actionLogRdd_train = rawRdd_train.map(
            lambda x :(int(x[0]), int(x[2])))
        # total = actionLogRdd_train.count()
        # print('total: ' + str(total))
        actionLogRdd_test = rawRdd_test.map(
            lambda x :(int(x[0]), int(x[2])))

        sqlContext = SQLContext(self.sc)
        labels=[('uid',typ.IntegerType()),
            ('item_id',typ.IntegerType())
            ]

        actionLogSchema=typ.StructType([typ.StructField(e[0],e[1],True) for e in labels])

        dfactionLog_train = sqlContext.createDataFrame(actionLogRdd_train, actionLogSchema)
        dfactionLog_test = sqlContext.createDataFrame(actionLogRdd_test, actionLogSchema)

        # Join on item_id
        df_face = df_face.select(["item_id", "gender", "beauty"])
        df_uid_face_test=dfactionLog_test.select(["uid","item_id"]).join(df_face,'item_id','left').drop("item_id")
        df_uid_face_train=dfactionLog_train.select(["uid","item_id"]).join(df_face,'item_id','left').drop("item_id")
        del dfactionLog_test
        del dfactionLog_train
        gc.collect()

        # Only process the uids that appear in the training set
        gdf=df_uid_face_train.groupby("uid")
        df1=gdf.agg(fn.max("beauty").alias("uid_max_beauty"),fn.bround(fn.avg("beauty"),scale=3).alias("uid_avg_beauty"),fn.bround((fn.sum("gender")/fn.count("gender")),scale=3).alias("uid_male_ratio"))
        df1.show(1,truncate=False)
        # In the end only df1 needs to be kept
        print('------- saving uid_face data -------')
        file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'uid_face_train'
        os.system("hadoop fs -rm -r {}".format(file_path))  # os.system(command): command is the shell command to run
        df1.rdd.map(tuple).saveAsPickleFile(file_path)
        print('finished saving the data')