コード例 #1
0
ファイル: DfTransf.py プロジェクト: mood-agency/optimus
    def dateTransform(self, columns, currentFormat, outputFormat):
        """
        :param  columns         Name of the date column(s) to be transformed. Columns have to be a string
                                or a list of strings.
        :param  currentFormat   Current string date format of the columns specified. All specified columns
                                must share this format; otherwise the function will return many null values,
                                because the transformation fails for any column in a different format.
        :param  outputFormat    Output date string format expected for the result.
        """
        # Check that the currentFormat argument is a string:
        self.__assertTypeStr(currentFormat, "currentFormat")
        # Check that the outputFormat argument is a string:
        self.__assertTypeStr(outputFormat, "outputFormat")
        # Check that the columns argument is a string or a list:
        self.__assertTypeStrOrList(columns, "columns")

        if isinstance(columns, str): columns = [columns]

        # Check that the columns to be processed are in the dataframe
        self.__assertColsInDF(columnsProvided=columns, columnsDF=self.__df.columns)

        exprs = [date_format(unix_timestamp(c, currentFormat).cast("timestamp"), outputFormat).alias(
            c) if c in columns else c for c in self.__df.columns]

        self.__df = self.__df.select(*exprs)

        self.__addTransformation()  # checkpoint in case

        return self
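For reference, a minimal standalone sketch of the same transformation outside the class; the DataFrame, column name, and formats below are hypothetical:

from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format, unix_timestamp

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('25/11/1991',)], ['created_at'])
# reformat the string date column from dd/MM/yyyy to yyyy-MM-dd
df = df.withColumn(
    'created_at',
    date_format(unix_timestamp('created_at', 'dd/MM/yyyy').cast('timestamp'), 'yyyy-MM-dd'))
df.show()  # expected value: 1991-11-25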
コード例 #2
0
ファイル: DfTransf.py プロジェクト: mood-agency/optimus
    def ageCalculate(self, column, dateFormat, nameColAge):
        """
        This method computes the age of clients based on their birth dates.
        :param  column      Name of the column containing the birth dates.
        :param  dateFormat  String date format of the column provided.
        :param  nameColAge  Name of the new column that will hold the resulting ages.

        """
        # Check that the column argument is a string:
        self.__assertTypeStr(column, "column")

        # Check that the dateFormat argument is a string:
        self.__assertTypeStr(dateFormat, "dateFormat")

        # Assert that the column exists in the dataFrame:
        assert column in self.__df.columns, "Error: Column assigned in column argument does not exist in dataFrame"

        # Output format date
        Format = "yyyy-MM-dd"  # Some SimpleDateFormat string

        exprs = format_number(
            mag(
                months_between(date_format(
                    unix_timestamp(column, dateFormat).cast("timestamp"), Format), current_date()) / 12), 4).alias(
            nameColAge)

        self.__df = self.__df.withColumn(nameColAge, exprs)

        self.__addTransformation()  # checkpoint in case

        return self
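For reference, a standalone sketch of the same age computation; the DataFrame, column name, and date format are hypothetical, and abs stands in for the mag helper used above:

from pyspark.sql import SparkSession
from pyspark.sql.functions import abs as abs_, current_date, date_format, format_number, months_between, unix_timestamp

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('1990-05-15',)], ['birth_date'])
# age in years = |months between birth date and today| / 12, formatted to 4 decimals
age_col = format_number(
    abs_(months_between(
        date_format(unix_timestamp('birth_date', 'yyyy-MM-dd').cast('timestamp'), 'yyyy-MM-dd'),
        current_date()) / 12), 4).alias('age')
df.withColumn('age', age_col).show()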
コード例 #3
0
ファイル: etl.py プロジェクト: faizana/rbc_challenge
def parse_dates(df, format):
    """
    Parses the date into year, month, and day columns
    :param df: input df
    :param format: the format of the timestamp
    :return: dataframe
    """
    return df.withColumn('parsed_date',
                         f.to_timestamp(f.col('transaction_date'), format)) \
        .withColumn("year", f.year(f.col('parsed_date'))) \
        .withColumn("month", f.month(f.col('parsed_date'))) \
        .withColumn("day", f.dayofmonth(f.col('parsed_date'))) \
        .withColumn("unix_ts", f.unix_timestamp('parsed_date')) \
        .drop("transaction_date")
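A minimal usage sketch of parse_dates; the input row and format string are hypothetical:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('2019-03-01 10:15:00',)], ['transaction_date'])
# expected output columns: parsed_date, year, month, day, unix_ts
parse_dates(df, 'yyyy-MM-dd HH:mm:ss').show()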
コード例 #4
0
ファイル: server.py プロジェクト: nlesc-sherlock/corpora-vis
def do_something_only_once():
    # the command I use to run this script:
    #~/spark-1.6.1/bin/spark-submit --packages=com.databricks:spark-avro_2.10:2.0.1,com.databricks:spark-csv_2.10:1.4.0 server.py
    global topdis, meta, dic, towo, cluto, doctopdat, maxdate, mindate, lda
    ## Loading of data
    sc = SparkContext(appName='Simple App') #"local"
    sqlContext = SQLContext(sc)
    # Load metadata avro
    reader = sqlContext.read.format('com.databricks.spark.avro')
    meta = reader.load('data/spark_metadata.avro')
    # # Loading topic distributions
    topdisFile = 'data/spark_output.tuples'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    topdis = csvLoader.options(delimiter=',',header='false', inferschema='true').load(topdisFile)
    strip_first_col_int = udf(lambda row: int(row[1:]), IntegerType())
    topdis = topdis.withColumn('C0',strip_first_col_int(topdis['C0']))
    strip_first_col_float = udf(lambda row: float(row[1:]), FloatType())
    topdis = topdis.withColumn('C1',strip_first_col_float(topdis['C1']))
    strip_last_col = udf(lambda row: float(row[:-2]), FloatType())
    topdis = topdis.withColumn('C20',strip_last_col(topdis['C20']))
    # # Load dictionary CSV
    dicFile = 'data/spark_dic.csv'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    dic = csvLoader.options(delimiter='\t', header='false', inferschema='true').load(dicFile)
    dic = dic.select(dic['C0'].alias('id'), dic['C1'].alias('word'), dic['C2'].alias('count'))
    ldaFile = 'data/spark_lda.csv'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    lda = csvLoader.options(delimiter='\t', header='false', inferschema='true').load(ldaFile)
    lda = lda.select(rowNumber().alias('id'), lda.columns).join(dic, dic.id == lda.id, 'inner').cache()
    # dic = dic.select(dic['C0'].alias('id'), dic['C1'].alias('word'), dic['C2'].alias('count'))
    # # # Load clustertopics CSV
    # clutoFile = 'enron_small_clustertopics.csv'
    # csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    # cluto = csvLoader.options(delimiter=',', header='false', inferschema='true').load(clutoFile)
    # # # Load topicswords CSV
    # towoFile = 'enron_small_lda_transposed.csv'
    # csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    # towo = csvLoader.options(delimiter=',', header='false', inferschema='true').load(towoFile)
    # # Merge topdis which has document id and with metadata, based on document id
    metasmall = meta.select('id',unix_timestamp(meta['date'],"yyyy-MM-dd'T'HH:mm:ssX").alias("timestamp"))
    doctopdat = topdis.join(metasmall, metasmall.id == topdis.C0,'inner').cache()
    maxdate = doctopdat.select(max('timestamp').alias('maxtimestamp')).collect()[0]['maxtimestamp']
    mindate = doctopdat.select(min('timestamp').alias('mintimestamp')).collect()[0]['mintimestamp']
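As an aside, a hedged Spark 2.x equivalent of the spark-csv loads above, assuming a SparkSession named spark and the same file paths:

topdis = spark.read.csv('data/spark_output.tuples', sep=',', header=False, inferSchema=True)
dic = spark.read.csv('data/spark_dic.csv', sep='\t', header=False, inferSchema=True)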
コード例 #5
0
# query5.show()
# pd = query5.toPandas()
# pd.to_csv('static/Input/byDevice.csv', index=False)

# Query 6: Number of tweets by Language

query6 = spark.sql(
    "SELECT user.lang, count(*) AS Lang_Count FROM tweets where user.lang is "
    "not null group by user.lang order by Lang_Count desc LIMIT 10")
query6.show()
pd = query6.toPandas()
pd.to_csv('static/Input/byLanguage.csv', index=False)

year = df.select(
    date_format((from_unixtime(
        unix_timestamp('user.created_at',
                       'EEE MMM dd HH:mm:ss ZZZZZ yyyy')).alias('date')),
                'MM/dd/yyyy').alias('date'))
year.printSchema()
year.createOrReplaceTempView("year")

# Query 7: Number of tweets by Year

query7 = spark.sql(
    "select count(*) as NumberOfTweets, substr(date, 7, 11) as MovieYear from year group by substr(date, 7, 11) "
    "order by substr(date, 7, 11) desc")
pd = query7.toPandas()
pd.to_csv('static/Input/byYear.csv', index=False)
query7.show()

# Query 8: Hashtag Count
コード例 #6
0
spark = SparkSession.builder.appName('App1').config(
    conf=SparkConf()).getOrCreate()
today = datetime.date.today()
for x in range(1, 9):
    date1 = str(today - datetime.timedelta(days=x))

    data = 's3://paragon-datalake/googleanalytics/landing/product_details/product_details_page' + date1 + '.csv'

    df = spark.read.format('csv').options(header='true',
                                          inferSchema='true',
                                          sep='\t').load(data)

    l1 = df.schema.fieldNames()
    l2 = []
    for header in l1:
        x = "-".join(header.split(" "))
        l2.append(x)

    data = df.toDF(*l2)
    #data.show()
    data = data.withColumn("date", data["date"].cast('String'))
    #data.printSchema()

    data = data.withColumn(
        'date',
        to_date(unix_timestamp(col('date'), 'yyyyMMdd').cast("timestamp")))

    data.write.mode("overwrite").parquet(
        's3://paragon-datalake/googleanalytics/processing/product_details/product_details_page'
        + date1)
コード例 #7
0
from datetime import datetime
from pyspark.sql.functions import to_date, udf
from pyspark.sql.types import DateType
from pyspark.sql.functions import unix_timestamp, from_unixtime
from Pipelines.getSparkSession import getSparkSession

spark = getSparkSession()
sc = spark.sparkContext

df = spark.createDataFrame([('1997-02-28', )], ['t'])
df.select(to_date(df.t, 'yyyy-MM-dd').alias('dt')).collect()

from pyspark.sql.functions import from_unixtime

df = spark.createDataFrame([("11/25/1991", ), ("11/24/1991", ),
                            ("11/30/1991", )], ['date_str'])

df2 = df.select(
    'date_str',
    from_unixtime(unix_timestamp('date_str', 'MM/dd/yyy')).alias('date'))

df2

df2.show()
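For reference, the round trip through unix_timestamp and from_unixtime above should print something like the following (rendered in the session time zone):

+----------+-------------------+
|  date_str|               date|
+----------+-------------------+
|11/25/1991|1991-11-25 00:00:00|
|11/24/1991|1991-11-24 00:00:00|
|11/30/1991|1991-11-30 00:00:00|
+----------+-------------------+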
コード例 #8
0
def get_static_features(spark):
    df_admissions = spark.read.csv(
        os.path.join(PATH_MIMIC_ORIGINAL_CSV_FILES, 'ADMISSIONS.csv'),
        header=True,
        inferSchema="false").drop("EXPIRE_FLAG").drop("DISCHARGE_LOCATION")
    df_patients = spark.read.csv(
        os.path.join(PATH_MIMIC_ORIGINAL_CSV_FILES, 'PATIENTS.csv'),
        header=True,
        inferSchema="false").select(['SUBJECT_ID', 'DOB', 'GENDER'])

    df_merge = df_admissions.join(df_patients, ['SUBJECT_ID'])

    timeFmt = "yyyy-MM-dd' 'HH:mm:ss"  # 2153-09-03 07:15:00
    timeDiff = F.round((F.unix_timestamp('ADMITTIME', format=timeFmt) -
                        F.unix_timestamp('DOB', format=timeFmt)) /
                       (60 * 60 * 24 * 365.242)).cast('integer')
    df_merge = df_merge.withColumn("AGE_ADMISSION", timeDiff)

    df_merge = QuantileDiscretizer(
        numBuckets=5, inputCol='AGE_ADMISSION',
        outputCol='QAGE').fit(df_merge).transform(df_merge)

    t = {
        0.0: 'very-young',
        1.0: 'young',
        2.0: 'normal',
        3.0: 'old',
        4.0: 'very-old'
    }
    udf_age = udf(lambda x: t[x], StringType())
    df_merge = df_merge.withColumn('AGE', udf_age('QAGE'))
    df_merge = reduce(DataFrame.drop, [
        'SUBJECT_ID', 'ROW_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
        'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'LANGUAGE', 'RELIGION',
        'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS', 'HOSPITAL_EXPIRE_FLAG',
        'HAS_CHARTEVENTS_DATA', 'GENDER', 'DOB', 'DOD', 'DOD_HOSP', 'DOD_SSN',
        'EXPIRE_FLAG', 'AGE_ADMISSION', 'QAGE'
    ], df_merge)

    df_merge = df_merge.fillna({'MARITAL_STATUS': 'UNKNOWN_MARITAL'})

    categories = list(
        set(
            flatten([
                list(df_merge.select(c).distinct().collect())
                for c in df_merge.columns if c not in ['HADM_ID']
            ])))

    categories_dict = {}
    for i in range(len(categories)):
        categories_dict[categories[i]] = float(i)

    def mapFnLabels(row):
        one = categories_dict[row.AGE]
        two = categories_dict[row.ETHNICITY]
        three = categories_dict[row.MARITAL_STATUS]
        four = categories_dict[row.INSURANCE]
        feat_array = [one, two, three, four]
        return (row.HADM_ID, feat_array)

    hadmid_to_static_feats = df_merge.rdd.map(mapFnLabels)

    return hadmid_to_static_feats
コード例 #9
0
#Assign unique continuous numbers to the rows of a dataframe
Z = spark.createDataFrame(d.select("colid").distinct().rdd.map(lambda x: x[0]).zipWithUniqueId())


#A window function calculates a return value for every input row of a table based on a group of rows, called the Frame
from pyspark.sql.window import Window
window = Window.partitionBy(tmp['prediction']).orderBy(df['clust_count'].desc())
tmp.select('*', F.rank().over(window).alias('rank')) .filter(col('rank') <= 5).head(5)


#K-means cluster	
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler
d = scaler.fit(d).transform(d)
kmeans = KMeans(k=5, seed=1, featuresCol="scaled")
model = kmeans.fit(d)
centers = model.clusterCenters()


#Converting date to timestamp
x = x.withColumn("unix_time", F.unix_timestamp(F.col("DATETIME"), format='yyyy-MM-dd HH:mm:ss'))
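# Added sketch (not in the original cheat sheet): convert the unix timestamp back to a formatted string
x = x.withColumn("datetime_str", F.from_unixtime(F.col("unix_time"), 'yyyy-MM-dd HH:mm:ss'))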

#pivot(pivot_col, values=None); pivot_col – Name of the column to pivot. values – List of values that will be translated to columns in the output DataFrame
df4.groupBy("year").pivot("course", ["dotNET", "Java"]).sum("earnings").collect()

#Groupby and count distinct / unique
df.groupby("device").agg(F.countDistinct(F.col("colid"))).toPandas()
df.groupby("device").agg(F.countDistinct(F.col("id"))).orderBy('count(DISTINCT id)', ascending = False)
コード例 #10
0
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.functions import *
from pyspark.sql.types import *

## read data from mount point
read_path = "/bmathew/clickstream/000000_0"

jsonSchema = StructType([
    StructField("http_vhost_name", StringType(), True),
    StructField("event_time", TimestampType(), True),
    StructField("page", StringType(), True),
    StructField("user", StringType(), True),
    StructField("session", StringType(), True)
])

df = spark.read.format("json").schema(jsonSchema).load(read_path) \
    .select(current_timestamp().alias('event_load_time')
            ,from_unixtime(unix_timestamp(from_utc_timestamp(from_unixtime(unix_timestamp()), "PST")),"yyyy-MM-dd HH:mm:ss").alias("event_insert_time")
            ,col("http_vhost_name").alias("domain_name")
            ,regexp_replace(regexp_replace(col("event_time"),'T',' '),'Z','').alias("event_time")
            ,get_json_object('user', '$.browserId').alias('user_id'),get_json_object('page', '$.country').alias('country')
            ,get_json_object('page', '$.division').alias('division'),get_json_object('page', '$.app').alias('page_app')
            ,get_json_object('page', '$.id').alias('page_id'),get_json_object('user', '$.browser').alias('browser')
            ,get_json_object('user', '$.os').alias('os'),get_json_object('user', '$.platform').alias('platform')) \
    .where("country IN ('MX','TW','PE','CO','UY','PR','CL','ID','CA','GB','JP','US','PA')")

df.write.format("parquet").mode("overwrite").saveAsTable(
    "bmathew.test_databricks_api")
コード例 #11
0
def test_string_unix_timestamp(data_gen, date_form):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, data_gen, seed=1).select(
            f.unix_timestamp(f.col('a'), date_form)))
コード例 #12
0
                                     fc_customer["SourceSystemCustomerId"])
va_customer = va_customer.withColumn("CustomerID",
                                     va_customer["SourceSystemCustomerId"])
sr_customer = sr_customer.withColumn("CustomerID",
                                     sr_customer["SourceSystemCustomerId"])

# COMMAND ----------

# DBTITLE 1,Convert FourthCoffee "CreatedDate"  & "UpdatedDate" to date datatype
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.functions import from_unixtime
from pyspark.sql.functions import to_date

fc_customer = fc_customer.withColumn(
    "CreatedDate",
    to_date(unix_timestamp("CreatedDate", "yyyy-MM-dd").cast("timestamp")))
fc_customer = fc_customer.withColumn(
    "UpdatedDate",
    to_date(unix_timestamp("UpdatedDate", "yyyy-MM-dd").cast("timestamp")))
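# Hedged note (added): on Spark 2.2+ the same conversion can be written with to_date's format argument, e.g.
# fc_customer = fc_customer.withColumn("CreatedDate", to_date("CreatedDate", "yyyy-MM-dd"))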

# COMMAND ----------

# DBTITLE 1,Convert FourthCoffee "ZipCode" to a string datatype
fc_customer = fc_customer.withColumn("ZipCode", fc_customer["ZipCode"].cast("string"))

# COMMAND ----------

# DBTITLE 1,Convert "CustomerId" & "PhoneNumber" to a Long data type
from pyspark.sql.types import IntegerType

fc_customer = fc_customer.withColumn(
コード例 #13
0
#import plotly.graph_objs as go
#from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import shutil
#init_notebook_mode(connected=True)
spark = SparkSession.builder.appName(
    "Python Spark SQL basic example").getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
df = sqlContext.read.load('/test_dev/mba-code/dataset.csv',
                          format='csv',
                          header='true',
                          inferSchema='true')
df.select(
    "InvoiceNo", "StockCode", "Description", "Quantity", "InvoiceDate",
    "InvoiceDateWS",
    date_format(from_unixtime(unix_timestamp('InvoiceDateWS', 'MM/dd/yyyy')),  # 'MM' is month; lowercase 'mm' would mean minutes
                'EEEE').alias('weekday'), "CustomerID",
    "Country").write.save("Invoices.parquet", format="parquet")
parquetFile = spark.read.parquet("Invoices.parquet")
# Parquet files can also be used to create a temporary view and then used in SQL statements.
parquetFile.createOrReplaceTempView("parquetFile")
from operator import add
from apyori import apriori, load_transactions
import pyspark.sql.functions as f
from mlxtend.preprocessing import TransactionEncoder
from pyspark.sql.functions import explode
DescriptionGrp = spark.sql(
    "SELECT distinct InvoiceNo,StockCode FROM parquetFile group by InvoiceNo,StockCode"
)
print(DescriptionGrp.rdd.take(2))
minSupport = 0.05 * DescriptionGrp.rdd.count()
コード例 #14
0
print("\n\n\nSample Records of Filtering are\n\n\n")
df_filter.show(10, False)

df1 = df_filter.withColumn(
    "prevTimestamp",
    lag(df_filter.timestamp, 1).over(
        Window.partitionBy(df_filter.ip_address).orderBy(df_filter.timestamp)))

#defining the time-frame window duration of 15 minutes in seconds, and the time format
timeFmt = "yyyy-MM-dd'T'HH:mm:ss.SSS"
timeWindow = 900

#Adding new columns that are needed in the calculations
df2 = df1.withColumn(
    "diff_seconds",
    F.unix_timestamp(df1.timestamp, format=timeFmt) -
    F.unix_timestamp(df1.prevTimestamp, format=timeFmt))

df3 = df2.withColumn("isNewSession",
                     F.when(df2.diff_seconds <= timeWindow, 0).otherwise(1))

df4 = df3.withColumn(
    "session_id",
    sum(df3.isNewSession).over(
        Window.partitionBy(df3.ip_address).orderBy(df3.ip_address,
                                                   df3.timestamp)))

df5 = df4.withColumn(
    "session_count_distinct",
    max(df4.session_id).over(Window.partitionBy(df4.ip_address)))
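A possible follow-up step (a sketch, not part of the original snippet) to collapse the result to one row per IP address with its distinct session count:

df_sessions = df5.select('ip_address', 'session_count_distinct').distinct()
df_sessions.show(10, False)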
コード例 #15
0
# Time window, moving average
from pyspark.sql import functions as F
from pyspark.sql.window import Window

#function to calculate number of seconds from number of days
days = lambda i: i * 86400
df = spark.createDataFrame([(17, "2017-03-10"),
                            (15, "2017-03-10"),
                            (3, "2017-03-11"),
                            (13, "2017-03-12"),
                            (19, "2017-03-13"),
                            (25, "2017-03-14"),
                            (22, "2017-03-16")], ["dollars", "timestampGMT"])
df = df.withColumn('timestampGMT', df.timestampGMT.cast('timestamp'))
days = lambda i: i * 86400
#create window by casting timestamp to long (number of seconds)
# w = (Window.orderBy(F.col("add_dt_timestamp").cast('long')).rangeBetween(-days(3), 0)) # -days(3) looks back three days excluding the current day, so the window actually spans 4 days
w = (Window.orderBy(F.col("timestampGMT").cast('long')).rangeBetween(0, days(2))) # includes the current day, e.g. 03-10 plus the next two days through 03-12, so 3 days in total
df = df.withColumn('rolling_average', F.avg("dollars").over(w))
# Alternatively, aggregate by day first and then use rowsBetween

# group by time period (time-bucket grouping)
interval = 60*60*24*3  # 3 days
df.withColumn('time_interval', F.from_unixtime(F.floor(F.unix_timestamp(F.col('timestampGMT'), "yyyy-MM-dd") / interval) * interval)) \
    .groupBy('time_interval') \
    .agg(F.collect_list("id").alias("id_list")).show()

# group by time period (time-bucket grouping), approach 2
df = df.withColumn("date_index", F.dense_rank().over(Window.orderBy(F.col('timestampGMT'))))
date_bucket_func = udf(lambda x: x//3 if x%3 else x//3-1)
df = df.withColumn("date_bucket", date_bucket_func(F.col('date_index')))
コード例 #16
0
# raw.describe().show()

# COMMAND ----------

#The user and repository with metrics to be charted
# USER_FOR_METRICS = getArgument("User")
REPOSITORY_FOR_METRICS = getArgument("Repository")

# COMMAND ----------

from pyspark.sql import functions, Row, Column
raw = raw.withColumn(
    'closed',
    functions.when(raw.closed.isNull(),
                   functions.current_timestamp()).otherwise(raw.closed))
duration = functions.unix_timestamp(raw.closed) - functions.unix_timestamp(
    raw.created)

processed_data = raw.withColumn('age', duration)
# processed_data.show()
# print(processed_data.count())
# display(processed_data)

# COMMAND ----------

#Finds number of rows in which url (id) appears, generating count for number of review teams
df1 = raw.withColumn('exploded_review_team',
                     functions.explode(raw.review_teams))

num_teams = df1.groupBy(df1.url).count()
num_teams = num_teams.withColumnRenamed('count',
コード例 #17
0
tgtfilePathId = "s3://" + bucket + "/" + enriched_path + vendor + "/DataModels/riskrun/"
tgtfilePathHeader = "s3://" + bucket + "/" + enriched_path + vendor + "/DataModels/header/"
tgtfilePathDataLookup = "s3://" + bucket + "/" + enriched_path + vendor + "/DataModels/datalookup/"
tgtfilePathResult = "s3://" + bucket + "/" + enriched_path + vendor + "/DataModels/result/"

dfParquet = sparkSession.read.format("parquet").load(srcfilePath)

df = dfParquet.withColumn("year",sf.split("createdDate","\-")[0]) \
          .withColumn("month",sf.split("createdDate","\-")[1]) \
          .withColumn("day",sf.split((sf.split((sf.split("createdDate","\-")[2]),"T")[0])," ")[0])

dfbaseData = df.select([col for col in df.columns])

dfjoin = dfbaseData.join(dfrmi,(dfbaseData.loanApplicationId == dfrmi.loan_application_id) & \
                                    (sf.unix_timestamp(dfbaseData.createdDatePT) - sf.unix_timestamp(dfrmi.date_created) >= 0),'left_outer') \
                    .join(dfrmsNS,(dfrmi.id == dfrmsNS.input_id),'left_outer') \
                    .select(dfbaseData.id \
                    ,dfbaseData.mvpApplicantId \
                    ,dfbaseData.loanApplicationId \
                    ,dfbaseData.mvpLoanApplicationId \
                    ,sf.regexp_replace(dfbaseData.createdDate,"T"," ").cast(TimestampType()).alias("ewsStatusTimestampUTC") \
                    ,dfbaseData.createdDatePT.alias("ewsStatusTimestampPT") \
                    ,dfrmi.id.alias("rmiId") \
                    ,dfrmi.date_created.alias("riskTimestampPT") \
                    ,dfrmsNS.score_type.alias("scoreType") \
                    ,dfbaseData.year \
                    ,dfbaseData.month \
                    ,dfbaseData.day)

dfrankedId =  dfjoin.withColumn("row_num", sf.row_number().over(Window.partitionBy("loanApplicationId","ewsStatusTimestampPT").orderBy(sf.desc("riskTimestampPT"),sf.desc("scoreType")))) \
コード例 #18
0
    def loadParquet(self):

        self.log.info('Exception Handling starts')

        validSourceFormat = self.isValidFormatInSource()

        if not validSourceFormat:
            self.log.error("Product Source files not in csv format.")

        validSourceSchema = self.isValidSchemaInSource()
        if not validSourceSchema:
            self.log.error(
                "Product Source schema does not have all the required columns."
            )

        if not validSourceFormat or not validSourceSchema:
            self.log.info(
                "Copy the source files to data processing error path and return."
            )
            self.copyFile(
                self.productRQ4File, self.dataProcessingErrorPath + '/' +
                self.productRQ4Name + self.fileFormat)
            self.copyFile(
                self.couponsFile, self.dataProcessingErrorPath + '/' +
                self.couponsName + self.fileFormat)
            self.copyFile(
                self.productIdentifierFile, self.dataProcessingErrorPath +
                '/' + self.productIdentifierName + self.fileFormat)
            return

        self.log.info('Source format and schema validation successful.')
        self.log.info('Reading the input CSV files')

        dfProduct = self.sparkSession.read.format("com.databricks.spark.csv"). \
            option("encoding", "UTF-8"). \
            option("ignoreLeadingWhiteSpace", "true"). \
            option("ignoreTrailingWhiteSpace", "true"). \
            option("header", "true"). \
            option("treatEmptyValuesAsNulls", "true"). \
            option("inferSchema", "true"). \
            option("escape", '"'). \
            option("quote", "\""). \
            option("multiLine", "true"). \
            load(self.productRQ4FilePath).toDF(*self.productRQ4Header)
        dfProductIden = self.sparkSession.read.format("com.databricks.spark.csv"). \
            option("encoding", "UTF-8"). \
            option("ignoreLeadingWhiteSpace", "true"). \
            option("ignoreTrailingWhiteSpace", "true"). \
            option("header", "true"). \
            option("treatEmptyValuesAsNulls", "true"). \
            option("inferSchema", "true"). \
            option("escape", '"'). \
            option("quote", "\""). \
            option("multiLine", "true"). \
            load(self.productIdentifierFilePath).toDF(*self.productIdentifierHeader)

        dfCoupons = self.sparkSession.read.format("com.databricks.spark.csv"). \
            option("encoding", "UTF-8"). \
            option("ignoreLeadingWhiteSpace", "true"). \
            option("ignoreTrailingWhiteSpace", "true"). \
            option("header", "true"). \
            option("treatEmptyValuesAsNulls", "true"). \
            option("inferSchema", "true"). \
            option("escape", '"'). \
            option("quote", "\""). \
            option("multiLine", "true"). \
            load(self.couponsFilePath).toDF(*self.couponsHeader)

        dfProduct.coalesce(1).write.mode('overwrite').format('parquet').save(
            self.productRQ4OutputWorkingPath)

        dfCoupons.coalesce(1).write.mode('overwrite').format('parquet').save(
            self.couponsOutputWorkingPath)

        dfProductIden.coalesce(1).write.mode('overwrite').format(
            'parquet').save(self.productIdentifierOutputWorkingPath)

        dfProduct.coalesce(1).withColumn("year", year(from_unixtime(unix_timestamp()))).\
            withColumn("month", substring(from_unixtime(unix_timestamp()), 6, 2)).\
            write.mode('append').partitionBy('year', 'month').format('parquet').\
            save(self.productRQ4OutputPartitionPath)

        dfCoupons.coalesce(1).withColumn("year", year(from_unixtime(unix_timestamp()))).\
            withColumn("month", substring(from_unixtime(unix_timestamp()), 6, 2)).\
            write.mode('append').partitionBy('year', 'month').format('parquet').\
            save(self.couponsOutputPartitionPath)

        dfProductIden.coalesce(1).withColumn("year", year(from_unixtime(unix_timestamp()))).\
            withColumn("month", substring(from_unixtime(unix_timestamp()), 6, 2)).\
            write.mode('append').partitionBy('year', 'month').format('parquet').\
            save(self.productIdentifierOutputPartitionPath)

        self.sparkSession.stop()
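For clarity, the partition columns above come from the load time: from_unixtime(unix_timestamp()) returns the current time as a 'yyyy-MM-dd HH:mm:ss' string, so substring(..., 6, 2) extracts the month. A small standalone sketch, assuming an active SparkSession named spark:

from pyspark.sql.functions import from_unixtime, substring, unix_timestamp, year

spark.range(1).select(
    year(from_unixtime(unix_timestamp())).alias('year'),
    substring(from_unixtime(unix_timestamp()), 6, 2).alias('month')).show()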
コード例 #19
0
    spark = SparkSession.builder.appName(
        'sp500_agg_calculations').getOrCreate()

    # reading in the data
    data = spark.read.format('csv') \
                    .options(header='true', inferschema='false') \
                    .load(sys.argv[1])
    data = data.select('Symbol',
                        'Name',
                        'Date',
                        'Log Return') \
                .filter(col('Log Return') != '')
    data = data.withColumn(
        'Date',
        to_date(unix_timestamp(col('Date'), 'yyyy-MM-dd').cast('timestamp')))
    data = data.withColumn('Log Return',
                           col('Log Return').cast('float').alias('Log Return'))

    # cumulative log return sum
    cum_window = Window.partitionBy(data['Symbol']).orderBy(
        data['Date']).rangeBetween(Window.unboundedPreceding, 0)
    cum_sum = data.withColumn('CumulativeSum',
                              sum('Log Return').over(cum_window))
    cum_sum = cum_sum.orderBy(col('Symbol'), col('Date'))

    # total log return sum
    sum_window = Window.partitionBy('Symbol')
    sp_sum = cum_sum.withColumn('MaxDate', max('Date').over(sum_window)) \
                    .where(col('Date') == col('MaxDate')) \
                    .drop('MaxDate', 'Log Return', 'Date') \
コード例 #20
0
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    is_dev = True

    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second = long(today.strftime("%s"))
    print('today_id: ', today_second)

    #------------------------------------------------------------------------------------------------------------------#

    # ------------------------------------------------------------------------------------------------------------------#

    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_status_code, transformed_at):
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_status_code is not None:
            text_concat += str(student_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = f.udf(concaText, StringType())

    def convertStudentIdToLong(student_id):
        try:
            student_id_long = long(student_id)
            return student_id_long
        except:
            return 0L

    convertStudentIdToLong = f.udf(convertStudentIdToLong, LongType())
    # ------------------------------------------------------------------------------------------------------------------#

    ##################################################
    # Get the technical test data from the student_technical_test table
    dyf_datasourceTech = glueContext.create_dynamic_frame.from_catalog(
        database="dm_toa", table_name="student_technical_test_odin")

    # print('dyf_datasourceTech')
    # dyf_datasourceTech.printSchema()

    # Select the required fields
    dyf_datasourceTech = dyf_datasourceTech.select_fields([
        '_key', 'thoigianhenktkt', 'ketluan', 'emailhocvien', 'dauthoigian',
        'emailadvisor', 'nguoitiepnhan', 'trinhdohocvien'
    ])

    dyf_datasourceTech = dyf_datasourceTech.resolveChoice(
        specs=[('_key', 'cast:long')])

    if (dyf_datasourceTech.count() > 0):
        dyf_datasourceTech = Filter.apply(
            frame=dyf_datasourceTech,
            f=lambda x: x["emailhocvien"] is not None and x["emailhocvien"] !=
            '' and x["thoigianhenktkt"] is not None and x[
                "thoigianhenktkt"] != '' and x["ketluan"] == 'Pass')

        dyf_datasourceTech_numeber = dyf_datasourceTech.count()
        print("Count data 2:  ", dyf_datasourceTech_numeber)

        if dyf_datasourceTech_numeber < 1:
            return

        dy_datasourceTech = dyf_datasourceTech.toDF()

        dy_datasourceTech = dy_datasourceTech.limit(100)

        print('dy_datasourceTech')
        dy_datasourceTech.printSchema()
        dy_datasourceTech = dy_datasourceTech.withColumn(
            'thoigianhenktkt_id',
            f.unix_timestamp('thoigianhenktkt',
                             'yyyy-MM-dd HH:mm:ss').cast('long'))

        print('dy_datasourceTech__2')
        dy_datasourceTech.printSchema()

        # get the first activation time
        w2 = Window.partitionBy("emailhocvien").orderBy(
            f.col("thoigianhenktkt_id").desc())
        dy_datasourceTech = dy_datasourceTech.withColumn("row", f.row_number().over(w2)) \
            .where(f.col('row') <= 1) \

        print('dy_datasourceTech__3')
        dy_datasourceTech.printSchema()

        #--------------------------------------------------------------------------------------------------------------#
        dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
            database="tig_advisor", table_name="student_contact")

        # select the fields
        dyf_student_contact = dyf_student_contact.select_fields(
            ['_key', 'contact_id', 'student_id', 'user_name'])

        dyf_student_contact = Filter.apply(
            frame=dyf_student_contact,
            f=lambda x: x["contact_id"] is not None and x["contact_id"] != ''
            and x["student_id"] is not None and x["student_id"] != '' and x[
                "user_name"] is not None and x["user_name"] != '')

        dyf_student_contact_number = dyf_student_contact.count()
        print('dyf_student_contact_number::number: ',
              dyf_student_contact_number)
        if dyf_student_contact_number < 1:
            return

        dy_student_contact = dyf_student_contact.toDF()
        dy_student_contact.dropDuplicates(['student_id'])

        dy_join_teach_concat = dy_datasourceTech.join(
            dy_student_contact,
            dy_datasourceTech.emailhocvien == dy_student_contact.user_name)

        print('dyf_join_teach_concat::schema')
        dy_join_teach_concat.printSchema()

        join_teach_concat_number = dy_join_teach_concat.count()
        print('join_teach_concat_number::number: ', join_teach_concat_number)
        if join_teach_concat_number < 1:
            return

        #--------------------------------------------------------------------------------------------------------------#

        dyf_student_package_status = glueContext.create_dynamic_frame.from_catalog(
            database="od_student_behavior", table_name="student_status")

        dyf_student_package_status = dyf_student_package_status\
            .select_fields(['contact_id', 'status_code', 'start_date', 'end_date'])\
            .rename_field('contact_id', 'contact_id_ps')

        print('dyf_student_package_status::drop_duplicates')

        df_student_package_status = dyf_student_package_status.toDF()
        print('dyf_student_package_status::drop_duplicates::before: ',
              df_student_package_status.count())
        df_student_package_status = df_student_package_status.drop_duplicates()
        print('dyf_student_package_status::drop_duplicates::after: ',
              df_student_package_status.count())

        print('dy_student_package_status')
        df_student_package_status.printSchema()
        # --------------------------------------------------------------------------------------------------------------#
        dyf_student_package = glueContext.create_dynamic_frame.from_catalog(
            database="od_student_behavior", table_name="student_package")

        print('dyf_student_package__0')
        dyf_student_package.printSchema()


        dyf_student_package = dyf_student_package \
            .select_fields(['student_id', 'package_code', 'start_time', 'end_time'])\
            .rename_field('student_id', 'student_id_pk')

        # --------------------------------------------------------------------------------------------------------------#

        print('dyf_student_package__1')
        dyf_student_package.printSchema()

        dyf_student_package = dyf_student_package.resolveChoice(
            specs=[('start_time', 'cast:long'), ('end_time', 'cast:long')])

        print('dyf_student_package__2')
        dyf_student_package.printSchema()

        df_student_package = dyf_student_package.toDF()
        print('df_student_package::drop_duplicates::before: ',
              df_student_package.count())
        df_student_package = df_student_package.drop_duplicates()
        print('df_student_package::drop_duplicates::after: ',
              df_student_package.count())

        print('df_student_package')
        df_student_package.printSchema()
        df_student_package.show(3)

        df_student_package_number = df_student_package.count()
        print('df_student_package_number: ', df_student_package_number)

        # --------------------------------------------------------------------------------------------------------------#

        # --------------------------------------------------------------------------------------------------------------#

        dy_join_teach_concat_number = dy_join_teach_concat.count()
        print('dy_join_teach_concat_number: ', dy_join_teach_concat_number)

        join_result = dy_join_teach_concat\
            .join(df_student_package_status,
                 (dy_join_teach_concat.contact_id == df_student_package_status.contact_id_ps)
                 & (dy_join_teach_concat.thoigianhenktkt_id >= df_student_package_status.start_date)
                 & (dy_join_teach_concat.thoigianhenktkt_id < df_student_package_status.end_date),
                 'left'
                  )\
            .join(df_student_package,
                  (dy_join_teach_concat.student_id == df_student_package.student_id_pk)
                  &(dy_join_teach_concat.thoigianhenktkt_id >= df_student_package.start_time)
                  &(dy_join_teach_concat.thoigianhenktkt_id < df_student_package.end_time),
                  'left'
                  )

        print('join_result')
        join_result.printSchema()

        join_result_number = join_result.count()
        print('join_result_number: ', join_result_number)
        if join_result_number < 1:
            return
        join_result.show(3)

        student_id_unavailable = 0L
        package_endtime_unavailable = 99999999999L
        package_starttime_unavailable = 0L
        package_code_unavailable = 'UNAVAILABLE'
        student_level_code_unavailable = 'UNAVAILABLE'
        student_status_code_unavailable = 'UNAVAILABLE'
        measure1_unavailable = 0
        measure2_unavailable = 0
        measure3_unavailable = 0
        measure4_unavailable = float(0.0)

        # join_result = join_result.withColumnRenamed('student_id', 'student_id_a')

        join_result = join_result.select(
            join_result.thoigianhenktkt_id.alias('student_behavior_date'),
            f.lit(5L).alias('behavior_id'),
            'student_id',
            join_result.contact_id.alias('contact_id'),
            join_result.package_code.alias('package_code'),
            join_result.end_time.cast('long').alias('package_endtime'),
            join_result.start_time.cast('long').alias('package_starttime'),
            join_result.trinhdohocvien.cast('string').alias(
                'student_level_code'),
            join_result.status_code.cast('string').alias(
                'student_status_code'),
            f.lit(today_second).cast('long').alias('transformed_at'),
        )

        join_result = join_result.na.fill({
            'package_code':
            package_code_unavailable,
            'package_endtime':
            package_endtime_unavailable,
            'package_starttime':
            package_starttime_unavailable,
            'student_level_code':
            student_level_code_unavailable,
            'student_status_code':
            student_status_code_unavailable
        })

        print('join_result--1')
        join_result.printSchema()
        join_result.show(1)

        join_result = join_result.withColumn(
            'student_behavior_id',
            f.md5(
                concaText(join_result.student_behavior_date,
                          join_result.behavior_id, join_result.student_id,
                          join_result.contact_id, join_result.package_code,
                          join_result.package_endtime,
                          join_result.package_starttime,
                          join_result.student_level_code,
                          join_result.student_status_code,
                          join_result.transformed_at)))
        #
        print('join_result--2')
        join_result.printSchema()
        join_result.show(5)

        dyf_join_result = DynamicFrame.fromDF(join_result, glueContext,
                                              'dyf_join_result')

        dyf_join_result = Filter.apply(
            frame=dyf_join_result,
            f=lambda x: x["contact_id"] is not None and x["contact_id"] != '')

        apply_output = ApplyMapping.apply(
            frame=dyf_join_result,
            mappings=[("student_behavior_id", "string", "student_behavior_id",
                       "string"),
                      ("student_behavior_date", "long",
                       "student_behavior_date", "long"),
                      ("behavior_id", "long", "behavior_id", "long"),
                      ("student_id", "string", "student_id", "long"),
                      ("contact_id", "string", "contact_id", "string"),
                      ("package_code", "long", "package_code", "string"),
                      ("package_endtime", "long", "package_endtime", "long"),
                      ("package_starttime", "long", "package_starttime",
                       "long"),
                      ("student_level_code", "string", "student_level_code",
                       "string"),
                      ("student_status_code", "string", "student_status_code",
                       "string"),
                      ("transformed_at", "long", "transformed_at", "long")])

        dfy_output = ResolveChoice.apply(frame=apply_output,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
        #
        glueContext.write_dynamic_frame.from_options(
            frame=dfy_output,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_behavior",
                "partitionKeys": ["behavior_id"]
            },
            format="parquet")
コード例 #21
0
ファイル: teste_pyspark.py プロジェクト: docarmoj/TesteESS
# Entry point for creating a Spark session
spark = SparkSession.builder.appName("PySpark Test").getOrCreate()

# ==> Reading from a (local) CSV file without needing to define a schema
# ==> df = spark.read.csv(r'./resources/data.csv',header=True)

# Creating a DataFrame from the array values and the defined schema
df = spark.createDataFrame(data=data, schema=schema)

# Creating a new DataFrame that adds a new column,
# converting the String field to a date
# and dropping the original (string) date column
df2 = df.withColumn(
    'Atualizacao',
    F.to_date(
        F.unix_timestamp(
            F.col('Data_Atualizacao'),
            'yyyy-MM-dd').cast("timestamp"))).drop('Data_Atualizacao')

# Creating the final DataFrame by adding a new column
#  that ranks rows ordered by the update date,
#  partitioned by municipality and sorted
df_final = df2.withColumn(
    "Ordem Transacao",
    F.dense_rank().over(
        W.partitionBy("Municipio").orderBy("Atualizacao"))).orderBy(
            "Municipio", "Ordem Transacao")

# Displaying the result
df_final.show(truncate=False)
コード例 #22
0
        .getOrCreate()

## creating spark and hive context
sc = spark.sparkContext
sqlctxt = HiveContext(sc)

## reading data from the hive table
data = sqlctxt.sql(
    "SELECT nav_date,fund_id,client_name,datetime,date_identifier from h011gtcsandbox.xnd_pricing_data"
)
data = data.dropna()

## changing date column to required format
data = data.withColumn(
    'nav_date',
    f.from_unixtime(f.unix_timestamp('nav_date',
                                     'MM/dd/yyy')).alias('nav_date'))
data = data.withColumn('nav_date', data.nav_date.cast(DateType()))

## changing datetime to required format
data = data.withColumn(
    'datetime', f.from_unixtime(f.unix_timestamp('datetime',
                                                 'MM/dd/yy HH:mm')))
data = data.withColumn('datetime', data.datetime.cast(TimestampType()))

data = data.filter((data.nav_date >= '2017-10-02')
                   & (data.nav_date <= '2018-04-30'))

## collecting date for partitioning
date = data.select(f.max('nav_date')).collect()[0][0]
year = date.year
month = date.month
コード例 #23
0
def test_unix_timestamp(data_gen):
    assert_gpu_and_cpu_are_equal_collect(lambda spark: unary_op_df(
        spark, data_gen).select(f.unix_timestamp(f.col('a'))))
コード例 #24
0
                by flight) order by UTC'                                        )

    adsbTable = adsbTable.dropDuplicates(['flight', 'lon', 'lat'])

    # the next set of lines will handle the scenario where the same flight number from one airport arrives at another
    # airport, then departs to another airport we do not want to treat this as one flight, rather multiple flights, for
    # example, if we have flight ACA108 that comes in from Montreal to Toronto then leaves for Winnipeg, we will have
    # ACA108-0 and ACA108-1
    #
    # if there was just one flight number, then only -0 is appended

    # partition the flight by UTC
    window_flightNum_UTC = Window.partitionBy("flight").orderBy("UTC")

    # calculate the deltaTime between the previous and current coordinate for each flight
    adsbDf=adsbTable.withColumn("deltaTime", psf.unix_timestamp('UTC') - psf.lag(psf.unix_timestamp('UTC')). \
        over(window_flightNum_UTC))

    # create a trigger when the time elapse from the previous coordinates to the next is more than 1200 seconds
    # (20 minutes) note that this isn't foolproof, if there is gaps in your coverage that is 20 minutes or longer then
    # this will break that one flight into two separate flights, adjust the 1200 seconds below to suit your
    # requirements
    adsbDf1 = adsbDf.withColumn(
        "trigger",
        psf.when(adsbDf.deltaTime > 1200, 1).otherwise(0))

    # create the suffix for the flight by using the cumulative sum of the triggers for each flight
    adsbDf2 = adsbDf1.withColumn(
        "suffix",
        psf.sum(adsbDf1.trigger).over(window_flightNum_UTC))
コード例 #25
0
def filter_chart_events(spark, orig_chrtevents_file_path,
                        admissions_csv_file_path,
                        filtered_chrtevents_outfile_path):
    #TAKES ONLY THE RELEVANT ITEM ROWS FROM THE CHARTEVENTS.CSV file
    item_mappings = get_event_key_ids()

    #use subset of large CHARTEVENTS.csv file for faster development
    chrtevents_file_path_to_use = orig_chrtevents_file_path
    use_sample_subset_lines = False
    if use_sample_subset_lines:

        chartevents_sample_temp_file = "CHARTEVENTS_SAMPLE.csv"
        chrtevents_file_path_to_use = chartevents_sample_temp_file

        temp_file = open(chartevents_sample_temp_file, "w+")
        with open(orig_chrtevents_file_path) as orig_file:
            i = 0
            for line in orig_file:
                temp_file.write(line)
                i = i + 1
                if i > 500000:
                    break
        temp_file.close()

    # LOS ***
    #*********

    los_path = os.path.join(PATH_MIMIC_ORIGINAL_CSV_FILES, "ICUSTAYS.csv")
    df_los = spark.read.csv(los_path, header=True, inferSchema="false")

    #Filter out stays of less than one day in length
    df_los = df_los.filter(col('LOS') >= 1).select(['HADM_ID'])

    df_chartevents = spark.read.csv(chrtevents_file_path_to_use,
                                    header=True,
                                    inferSchema="false")

    filtered_chartevents = df_chartevents.filter(
        col('ITEMID').isin(list(item_mappings.keys())))
    filtered_chartevents = filtered_chartevents.withColumn(
        "ITEMNAME",
        translate(item_mappings)("ITEMID"))

    #join filtered_chartevents with ADMISSIONS.csv on HADMID --- only keep HADMID AND ADMITTIME COLUMNS FROM ADMISSIONS
    df_admissions = spark.read.csv(admissions_csv_file_path,
                                   header=True,
                                   inferSchema="false").select(
                                       'HADM_ID', 'ADMITTIME')

    #add column that contains the hour the observation occurred after admission  (0 - X)
    filtered_chartevents = filtered_chartevents.join(df_admissions,
                                                     ['HADM_ID'])
    timeFmt = "yyyy-MM-dd' 'HH:mm:ss"  #2153-09-03 07:15:00
    timeDiff = F.round(
        (F.unix_timestamp('CHARTTIME', format=timeFmt) -
         F.unix_timestamp('ADMITTIME', format=timeFmt)) / 60 / 60
    ).cast(
        'integer'
    )  #calc diff, convert seconds to minutes, minutes to hours, then math.floor to remove decimal places (for hourly bin/aggregations)
    filtered_chartevents = filtered_chartevents.withColumn(
        "HOUR_OF_OBS_AFTER_HADM", timeDiff)  #  F.round(   ).cast('integer')

    #filter out all observations where X > 48  (occurred after initial 48 hours of admission)
    filtered_chartevents = filtered_chartevents.filter(
        (col('HOUR_OF_OBS_AFTER_HADM') <= 48)
        & (col('HOUR_OF_OBS_AFTER_HADM') >= 0))

    filtered_chartevents = df_los.join(filtered_chartevents, ['HADM_ID'])

    #REMOVE columns that are not needed (keep CHARTEVENTS cols, ITEMNAME, HOUR_OF_OBS_AFTER_HADM)
    filtered_chartevents = reduce(DataFrame.drop, ['ADMITTIME'],
                                  filtered_chartevents)

    with open(filtered_chrtevents_outfile_path, "w+") as f:
        w = csv.DictWriter(f, fieldnames=filtered_chartevents.schema.names)
        w.writeheader()

        for rdd_row in filtered_chartevents.rdd.toLocalIterator():
            w.writerow(rdd_row.asDict())
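A hedged aside: the row-by-row CSV export above could typically be replaced by the distributed DataFrame writer, which writes a directory of part files rather than a single CSV:

    # filtered_chartevents.write.csv(filtered_chrtevents_outfile_path, header=True, mode='overwrite')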
コード例 #26
0
def main():
    """
     - Downloads outstanding data
     - Sets up Spark environment
     - Loads data
     - Summarises data
     - Merges data
     - Prepares data for modelling
    :return: None
    """
    # --- Download data (if its not already downloaded)
    if config.MODE == 'prod':
        # In production mode we want to download all the csv files
        # Datasets in develop are controlled by the user
        # Download and save taxi journey data
        download_data(config.TAXI_DATA_URLS, config.TAXI_DATA_DIR)
        # Download and save road traffic accident data
        download_data(config.ACCIDENT_DATA_URLS, config.ACCIDENT_DATA_DIR)

    # --- Set up Spark environment
    spark = SparkSession.builder.appName('Basics').getOrCreate()

    # --- Load data
    # Load and parse taxi data, add an id column
    taxi_df = load_data(spark,
                        data_dir=config.TAXI_DATA_DIR,
                        schema=config.TAXI_DATA_SCHEMA)\
        .withColumn(config.TAXI_ID_COL, monotonically_increasing_id())

    logging.info(
        f'Selecting a random {config.SAMPLE_RATE * 100}% of data before merging'
    )
    # Get a random 1% of data with random seed=1
    splits = taxi_df.randomSplit([1 - config.SAMPLE_RATE, config.SAMPLE_RATE],
                                 seed=1)
    taxi_df = splits[1]

    # Load and parse accident data, add an id column and a timestamp column
    accident_df = load_data(spark,
                            data_dir=config.ACCIDENT_DATA_DIR,
                            schema=config.ACCIDENT_DATA_SCHEMA)\
        .withColumn(config.ACCIDENT_ID_COL, monotonically_increasing_id())\
        .withColumn('accident_timestamp', unix_timestamp('date', 'MM/dd/yyyy').cast('timestamp'))

    # --- Summarise data
    # Plot and save data summary (if its not already saved)

    # plot_summary(taxi_df, 'pickup_datetime', config.TAXI_ID_COL, config.TAXI_VOLUME_PLOT_FILE)
    # plot_summary(accident_df, 'accident_timestamp', config.ACCIDENT_ID_COL, config.ACCIDENT_VOLUME_PLOT_FILE)

    # --- Create ML features
    # Merge nearby accidents with taxi trips (this is a very long running process)
    df = merge_accidents(taxi_df, accident_df)

    # Create day of week, hour of day values from time stamp
    df = timestamps_to_features(df, 'pickup_datetime')
    df = timestamps_to_features(df, 'dropoff_datetime')

    # --- Log the results of the data preparation stages
    logging.info('Data preparation complete')
    # RDD has a countApprox() function that is quicker than .count(), unsure if the conversion from DataFrame to RDD
    # cancels out the savings
    logging.info(f'Number of rows: {df.rdd.countApprox(10)}')
    logging.info(f'Data schema: \n{df._jdf.schema().treeString()}')

    # TODO extract all this to function(s)
    # --- Train a gradient boosted trees model to predict the duration of a trip
    # Create the target variable ('label' is a special column name in MLlib)
    logging.info(
        'Creating label column (based on the delta between dropoff_datetime and pickup_datetime)'
    )
    df = df.withColumn(
        'label',
        unix_timestamp(df['dropoff_datetime']) -
        unix_timestamp(df['pickup_datetime']))

    logging.info('Dropping rows that contain null (for a subset of columns)')
    # Drop any samples with a NULL value
    df = df.na.drop(
        how="any",
        # note: list.extend() returns None, so the original subset silently evaluated to None; concatenate with + instead
        subset=config.TAXI_DATA_SCHEMA.fieldNames() + [
            'pickup_datetime_day_of_week', 'pickup_datetime_hour_of_day',
            'pickup_datetime_month_of_year', 'dropoff_datetime_day_of_week',
            'dropoff_datetime_hour_of_day', 'dropoff_datetime_month_of_year'
        ])

    logging.info('Filling any remaining null values with 99999')
    # Filling na values with code 99999
    df = df.na.fill(value=99999)

    logging.info('Spliting records into training and testing sets')
    # Split the data into training and test sets (30% held out for testing)
    (training_df, testing_df) = df.randomSplit([0.7, 0.3])

    # Define an "VectorAssembler", which joins multiple columns into a single vector
    ignore = [
        'label', 'pickup_datetime', 'dropoff_datetime', config.ACCIDENT_ID_COL,
        config.TAXI_ID_COL
    ]
    assembler = VectorAssembler(
        inputCols=[col for col in df.columns if col not in ignore],
        outputCol='features')
    # Transform the data using the defined assembler
    logging.info('Transforming data using VectorAssembler')
    df = assembler.transform(df)

    # Define a "VectorIndexer", which converts categorical fields (those with at most 20 unique values,
    # the default maxCategories) into category indices
    feature_indexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures").fit(
                                        df.select('features'))

    # Train a GBT model
    gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=10)

    # Chain assembler, indexer and GBT in a Pipeline
    pipeline = Pipeline(stages=[assembler, feature_indexer, gbt])

    logging.info('Running model pipeline')
    # Train model.  This also runs the indexer.
    model_pipeline = pipeline.fit(training_df)

    logging.info('Making model predictions')
    # Make predictions
    predictions = model_pipeline.transform(testing_df)

    # Select example rows to display
    predictions.select("prediction", "label", "features").show(5)

    logging.info('Evaluating predictions')
    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    logging.info(f'Root Mean Squared Error (RMSE) on test data: {rmse}')

    # TODO extract this to a function
    # Save the outputs (predictions and model)

    import uuid

    run_id = uuid.uuid4()

    date = datetime.date.today()
    model_pipeline.save(
        f'{config.MODEL_DIR}/{run_id}/model-{config.MODE}-{date}')
    predictions.select("prediction", "label").write.csv(
        f'{config.PREDICTIONS_DIR}/{run_id}/predictions-{config.MODE}-{date}.csv'
    )

    # Print some cool stuff about the model
    gbt_model = model_pipeline.stages[2]
    logging.info(gbt_model)
    attrs = sorted((attr["idx"], attr["name"]) for attr in (chain(
        *df.schema["features"].metadata["ml_attr"]["attrs"].values())))

    for idx, name in attrs:
        if gbt_model.featureImportances[idx]:
            print(name, gbt_model.featureImportances[idx])
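The TODO above suggests factoring the output-saving step into a function. A minimal sketch under that assumption; `save_outputs` is a name introduced here for illustration and only reuses the `config.MODEL_DIR`, `config.PREDICTIONS_DIR` and `config.MODE` attributes already referenced in the snippet:

import datetime
import uuid


def save_outputs(model_pipeline, predictions, config):
    """Persist the fitted pipeline and its predictions under a fresh run id (sketch)."""
    run_id = uuid.uuid4()
    date = datetime.date.today()
    # Save the whole fitted PipelineModel (assembler + indexer + GBT)
    model_pipeline.save(f'{config.MODEL_DIR}/{run_id}/model-{config.MODE}-{date}')
    # Save (prediction, label) pairs for later inspection
    predictions.select('prediction', 'label').write.csv(
        f'{config.PREDICTIONS_DIR}/{run_id}/predictions-{config.MODE}-{date}.csv')
    return run_id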
コード例 #27
0
ファイル: ETL_Module.py プロジェクト: zemingyu/flight
def txtToPq_v2(inputFolder, pqFolder, pqFileName, searchString = "*.txt", append = True):
    """
    Read in all txt files in a folder, convert to parquet, and either append parquet or create new parquet
	This version is compatible with some of the v1.1 files inside s3://flight.price.11
	Main difference: leg1 is renamed to leg1	
    @params:
        inputFolder   - Required  : input folder that contains json line txt files (Str)
        pqFolder      - Required  : folder to save the parquet files into (Str)
        pqFileName    - Required  : parquet file name (Str)
        searchString  - Optional  : search string that identifies all the json line text files (Str)
        append        - Optional  : append to existing parquet or create new parquet (Bool)
    """
    
    flightv1_1 = spark.read.json(os.path.join(inputFolder, searchString))
    
    flightv1_1_2 = (flightv1_1.withColumn('trip', col('trip').cast('string'))
                            .withColumn('stayDays', correct_stay_days_UDF(col('trip'), col('stayDays')))                    
                            .withColumn('depDate', to_date('depDate'))
                            .withColumn('searchDate', to_date('searchDate'))
                            .selectExpr('*', 'date_add(depDate, stayDays) as retDate')# this is when the return trip starts, might arrive a day later
                            .withColumn('airline_code', flightv1_1.leg1.carrierSummary.airlineCodes.getItem(0))                   
                            .withColumn('airline_codes', flightv1_1.leg1.carrierSummary.airlineCodes)                    
                            .withColumn('airline_codes_leg2', flightv1_1.leg2.carrierSummary.airlineCodes)                    
                            .withColumn('departureTime', flightv1_1.leg1.departureTime.isoStr)
                            .withColumn('departureTime_leg2', flightv1_1.leg2.departureTime.isoStr)
                            .withColumn('arrivalTime', flightv1_1.leg1.arrivalTime.isoStr)
                            .withColumn('arrivalTime_leg2', flightv1_1.leg2.arrivalTime.isoStr)
        #                 .withColumn('check_bag_inc', flightv1_1.leg1.arrivalTime)
                            .withColumn('airlineName', flightv1_1.leg1.carrierSummary.airlineName)
                            .withColumn('airlineName_leg2', flightv1_1.leg2.carrierSummary.airlineName)
                            .withColumn('duration_m', (F.unix_timestamp('arrivalTime', format=timeFmt) - 
                                                       F.unix_timestamp('departureTime', format=timeFmt))/60)                    
                        .withColumn('duration_m_leg2', (F.unix_timestamp('arrivalTime_leg2', format=timeFmt) - 
                                                       F.unix_timestamp('departureTime_leg2', format=timeFmt))/60)                    
        #                     .withColumn('duration', flightv1_1.timeline1.getItem(1).duration)
                        .withColumn('airlineCode', flightv1_1.timeline1.getItem(0).carrier.airlineCode)
                        .withColumn('flightNumber', flightv1_1.timeline1.getItem(0).carrier.flightNumber.cast('string'))                
                        .select('*', F.concat(col('airlineCode'), col('flightNumber')).alias('flight_code'))
                        .drop('airlineCode', 'flightNumber')
                        .withColumn('plane', flightv1_1.timeline1.getItem(0).carrier.plane)                
                        .withColumn('stops', flightv1_1.leg1.stops.cast('byte'))                                
                        .withColumn('stops_leg2', flightv1_1.leg2.stops.cast('byte'))                

        #                 .withColumn('stop_list', flightv1_1.leg1.stop_list)# need to do more work                
                        .withColumn('stop_airport', take_all_level1_str(flightv1_1.leg1.stop_list, lit('airport')))                                               
                        .withColumn('stop_duration', take_all_level1_str(flightv1_1.leg1.stop_list, lit('duration')))                                               

        #                 .withColumn('stop_list_leg2', flightv1_1.leg2.stop_list)               
                        .withColumn('stop_airport_leg2', take_all_level1_str(flightv1_1.leg2.stop_list, lit('airport')))                                               
                        .withColumn('stop_duration_leg2', take_all_level1_str(flightv1_1.leg2.stop_list, lit('duration')))                                               


                        .withColumn('noOfTicketsLeft', correct_tickets_left_UDF(flightv1_1.leg1.carrierSummary.noOfTicketsLeft))
                        .withColumn('noOfTicketsLeft', col('noOfTicketsLeft').cast('byte'))                
                        .withColumn('noOfTicketsLeft_leg2', correct_tickets_left_UDF(flightv1_1.leg2.carrierSummary.noOfTicketsLeft))
                        .withColumn('noOfTicketsLeft_leg2', col('noOfTicketsLeft_leg2').cast('byte'))
                        .withColumn('fromCityAirportCode', flightv1_1.leg1.departureLocation.airportCode)                
                        .withColumn('toCityAirportCode', flightv1_1.leg1.arrivalLocation.airportCode)
                        .withColumn('fromCityAirportCode_leg2', flightv1_1.leg2.departureLocation.airportCode)
                        .withColumn('toCityAirportCode_leg2', flightv1_1.leg2.arrivalLocation.airportCode)

                        # carrier leg 1
                        .withColumn('carrierAirProviderId', flightv1_1.leg1.carrierSummary.airProviderId)
                        .withColumn('carrierAirlineImageFileName', flightv1_1.leg1.carrierSummary.airlineImageFileName)
                        .withColumn('carrierMixedCabinClass', flightv1_1.leg1.carrierSummary.mixedCabinClass)
                        .withColumn('carrierMultiStop', flightv1_1.leg1.carrierSummary.multiStop)
                        .withColumn('carrierNextDayArrival', flightv1_1.leg1.carrierSummary.nextDayArrival)

                        # carrier leg 2
                        .withColumn('carrierAirProviderId_leg2', flightv1_1.leg2.carrierSummary.airProviderId)
                        .withColumn('carrierAirlineImageFileName_leg2', flightv1_1.leg2.carrierSummary.airlineImageFileName)
                        .withColumn('carrierMixedCabinClass_leg2', flightv1_1.leg2.carrierSummary.mixedCabinClass)
                        .withColumn('carrierMultiStop_leg2', flightv1_1.leg2.carrierSummary.multiStop)
                        .withColumn('carrierNextDayArrival_leg2', flightv1_1.leg2.carrierSummary.nextDayArrival)

                        ### Leg 1
                        ## Leg 1 departure
        #                 .withColumn('timeline_departureAirport', take_all_airport(flightv1_1.timeline1, lit('departureAirport')))                               
                        .withColumn('timeline_departureAirport_cityState', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('airportCityState')))
                        .withColumn('timeline_departureAirport_city', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('city')))
                        .withColumn('timeline_departureAirport_code', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('code')))
                        .withColumn('timeline_departureAirport_localName', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('localName')))
                        .withColumn('timeline_departureAirport_longName', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('longName')))
                        .withColumn('timeline_departureAirport_name', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('name')))

                        .withColumn('timeline_departureTime', take_all_level2_str(flightv1_1.timeline1, lit('departureTime'), lit('isoStr')))



                        ## Leg 1 arrival
                        .withColumn('timeline_arrivalAirport_cityState', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('airportCityState')))
                        .withColumn('timeline_arrivalAirport_city', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('city')))
                        .withColumn('timeline_arrivalAirport_code', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('code')))
                        .withColumn('timeline_arrivalAirport_localName', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('localName')))
                        .withColumn('timeline_arrivalAirport_longName', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('longName')))
                        .withColumn('timeline_arrivalAirport_name', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('name')))                

                        .withColumn('timeline_arrivalTime', take_all_level2_str(flightv1_1.timeline1, lit('arrivalTime'), lit('isoStr')))

                        # distance
                        .withColumn('timeline_distance', take_all_level2_str(flightv1_1.timeline1, lit('distance'), lit('formattedTotal')))

                        # carrier
                        .withColumn('timeline_plane', take_all_level2_str(flightv1_1.timeline1, lit('carrier'), lit('plane')))

                        # brandedFareName
                        .withColumn('timeline_brandedFareName', take_all_level1_str(flightv1_1.timeline1, lit('brandedFareName')))                               

                        # type
                        .withColumn('timeline_type', take_all_level1_str(flightv1_1.timeline1, lit('type')))                               

                        ### Leg 2
                        ## Leg 2 departure
                        .withColumn('timeline_departureAirport_cityState_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('airportCityState')))
                        .withColumn('timeline_departureAirport_city_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('city')))
                        .withColumn('timeline_departureAirport_code_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('code')))
                        .withColumn('timeline_departureAirport_localName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('localName')))
                        .withColumn('timeline_departureAirport_longName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('longName')))
                        .withColumn('timeline_departureAirport_name_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('name')))

                        .withColumn('timeline_departureTime_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureTime'), lit('isoStr')))                


                        ## Leg 2 arrival
                        .withColumn('timeline_arrivalAirport_cityState_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('airportCityState')))
                        .withColumn('timeline_arrivalAirport_city_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('city')))
                        .withColumn('timeline_arrivalAirport_code_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('code')))
                        .withColumn('timeline_arrivalAirport_localName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('localName')))
                        .withColumn('timeline_arrivalAirport_longName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('longName')))
                        .withColumn('timeline_arrivalAirport_name_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('name')))                

                        .withColumn('timeline_arrivalTime_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalTime'), lit('isoStr')))

                        # distance
                        .withColumn('timeline_distance_leg2', take_all_level2_str(flightv1_1.timeline2, lit('distance'), lit('formattedTotal')))

                        # carrier
                        .withColumn('timeline_plane_leg2', take_all_level2_str(flightv1_1.timeline2, lit('carrier'), lit('plane')))

                        # brandedFareName
                        .withColumn('timeline_brandedFareName_leg2', take_all_level1_str(flightv1_1.timeline2, lit('brandedFareName')))                           

                        # type
                        .withColumn('timeline_type_leg2', take_all_level1_str(flightv1_1.timeline2, lit('type')))                               

                        # create variables dropped from v1.0
                        .withColumn('span_days', lit(99))
                        .withColumn('power', lit(False))
                        .withColumn('video', lit(False))
                        .withColumn('wifi', lit(False))
                        .withColumn('stop_info', col('stop_airport')) #placeholder. can't figure out how to create struct literal


                        .select('price', 'version', 'searchDate', 'tableName', 'task_id', 'currencyCode', 
                                'fromCity', 'toCity', 'trip', 'depDate', 'retDate',
                                'stayDays', 
                               'departureTime', 'arrivalTime', 'departureTime_leg2', 'arrivalTime_leg2',
                                'airlineName', 'airlineName_leg2', 'duration_m', 'duration_m_leg2',                
                                'flight_code', 'plane', 'stops', 'stops_leg2', 'stop_airport', 'stop_duration', 'stop_airport_leg2', 'stop_duration_leg2',
                                'noOfTicketsLeft', 'noOfTicketsLeft_leg2',
                               'airline_code', 'airline_codes', 'airline_codes_leg2', 
                                'fromCityAirportCode', 'toCityAirportCode', 'fromCityAirportCode_leg2', 'toCityAirportCode_leg2',
                               'carrierAirProviderId', 'carrierAirlineImageFileName', 'carrierMixedCabinClass', 'carrierMultiStop', 'carrierNextDayArrival',
                                'carrierAirProviderId_leg2', 'carrierAirlineImageFileName_leg2', 'carrierMixedCabinClass_leg2', 'carrierMultiStop_leg2', 'carrierNextDayArrival_leg2',
                                #'url',

                                ## leg 1
                                # departure
                                'timeline_departureAirport_cityState', 'timeline_departureAirport_city', 'timeline_departureAirport_code', 'timeline_departureAirport_localName', 
                                'timeline_departureAirport_longName', 'timeline_departureAirport_name',

                                'timeline_departureTime',

                                # arrival
                                'timeline_arrivalAirport_cityState', 'timeline_arrivalAirport_city', 'timeline_arrivalAirport_code', 'timeline_arrivalAirport_localName', 
                                'timeline_arrivalAirport_longName', 'timeline_arrivalAirport_name',

                                'timeline_arrivalTime',

                                'timeline_distance',
                                'timeline_plane',
                                'timeline_brandedFareName',
                                'timeline_type',

                                ## leg 2                        
                                # departure
                                'timeline_departureAirport_cityState_leg2', 'timeline_departureAirport_city_leg2', 'timeline_departureAirport_code_leg2', 'timeline_departureAirport_localName_leg2', 
                                'timeline_departureAirport_longName_leg2', 'timeline_departureAirport_name_leg2',

                                'timeline_departureTime_leg2',

                                # arrival
                                'timeline_arrivalAirport_cityState_leg2', 'timeline_arrivalAirport_city_leg2', 'timeline_arrivalAirport_code_leg2', 'timeline_arrivalAirport_localName_leg2', 
                                'timeline_arrivalAirport_longName_leg2', 'timeline_arrivalAirport_name_leg2',

                                'timeline_arrivalTime_leg2',

                                'timeline_distance_leg2',
                                'timeline_plane_leg2',
                                'timeline_brandedFareName_leg2',
                                'timeline_type_leg2',

                                # variables dropped from v1.0
                                'span_days', 'power', 'video', 'wifi', 'stop_info'
                               )                
                       )


    if append:
        flightv1_1_2.repartition(1).write.mode('append').parquet(os.path.join(pqFolder, pqFileName))        
    else:
        flightv1_1_2.repartition(1).write.parquet(os.path.join(pqFolder, pqFileName))   
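A usage sketch for the function above; the folder and file names are placeholders, not paths from the original project:

# Append every json-line txt file found under inputFolder to an existing parquet dataset.
txtToPq_v2(inputFolder='/data/flight/raw_v1_1',
           pqFolder='/data/flight/parquet',
           pqFileName='flight_v1_1.parquet',
           searchString='*.txt',
           append=True)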
コード例 #28
0
spark._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
spark._jsc.hadoopConfiguration().set("parquet.enable.summary-metadata", "false")

## Read in the data from S3 with an explicit schema
schema = StructType() \
  .add('source', StringType()) \
  .add('type', StringType()) \
  .add('data', StringType()) \
  .add('ts', StringType())

src = spark.read.load('s3://demoanalyticsapp-output/*/*/*/*/', format='parquet', schema=schema)

## Build out some new columns with data extracted from the JSON string

df = src \
  .withColumn('n_ts', F.unix_timestamp('ts', "yyyy-MM-dd'T'hh:mm:ss").cast('timestamp')) \
  .withColumn('agerange', \
    F.from_json( \
      F.get_json_object('data', '$.facedetails[*].agerange'), \
      StructType().add('high', IntegerType()).add('low', IntegerType()) \
    ) \
  ) \
  .withColumn('smile', \
    F.from_json( \
      F.get_json_object('data', '$.facedetails[*].smile'), \
      StructType().add('confidence', DoubleType()).add('value', BooleanType()) \
    ) \
  ) \
  .withColumn('eyeglasses', \
    F.from_json( \
      F.get_json_object('data', '$.facedetails[*].eyeglasses'), \
コード例 #29
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--lang",
                        "-l",
                        default="enwiki",
                        type=str,
                        help="language to parse (en or enwiki)")

    parser.add_argument(
        "--start",
        "-t1",
        default=None,
        type=str,
        help=
        "start day to parse [inclusive] (YYYY-MM-DD); default: previous day - 7days"
    )

    parser.add_argument(
        "--end",
        "-t2",
        default=None,
        type=str,
        help="end day to parse [exclusive] (YYYY-MM-DD); default: current day")

    args = parser.parse_args()
    lang = args.lang.replace('wiki', '')
    wiki_db = lang + 'wiki'

    t1 = args.start
    t2 = args.end
    if t1 != None and t2 != None:
        try:
            date_start = datetime.datetime.strptime(t1, '%Y-%m-%d')
            date_end = datetime.datetime.strptime(t2, '%Y-%m-%d')
        except ValueError:
            print('Provide correct day-format YYYY-MM-DD')
            raise
    else:

        date_start = datetime.date.today() - datetime.timedelta(days=8)
        date_end = datetime.date.today()

    date_start_str = date_start.strftime('%Y-%m-%d')
    date_end_str = date_end.strftime('%Y-%m-%d')

    #### other parameters
    ## filter pageviews from actor with more than 500 pageviews
    ## the aim is to filter automated traffic that is not tagged as spider
    n_p_max = 500  ## maximum number of pageviews/user/day
    n_p_min = 1  ## minimum number of pageviews/user/day

    ## filtering sessions
    dt = 3600  ## cutoff for splitting sessions (interevent time between 2 pageviews)
    nlen_min = 2  ## min length of session
    nlen_max = 30  ## max length of session

    ## sessions will be saved locally in filename_save
    path_save = os.path.abspath('../data/%s/' % lang)
    # filename_save = '%s.reading-sessions-%s--%s'%(lang,date_start_str,date_end_str)
    filename_save = '%s.reading-sessions' % (lang)

    ## tmp-directory for data on hive (will be deleted)
    base_dir_hdfs = '/tmp/mwaddlink/sessions'

    ### start
    spark = SparkSession.builder\
        .master('yarn')\
        .appName('reading-sessions')\
        .enableHiveSupport()\
        .getOrCreate()

    ########
    ## query
    ################################################
    ## time-window
    ts_start = calendar.timegm(date_start.timetuple())
    ts_end = calendar.timegm(date_end.timetuple())
    row_timestamp = F.unix_timestamp(
        F.concat(F.col('year'), F.lit('-'), F.col('month'), F.lit('-'),
                 F.col('day'), F.lit(' '), F.col('hour'), F.lit(':00:00')))

    ## window for counting pageviews per actor per day
    w_p = Window.partitionBy(F.col('actor_signature_per_project_family'),
                             F.col('year'), F.col('month'), F.col('day'))

    ### actor table (filtered webrequests)
    ## https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Traffic/Pageview_actor
    df_actor = (
        spark.read.table('wmf.pageview_actor').where(
            row_timestamp >= ts_start).where(row_timestamp < ts_end).where(
                F.col('is_pageview') == True)
        ## agent-type user to filter spiders
        ## https://meta.wikimedia.org/wiki/Research:Page_view/Tags#Spider
        .where(F.col('agent_type') == "user")
        ## user: desktop/mobile/mobile app; isaac filters != mobile app
        .where(F.col('access_method') != "mobile app")
        ## only wikis
        .where(F.col('normalized_host.project_family') == 'wikipedia')
        ## only namespace 0
        .where(F.col('namespace_id') == 0).withColumn(
            'wiki_db', F.concat(F.col('normalized_host.project'),
                                F.lit('wiki'))))
    ## filter only specific wiki (or all if wiki_db=='wikidata')
    if wiki_db == 'wikidata':
        pass
    else:
        df_actor = df_actor.where(F.col('wiki_db') == wiki_db)

    ## checkpoint for inspecting table
    # df_actor.limit(10).write.mode('overwrite').parquet('/user/mgerlach/sessions/test.parquet')

    # filter maximum and minimum pageviews per user
    # n_p is the number of pageviews per actor per day (across projects)
    df_actor = (df_actor.withColumn(
        'n_p',
        F.sum(F.lit(1)).over(w_p)).where(F.col('n_p') >= n_p_min).where(
            F.col('n_p') <= n_p_max))

    ## join the wikidata-item to each pageview
    ## we keep only pageviews for which we have a corresponding wikidata-item id

    ## table with mapping wikidata-ids to page-ids
    ## partition wikidb and page-id ordered by snapshot
    w_wd = Window.partitionBy(F.col('wiki_db'), F.col('page_id')).orderBy(
        F.col('snapshot').desc())
    df_wd = (
        spark.read.table('wmf.wikidata_item_page_link')
        ## snapshot: this is a partition!
        .where(
            F.col('snapshot') >=
            '2020-07-01')  ## resolve issues with non-matching wikidata-items
        ## only wikis (enwiki, ... not: wikisource)
        .where(F.col('wiki_db').endswith('wiki')))
    ## filter only specific wiki (or all if wiki_db=='wikidata')
    if wiki_db == 'wikidata':
        pass
    else:
        df_wd = df_wd.where(F.col('wiki_db') == wiki_db)
    ## get the most recent wikidata-item for each pid+wikidb
    df_wd = (df_wd.withColumn(
        'item_id_latest',
        F.first(F.col('item_id')).over(w_wd)).select(
            'wiki_db', 'page_id',
            F.col('item_id_latest').alias('item_id')).drop_duplicates())
    df_actor_wd = (df_actor.join(df_wd, on=['page_id', 'wiki_db'],
                                 how='inner'))

    ## aggregate all pageviews with same actor-signature across wikis to get sessions
    df_actor_wd_agg = (
        df_actor_wd.groupby('actor_signature_per_project_family').agg(
            F.first(F.col('access_method')).alias(
                'access_method'),  ## this could change along a session
            F.first(F.col('geocoded_data')).alias('geocoded_data'),
            #              F.first(F.col('n_p_by_user')).alias('session_length'),
            F.array_sort(
                F.collect_list(
                    F.struct(
                        F.col('ts'),
                        F.col('page_id'),
                        F.col('pageview_info.page_title').alias('page_title'),
                        F.col('wiki_db'),
                        F.col('item_id').alias('qid'),
                    ))).alias('session')))
    # df_actor_wd_agg.limit(10).write.mode('overwrite').parquet('/user/mgerlach/sessions/test.parquet')

    ## apply filter to the sessions
    try:
        os.mkdir(path_save)
    except FileExistsError:
        pass
    PATH_TMP = os.path.join(path_save, 'tmp')
    try:
        os.mkdir(PATH_TMP)
    except FileExistsError:
        pass

    ## hdfs-storing, some temporary files which will be deleted later
    output_hdfs_dir = os.path.join(base_dir_hdfs, filename_save)
    os.system('hadoop fs -rm -r %s' % output_hdfs_dir)
    ## local storing
    base_dir_local = path_save
    output_local_dir_tmp = os.path.join(base_dir_local, 'tmp', filename_save)
    output_local_file = os.path.join(base_dir_local, filename_save)

    ## load data
    # requests = spark.read.load(filename).rdd.map(lambda x: x['session'])
    requests = df_actor_wd_agg.rdd.map(lambda x: x['session'])
    ## keep only pageviews from a language
    requests = requests.map(lambda rs: [r for r in rs if r['page_id'] != None])
    to_str = lambda x: ' '.join([str(e['page_id']) for e in x])

    (requests.map(parse_requests).filter(
        filter_blacklist_qid)  ## remove main_page
     .filter(lambda x: len(x) >= nlen_min
             )  ## only sessions with at least length nlen_min
     .map(filter_consecutive_articles
          )  ## remove consecutive calls to same article
     .filter(lambda x: len(x) >= nlen_min
             )  ## only sessions with at least length nlen_min
     .flatMap(lambda x: sessionize(x, dt=dt)
              )  ## break sessions if interevent time is too large
     .filter(lambda x: len(x) >= nlen_min
             )  ## only sessions with at least length nlen_min
     .filter(lambda x: len(x) <= nlen_max
             )  ## only sessions with at most length nlen_max
     .map(to_str)  ## concatenate session as single string
     ## write to hdfs
     .saveAsTextFile(
         output_hdfs_dir,
         compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec"))

    ## copy to local (set of tmp-dirs)
    os.system('hadoop fs -copyToLocal %s %s' %
              (output_hdfs_dir, output_local_dir_tmp))
    ## concatenate and unzip into single file
    os.system('cat %s/* | gunzip > %s' %
              (output_local_dir_tmp, output_local_file))
    # ## remove set of tmp-dirs
    os.system('rm -rf %s' % output_local_dir_tmp)
    # ## remove hadoop data
    os.system('hadoop fs -rm -r %s' % output_hdfs_dir)

    print('Path to reading sessions: %s' % filename_save)
    return filename_save
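# The sessionize() helper used in the flatMap above is defined elsewhere in the
# project and is not shown in this snippet. A minimal sketch of what such a function
# might look like, assuming each pageview struct carries a datetime 'ts' field and
# the list is already time-ordered (array_sort above orders the collected structs by ts):
def sessionize(pageviews, dt=3600):
    """Split one actor's time-ordered pageviews into sessions whenever the gap
    between two consecutive pageviews exceeds dt seconds (illustrative sketch)."""
    sessions = []
    current = []
    for pv in pageviews:
        if current and (pv['ts'] - current[-1]['ts']).total_seconds() > dt:
            sessions.append(current)
            current = []
        current.append(pv)
    if current:
        sessions.append(current)
    return sessions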
def accident_fact_dimension():

    #Read in all the required csv files
    sc_2005 = spark.read.csv(
        '/home/workspace/uk-data/accidents_2005_to_2007.csv',
        header=True,
        sep=",")
    sc_2011 = spark.read.csv(
        '/home/workspace/uk-data/accidents_2009_to_2011.csv',
        header=True,
        sep=",")
    sc_2014 = spark.read.csv(
        '/home/workspace/uk-data/accidents_2012_to_2014.csv',
        header=True,
        sep=",")
    sc_LAD = spark.read.csv('/home/workspace/uk-data/LAD.csv',
                            header=True,
                            sep=",")
    sc_LAD3 = spark.read.csv('/home/workspace/uk-data/LAD3.csv',
                             header=True,
                             sep=",")
    sc_LAD4 = spark.read.csv('/home/workspace/uk-data/LAD4.csv',
                             header=True,
                             sep=",")

    #Combine the accident data
    def unionAll(*dfs):
        return reduce(DataFrame.unionAll, dfs)

    sc_accidents = unionAll(sc_2005, sc_2011, sc_2014)

    ##Convert Columns to their respective data types
    int_cols = ['Location_Easting_OSGR', 'Location_Northing_OSGR', 'Police_Force', 'Accident_Severity', \
                'Number_of_Vehicles', 'Number_of_Casualties', 'Urban_or_Rural_Area', '1st_Road_Number', '2nd_Road_Number']

    float_cols = ['Longitude', 'Latitude']

    for col_name in int_cols:
        sc_accidents = sc_accidents.withColumn(col_name,
                                               F.col(col_name).cast('int'))

    for col_name in float_cols:
        sc_accidents = sc_accidents.withColumn(col_name,
                                               F.col(col_name).cast('float'))

    #copy the dataframe into another one
    sc_uk = sc_accidents.select("*")

    #### DROP COLUMNS WITH MORE THAN 10,000 NULL VALUES #####
    ##### AND AFTER THAT REMOVE NULL VALUES FROM THE DF #####
    drop_items = []
    for col in sc_uk.columns:
        null_length = sc_uk.select(col).filter(F.col(col).isNull()).count()
        none_length = sc_uk.select(col).filter(F.col(col) == 'None').count()
        if (null_length >= 10000) or (none_length >= 10000):
            #print(col, ':', null_length, '  ', none_length)
            drop_items.append(col)

    sc_uk = sc_uk.drop(*drop_items)
    sc_uk = sc_uk.na.drop()

    #create a new column called date_time
    sc_uk = sc_uk.withColumn(
        'date_time', F.concat(F.col("Date"), F.lit(" "), F.col("Time")))

    #Drop rows where accidents share the same time and position (they are duplicates)
    sc_uk = sc_uk.dropDuplicates(['date_time', 'Longitude', 'Latitude'])

    #### CREATE LOCATION DIMENSION TABLE ####
    #Accident Index -- as the primary key for the location table
    loc_table = sc_uk.select('Accident_Index', 'Location_Easting_OSGR',
                             'Location_Northing_OSGR', 'Longitude',
                             'Latitude').distinct()
    loc_table.show(10)

    ##### CREATE TIME DIMENSION TABLE ######
    #this requires some feature engineering
    sc_uk = sc_uk.drop(*['Date', 'Time'])
    sc_uk = sc_uk.withColumn(
        'date_time',
        F.from_unixtime(F.unix_timestamp("date_time",
                                         "dd/MM/yy HH:mm")).cast('timestamp'))
    sc_uk = sc_uk.withColumn('Day_of_Week', dayofweek('date_time'))
    sc_uk = sc_uk.withColumn('Year', year('Year'))
    sc_uk = sc_uk.withColumn('Month', month(to_date("date_time",
                                                    "MM/dd/yyyy")))
    sc_uk = sc_uk.withColumn('week_of_year', weekofyear(to_date('date_time')))
    sc_uk = sc_uk.withColumn(
        "week_of_month", date_format(to_date("date_time", "dd/MMM/yyyy"), "W"))

    time_table = sc_uk.select('date_time', 'Day_of_Week', 'week_of_year',
                              'week_of_month', 'Month',
                              'Year').dropDuplicates()
    ##Time Table
    time_table.show(10)

    ###### CREATE LOCAL AUTHORITY DIMENSION TABLE ######
    sc_uk = sc_uk.withColumnRenamed("Local_Authority_(Highway)", "LAD_Code")
    sc_LAD = sc_LAD.withColumnRenamed("Supergroup name","Geographical Area") \
    .withColumnRenamed("Code","LAD_Code")
    #join uk accident DF with the LAD DF
    sc_uk = sc_uk.join(sc_LAD, ["LAD_Code"], how='inner')
    sc_LAD2 = sc_LAD3.join(sc_LAD4, ["Sub-Group"], how='inner')
    sc_LAD2 = sc_LAD2.withColumnRenamed("ONS Code", "LAD_Code")
    sc_uk = sc_uk.join(sc_LAD2, ["LAD_Code"], how='inner')
    sc_uk = sc_uk.drop('Name')

    #LAD table
    LAD_table = sc_uk.select(
        "LAD_Code",
        F.col('Sub-Group').alias('SubGroup'),
        F.col('Highway Authority').alias('HighwayAuthority'),
        F.col('Sub-Group Description').alias('SubGroup_Description'),
        F.col('Super-Group').alias('SuperGroup'),
        F.col('Network Density Range').alias('Network_Density_Range'),
        F.col('Urban Road Range').alias('Urban_Road_Range'),
        F.col('Region/Country').alias('Region_Country'),
        F.col('Geographical Area').alias('Geographical_Area')).distinct()
    LAD_table.show(10)

    #### CREATE CONDITION DIMENSION TABLE#####
    #consist of 'Light_Conditions', 'Weather_Conditions', 'Road_Surface_Conditions'

    ##Change "Other" value in Weather Condition table to "Unknown"-
    sc_uk = sc_uk.withColumn(
        "Weather_Conditions",
        regexp_replace("Weather_Conditions", "Other", "Unknown"))

    #create a column known as conditions that consist of all the conditions
    sc_uk = sc_uk.withColumn('conditions', F.concat(F.col('Light_Conditions'), \
                                                      F.lit(" and "), F.col('Weather_Conditions'),
                                                      F.lit(" and "), F.col('Road_Surface_Conditions')))

    #Extract values from columns
    def extract_values(df, column):
        values = df.select(column).distinct().collect()
        values = [x[0] for x in values]
        return values

    values = extract_values(sc_uk, 'conditions')

    #this function will be used to create primary keys for the dimension tables
    def new_column(column):
        if column in values:
            return values.index(column) + 1

    udfValue = udf(new_column, Int())

    sc_uk = sc_uk.withColumn('Condition_Index', udfValue('conditions'))

    #condition table
    condition_table = sc_uk.select("Condition_Index", 'Weather_Conditions',
                                   'Light_Conditions',
                                   'Road_Surface_Conditions').distinct()

    condition_table.sort('Condition_Index').show(10)

    #### CREATE ROAD DIMENSION TABLE #####
    #'1st_Road_Class', 'Road_Type', '2nd_Road_Class'
    sc_uk = sc_uk.withColumn(
        'road_classification',
        F.concat(F.col('1st_Road_Class'), F.lit(" and "), F.col('Road_Type'),
                 F.lit(" and "), F.col('2nd_Road_Class')))
    values = extract_values(sc_uk, 'road_classification')
    udfValue = udf(new_column, Int())
    sc_uk = sc_uk.withColumn('Road_Index', udfValue('road_classification'))

    #road table
    road_table = sc_uk.select('Road_Index', '1st_Road_Class', 'Road_Type',
                              '2nd_Road_Class').distinct()
    road_table.sort('Road_Index').show(5)

    #### CREATE PEDESTRIAN DIMENSION TABLE #####
    # Pedestrian_Crossing-Human_Control, Pedestrian_Crossing-Physical_Facilities
    sc_uk = sc_uk.withColumn(
        'pedestrian',
        F.concat(F.col('Pedestrian_Crossing-Human_Control'), F.lit(" and "),
                 F.col('Pedestrian_Crossing-Physical_Facilities')))
    values = extract_values(sc_uk, 'pedestrian')
    udfValue = udf(new_column, Int())
    sc_uk = sc_uk.withColumn('Pedestrian_Index', udfValue('pedestrian'))

    #pedestrian table
    pedestrian_table = sc_uk.select(
        'Pedestrian_Index', 'Pedestrian_Crossing-Human_Control',
        'Pedestrian_Crossing-Physical_Facilities').distinct()
    pedestrian_table.sort('Pedestrian_Index').show(10)

    #drop the features used to create ids for the dimension table
    drop_list = ['conditions', 'road_classification', 'pedestrian']
    sc_uk = sc_uk.drop(*drop_list)

    ##### CREATE ACCIDENT TABLE --- FACT TABLE #####
    #ACCIDENT INDEX -- PRIMARY KEY FOR LOCATION TABLE
    #LAD_CODE -- PRIMARY KEY FOR LOCAL AUTHORITY DISTRICT TABLE
    #DATE TIME -- PRIMARY KEY FOR TIME TABLE
    #CONDITION INDEX -- PRIMARY KEY FOR CONDITIONS TABLE
    #ROAD INDEX -- PRIMARY KEY FOR ROAD TABLE
    #PEDESTRIAN INDEX -- PRIMARY KEY FOR PEDESTRIAN TABLE

    accident_fact_table = sc_uk.select(
        'Accident_Index', 'LAD_Code', 'date_time', 'Year', 'Police_Force',
        'Accident_Severity', 'Number_of_Vehicles', 'Number_of_Casualties',
        '1st_Road_Number', 'Speed_limit', '2nd_Road_Number',
        'Urban_or_Rural_Area', 'Did_Police_Officer_Attend_Scene_of_Accident',
        'Condition_Index', 'Road_Index', 'Pedestrian_Index')

    #write the tables into csv format
    loc_table.toPandas().to_csv('/home/workspace/uk-accident/loc_table.csv',
                                index=False,
                                header=None)
    time_table.toPandas().to_csv('/home/workspace/uk-accident/time_table.csv',
                                 index=False,
                                 header=None)
    LAD_table.toPandas().to_csv('/home/workspace/uk-accident/LAD_table.csv',
                                index=False,
                                header=None)
    condition_table.toPandas().to_csv(
        '/home/workspace/uk-accident/condition_table.csv',
        index=False,
        header=None)
    pedestrian_table.toPandas().to_csv(
        '/home/workspace/uk-accident/pedestrian_table.csv',
        index=False,
        header=None)
    road_table.toPandas().to_csv('/home/workspace/uk-accident/rd_table.csv',
                                 index=False,
                                 header=None)
    accident_fact_table.toPandas().to_csv(
        '/home/workspace/uk-accident/accident_fact_table.csv',
        index=False,
        header=None)
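The `extract_values`/`new_column` pattern above collects every distinct value to the driver and assigns indices through a Python UDF. A sketch of an alternative that keeps the key assignment in Spark, using `row_number` over the distinct combinations and a join back; `add_surrogate_key` is an illustrative helper, not part of the original job:

from pyspark.sql import functions as F
from pyspark.sql.window import Window


def add_surrogate_key(df, cols, key_name):
    # Number the distinct combinations 1..N (an unpartitioned window pulls the
    # distinct rows onto one partition, which is fine for small dimension tables)
    keys = (df.select(*cols).distinct()
              .withColumn(key_name, F.row_number().over(Window.orderBy(*cols))))
    # Join the generated key back onto the main dataframe
    return df.join(keys, on=cols, how='left')


# e.g. sc_uk = add_surrogate_key(
#     sc_uk,
#     ['Light_Conditions', 'Weather_Conditions', 'Road_Surface_Conditions'],
#     'Condition_Index')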
コード例 #31
0
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    # get dynamic frame source

    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second =  long(today.strftime("%s"))
    print('today_id: ', today_second)
    #------------------------------------------------------------------------------------------------------------------#

    def getSolanBaoLuu(solan_baoluu, songay_baoluu):
        if solan_baoluu is None:
            solan_baoluu = 0
        if songay_baoluu is None:
            songay_baoluu = 0
        if solan_baoluu > songay_baoluu:
            return songay_baoluu
        return solan_baoluu

    getSolanBaoLuu = udf(getSolanBaoLuu, LongType())


    def getSoNgayBaoLuu(solan_baoluu, songay_baoluu):
        if solan_baoluu is None:
            solan_baoluu = 0
        if songay_baoluu is None:
            songay_baoluu = 0
        if songay_baoluu > solan_baoluu:
            return songay_baoluu
        return solan_baoluu

    getSoNgayBaoLuu = udf(getSoNgayBaoLuu, LongType())



    def getContactId(code, contact_id_advisor):
        if code is not None:
            return code
        return contact_id_advisor

    getContactId = udf(getContactId, StringType())


    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                package_code, package_endtime,package_starttime,
                student_level_code, student_status_code, transformed_at):
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is  not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_status_code is not None:
            text_concat += str(student_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = udf(concaText, StringType())

    # ------------------------------------------------------------------------------------------------------------------#

    #------------------------------------------------------------------------------------------------------------------#
    dyf_poss_ghinhan_hp = glueContext.create_dynamic_frame.from_catalog(database='poss',
                                                                            table_name='ghinhan_hp')

    dyf_poss_ghinhan_hp = dyf_poss_ghinhan_hp.select_fields(['_key', 'id',
                                                             'ngay_thanhtoan', 'so_tien',
                                                             'khoa_hoc_makh', 'trang_thai'])
    dyf_poss_ghinhan_hp = dyf_poss_ghinhan_hp.resolveChoice(specs=[('_key', 'cast:long')])


    try:
        df_flag = spark.read.parquet("s3a://dtsodin/flag/student_behavior/sb1_dong_tien.parquet")
        read_from_index = df_flag.collect()[0]['flag']
        print('read from index: ', read_from_index)
        dyf_poss_ghinhan_hp = Filter.apply(frame=dyf_poss_ghinhan_hp,
                                      f=lambda x: x["_key"] > read_from_index)
    except:
        print('read flag file error ')

    dyf_poss_ghinhan_hp_number = dyf_poss_ghinhan_hp.count()
    print('dyf_poss_ghinhan_hp_number: ', dyf_poss_ghinhan_hp_number)
    if dyf_poss_ghinhan_hp_number < 1:
        return

    #-------------------------------------------------------------------------------------------------------------------#
    dyf_poss_khoa_hoc = glueContext.create_dynamic_frame.from_catalog(database='poss',
                                                                        table_name='khoa_hoc')

    dyf_poss_khoa_hoc = dyf_poss_khoa_hoc.select_fields(['makh', 'mahv',  'goi_sanpham_id', 'trang_thai'])

    # -------------------------------------------------------------------------------------------------------------------#
    dyf_poss_hoc_vien = glueContext.create_dynamic_frame.from_catalog(database='poss',
                                                                      table_name='hoc_vien')

    dyf_poss_hoc_vien = dyf_poss_hoc_vien.select_fields(['mahv', 'crm_id', 'trang_thai']).rename_field(
        'mahv', 'mahv_hv')

    # -------------------------------------------------------------------------------------------------------------------#
    dyf_poss_goi_sanpham = glueContext.create_dynamic_frame.from_catalog(database='poss',
                                                                      table_name='goi_sanpham')

    dyf_poss_goi_sanpham = dyf_poss_goi_sanpham.select_fields(['ma', 'id', 'solan_baoluu',
                                                               'songay_baoluu', 'trang_thai'])

    # -------------------------------------------------------------------------------------------------------------------#

    # -------------------------------------------------------------------------------------------------------------------#
    dyf_crm_goi_contacts = glueContext.create_dynamic_frame.from_catalog(database='crm_native',
                                                                         table_name='contacts')

    # print('dyf_crm_goi_contacts::full')
    #     # dyf_crm_goi_contacts.printSchema()


    dyf_crm_goi_contacts = dyf_crm_goi_contacts.select_fields(['Code']).rename_field('Code', 'code')
    dyf_crm_goi_contacts = Filter.apply(frame=dyf_crm_goi_contacts,
                                                  f=lambda x: x["code"] is not None and x["code"] != '')
    dy_crm_goi_contacts = dyf_crm_goi_contacts.toDF()
    dy_crm_goi_contacts = dy_crm_goi_contacts.dropDuplicates()
    # print('dy_crm_goi_contacts')
    # dy_crm_goi_contacts.printSchema()

    # -------------------------------------------------------------------------------------------------------------------#

    dyf_advisor_student_contact = glueContext.create_dynamic_frame.from_catalog(database='tig_advisor',
                                                                         table_name='student_contact')

    dyf_advisor_student_contact = dyf_advisor_student_contact.select_fields(['student_id', 'contact_id'])
    dyf_advisor_student_contact = Filter.apply(frame=dyf_advisor_student_contact,
                                        f=lambda x: x["student_id"] is not None and x["student_id"] != ''
                                               and x["contact_id"] is not None and x["contact_id"] != '')\
                                        .rename_field('student_id', 'student_id_advisor')\
                                        .rename_field('contact_id', 'contact_id_advisor')

    dy_advisor_student_contact = dyf_advisor_student_contact.toDF()
    dy_advisor_student_contact = dy_advisor_student_contact.dropDuplicates(['student_id_advisor'])

    # print('dy_advisor_student_contact')
    # dy_advisor_student_contact.printSchema()

    # -------------------------------------------------------------------------------------------------------------------#

    # print('dyf_poss_ghinhan_hp')
    # dyf_poss_ghinhan_hp.printSchema()
    #
    # print('dyf_poss_khoa_hoc')
    # dyf_poss_khoa_hoc.printSchema()
    #
    # print('dyf_poss_hoc_vien')
    # dyf_poss_hoc_vien.printSchema()
    #
    # print('dyf_poss_goi_sanpham')
    # dyf_poss_goi_sanpham.printSchema()

    dy_poss_ghinhan_hp = dyf_poss_ghinhan_hp.toDF()
    dy_poss_ghinhan_hp = dy_poss_ghinhan_hp.dropDuplicates(['id'])

    dy_poss_khoa_hoc = dyf_poss_khoa_hoc.toDF()
    dy_poss_khoa_hoc = dy_poss_khoa_hoc.dropDuplicates(['makh', 'mahv'])

    dy_poss_hoc_vien = dyf_poss_hoc_vien.toDF()
    dy_poss_hoc_vien = dy_poss_hoc_vien.dropDuplicates(['mahv_hv'])


    dy_poss_goi_sanpham = dyf_poss_goi_sanpham.toDF()
    dy_poss_hoc_vien = dy_poss_hoc_vien.dropDuplicates(['crm_id'])

    poss_ghinhan_hp_number = dy_poss_ghinhan_hp.count()
    # print('poss_ghinhan_hp_number: ', poss_ghinhan_hp_number)

    if poss_ghinhan_hp_number < 1:
        return

    df_dong_tien = dy_poss_ghinhan_hp.join(dy_poss_khoa_hoc,
                                           dy_poss_ghinhan_hp.khoa_hoc_makh == dy_poss_khoa_hoc.makh, 'left')\
        .join(dy_poss_hoc_vien, dy_poss_hoc_vien.mahv_hv == dy_poss_khoa_hoc.mahv, 'left')\
        .join(dy_poss_goi_sanpham, dy_poss_goi_sanpham.id == dy_poss_khoa_hoc.goi_sanpham_id, 'left')

    df_dong_tien = df_dong_tien.select('ngay_thanhtoan', 'ma', 'crm_id',
                                       'so_tien',
                                       getSolanBaoLuu(df_dong_tien['solan_baoluu'], df_dong_tien['songay_baoluu'])
                                            .alias('solan_baoluu_t'),
                                       getSoNgayBaoLuu(df_dong_tien['solan_baoluu'], df_dong_tien['songay_baoluu'])
                                            .alias('songay_baoluu_t'))

    # print('df_dong_tien')
    # df_dong_tien.printSchema()

    #check lms_id and contact_id

    df_dong_tien_student = df_dong_tien.join(dy_crm_goi_contacts, df_dong_tien.crm_id == dy_crm_goi_contacts.code, 'left')\
        .join(dy_advisor_student_contact, df_dong_tien.crm_id == dy_advisor_student_contact.student_id_advisor, 'left')


    # print('df_dong_tien_student-----')
    # df_dong_tien_student.printSchema()

    df_dong_tien_student = df_dong_tien_student.filter(df_dong_tien_student.code.isNotNull()
                                                       | (df_dong_tien_student.contact_id_advisor.isNotNull()))

    df_dong_tien_student = df_dong_tien_student.limit(100)

    student_id_unavailable = 0L
    package_endtime_unavailable = 0L
    package_starttime_unavailable = 0L
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'
    measure1_unavailable = 0
    measure2_unavailable = 0
    measure3_unavailable = 0
    measure4_unavailable = float(0.0)


    df_dong_tien_student = df_dong_tien_student.select(
        f.unix_timestamp(df_dong_tien_student.ngay_thanhtoan, 'yyyy-MM-dd').alias('student_behavior_date'),
        f.lit(1L).alias('behavior_id'),
        f.lit(student_id_unavailable).cast('long').alias('student_id'),
        getContactId(df_dong_tien_student.code, df_dong_tien_student.contact_id_advisor).alias('contact_id'),

        df_dong_tien_student.ma.alias('package_code'),
        f.lit(package_endtime_unavailable).cast('long').alias('package_endtime'),
        f.lit(package_starttime_unavailable).cast('long').alias('package_starttime'),

        f.lit(student_level_code_unavailable).cast('string').alias('student_level_code'),
        f.lit(student_status_code_unavailable).cast('string').alias('student_status_code'),

        f.lit(today_second).alias('transformed_at'),

        'so_tien',
        'solan_baoluu_t',
        'songay_baoluu_t',

        f.lit(measure4_unavailable).alias('measure4')

    )


    print('df_dong_tien_student--1')
    df_dong_tien_student.printSchema()
    df_dong_tien_student.show(1)

    df_dong_tien_student2 = df_dong_tien_student.withColumn('student_behavior_id',
                                    f.md5(concaText(df_dong_tien_student.student_behavior_date,
                                             df_dong_tien_student.behavior_id,
                                             df_dong_tien_student.student_id,
                                             df_dong_tien_student.contact_id,
                                             df_dong_tien_student.package_code,
                                             df_dong_tien_student.package_endtime,
                                             df_dong_tien_student.package_starttime,
                                             df_dong_tien_student.student_level_code,
                                             df_dong_tien_student.student_status_code,
                                             df_dong_tien_student.transformed_at)))

    print('df_dong_tien_student2--2')
    df_dong_tien_student2.printSchema()
    df_dong_tien_student2.show(5)

    dyf_dong_tien_student = DynamicFrame.fromDF(df_dong_tien_student2, glueContext, 'dyf_dong_tien_student')

    dyf_dong_tien_student = Filter.apply(frame=dyf_dong_tien_student,
                                        f=lambda x: x["contact_id"] is not None and x["contact_id"] != '')

    apply_ouput = ApplyMapping.apply(frame=dyf_dong_tien_student,
                                     mappings=[("student_behavior_id", "string", "student_behavior_id", "string"),
                                               ("student_behavior_date", "long", "student_behavior_date", "long"),
                                               ("behavior_id", "long", "behavior_id", "long"),
                                               ("student_id", "long", "student_id", "long"),
                                               ("contact_id", "string", "contact_id", "string"),

                                               ("package_code", "long", "package_code", "string"),
                                               ("package_endtime", "long", "package_endtime", "long"),
                                               ("package_starttime", "long", "package_starttime", "long"),

                                               ("student_level_code", "string", "student_level_code", "string"),
                                               ("student_status_code", "string", "student_status_code", "string"),

                                               ("transformed_at", "long", "transformed_at", "long")
                                               ])

    dfy_output = ResolveChoice.apply(frame=apply_ouput, choice="make_cols", transformation_ctx="resolvechoice2")

    glueContext.write_dynamic_frame.from_options(
        frame=dfy_output,
        connection_type="s3",
        connection_options={"path": "s3://dtsodin/student_behavior/student_behavior",
                            "partitionKeys": ["behavior_id"]},
        format="parquet")



    apply_general = ApplyMapping.apply(frame=dyf_dong_tien_student,
                                     mappings=[("student_behavior_id", "string", "student_behavior_id", "string"),
                                               ("so_tien", "double", "measure1", "float"),
                                               ("solan_baoluu_t", "long", "measure2", "float"),
                                               ("songay_baoluu_t", "long", "measure3", "float"),
                                               ("measure4", "float", "measure4", "float"),
                                               ("behavior_id", "long", "behavior_id", "long")
                                               ])

    dfy_output2 = ResolveChoice.apply(frame=apply_general, choice="make_cols", transformation_ctx="resolvechoice2")


    print('dfy_output2::')
    dfy_output2.show(5)

    glueContext.write_dynamic_frame.from_options(
        frame=dfy_output2,
        connection_type="s3",
        connection_options={"path": "s3://dtsodin/student_behavior/student_general_behavior",
                            "partitionKeys": ["behavior_id"]},
        format="parquet")


    flag = dy_poss_ghinhan_hp.agg({"_key": "max"}).collect()[0][0]
    flag_data = [flag]
    df = spark.createDataFrame(flag_data, "long").toDF('flag')
    df.write.parquet("s3a://dtsodin/flag/student_behavior/sb1_dong_tien.parquet", mode="overwrite")
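The `concaText` UDF above stringifies and concatenates the key fields before hashing them with `md5`. The same deterministic id can usually be built with Spark built-ins such as `concat_ws` (which skips nulls, mirroring the UDF's None checks) plus `md5`; a sketch, with `hashed_id` as an illustrative name:

from pyspark.sql import functions as f


def hashed_id(*col_names):
    # concat_ws('') joins the non-null values with no separator, then md5 hashes the result
    return f.md5(f.concat_ws('', *[f.col(c).cast('string') for c in col_names]))

# e.g.
# df_dong_tien_student2 = df_dong_tien_student.withColumn(
#     'student_behavior_id',
#     hashed_id('student_behavior_date', 'behavior_id', 'student_id', 'contact_id',
#               'package_code', 'package_endtime', 'package_starttime',
#               'student_level_code', 'student_status_code', 'transformed_at'))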
コード例 #32
0
# COMMAND ----------

dfNew.select("CallType").distinct().show(20, False)

# COMMAND ----------

display(
    dfNew.select("CallType").groupBy("CallType").count().orderBy(
        "count", ascending=False))

# COMMAND ----------

from pyspark.sql.functions import unix_timestamp

# COMMAND ----------

pattern1 = 'MM/dd/yyyy'
pattern2 = 'MM/dd/yyyy hh:mm:ss a'
to_pattern1 = 'yyyy/MM/dd'


display( dfNew.withColumn("newCol", unix_timestamp( dfNew["CallDate"], pattern1 ).cast("timestamp")  ).drop("CallDate")\
        .withColumn("newCol2", unix_timestamp( dfNew["WatchDate"], pattern1 ).cast("timestamp")  ).drop("WatchDate")\
  .withColumn("newCol3", unix_timestamp( dfNew['ReceivedDtTm'], pattern2).cast("timestamp")) )

# COMMAND ----------

dfNew.select('ReceivedDtTm').show()

# COMMAND ----------
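Once the date columns are parsed to timestamps as above, intervals can be computed by differencing `unix_timestamp` values; a small sketch reusing the same patterns (the `received_delay_s` column name is illustrative):

dfParsed = dfNew \
    .withColumn("call_ts", unix_timestamp(dfNew["CallDate"], pattern1).cast("timestamp")) \
    .withColumn("received_ts", unix_timestamp(dfNew["ReceivedDtTm"], pattern2).cast("timestamp"))

# Seconds between the call date (midnight) and the time the call was received
dfParsed = dfParsed.withColumn(
    "received_delay_s",
    unix_timestamp("received_ts") - unix_timestamp("call_ts"))

dfParsed.select("CallDate", "ReceivedDtTm", "received_delay_s").show(5)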
コード例 #33
0
            "Accept NULL hypothesis - No significant difference between groups."
        )


#-----------------------------------------------------------------------------------------------------------------------

#[1] Telecommunication data ingestion
cdr = spark.read.option("delimiter", "\t")\
  .csv('inputs/telecom/november/*.txt')\
  .toDF('square_id', 'time_interval', 'country_code', 'SMS_in', 'SMS_out', 'Call_in', 'Call_out', 'internet_traffic')
cdr = cdr.withColumn(
    'time_istant',
    func.from_unixtime(cdr.time_interval / 1000, 'yyyy-MM-dd HH:mm:ss:SSS"'))
cdr = cdr.withColumn(
    'time_istant',
    func.unix_timestamp(cdr.time_istant,
                        'yyyy-MM-dd HH:mm:ss:SSS"').cast('timestamp'))
cdr = cdr.withColumn('weekday', func.dayofweek(cdr['time_istant']))
cdr = cdr.withColumn('time_istant', func.date_trunc('hour', 'time_istant'))
cdr = cdr.withColumn(
    "time_istant",
    func.from_unixtime(func.unix_timestamp(cdr.time_istant),
                       "yyyy/MM/dd HH:mm"))
cdr = cdr.withColumn('SMS_in', cdr['SMS_in'].cast(FloatType()))
cdr = cdr.withColumn('SMS_out', cdr['SMS_out'].cast(FloatType()))
cdr = cdr.withColumn('Call_in', cdr['Call_in'].cast(FloatType()))
cdr = cdr.withColumn('Call_out', cdr['Call_out'].cast(FloatType()))
cdr= cdr.groupBy(cdr.time_istant,cdr.weekday)\
  .agg(func.sum("SMS_out").alias("smsInUscita"),func.sum("SMS_in").alias("smsInEntrata"),func.sum("Call_in").alias("chiamateInEntrata"),func.sum("Call_out").alias("chiamateInUscita"),func.sum("internet_traffic").alias("internet"))
cdr = cdr.withColumn('SMS_total', cdr['smsInUscita'] + cdr['smsInEntrata'])
cdr = cdr.withColumn('Call_total',
                     cdr['chiamateInEntrata'] + cdr['chiamateInUscita'])
コード例 #34
0
def spark_data_flow():
    get_region_label_udf = fun.udf(get_region_label, tp.StringType())
    get_region_relation_label_udf = fun.udf(partial(lambda r: r, 'BELONG'),
                                            tp.StringType())

    # intermediate city-level data
    prd_basic_df = spark.read.parquet("{path}/prd_basic_df/{version}".format(
        path=TMP_PATH, version=RELATION_VERSION))

    # region mapping table
    mapping_df = spark.read.csv('{path}/{file_name}'.format(
        path=IN_PATH, file_name=FILE_NAME),
                                sep='\t',
                                header=True).dropDuplicates(['company_county'])

    # Get the unique ID of each region node (separate IDs for province, city and county)
    region_id_df = mapping_df.where(mapping_df.county.isNotNull()).select(
        'county', 'company_county').union(
            mapping_df.where(mapping_df.county.isNull()).where(
                mapping_df.city.isNotNull()).select(
                    'city', 'company_county')).union(
                        mapping_df.where(mapping_df.county.isNull()).where(
                            mapping_df.city.isNull()).where(
                                mapping_df.province.isNotNull()).select(
                                    'province',
                                    'company_county')).withColumnRenamed(
                                        'county', 'region').cache()

    # Distribution of company counts by region
    raw_region_df = prd_basic_df.select('company_county').groupBy(
        'company_county').count().withColumnRenamed('count', 'company_num')

    # Distribution by province (append a column to mapping_df with the total company count of each region)
    tmp_region_df = mapping_df.join(raw_region_df, 'company_county',
                                    'left_outer').fillna(0).dropDuplicates(
                                        ['company_county']).cache()

    # Total company count for each province, city and county
    tmp_region_2_df = tmp_region_df.groupBy('province').agg({
        'company_num':
        'sum'
    }).union(tmp_region_df.groupBy('city').agg({'company_num': 'sum'})).union(
        tmp_region_df.groupBy('county').agg({'company_num': 'sum'}))

    # Output nodes (region code, company count, region name, node create time, node update time, label)
    prd_region_node_df = tmp_region_2_df.join(
        region_id_df, tmp_region_2_df.province == region_id_df.region).select(
            region_id_df.company_county.alias('region_code:ID'),
            fun.when(tmp_region_2_df['sum(company_num)'].isNotNull(),
                     tmp_region_2_df['sum(company_num)']).otherwise(0).alias(
                         'company_num:int'), region_id_df.region.alias('name'),
            fun.unix_timestamp().alias('create_time:long'),
            fun.unix_timestamp().alias('update_time:long'),
            get_region_label_udf().alias(':LABEL')).dropDuplicates(
                ['region_code:ID']).cache()
    '''
    Relationships between region nodes and other nodes
    '''

    # County-company relationships
    prd_region_edge_1_df = prd_basic_df.join(
        mapping_df.where(mapping_df.county.isNotNull()).select(
            'county', 'company_county'), 'company_county').select(
                prd_basic_df.bbd_qyxx_id.alias(':START_ID'),
                prd_basic_df.company_county.alias(':END_ID'),
                fun.unix_timestamp().alias('create_time:long'),
                get_region_relation_label_udf().alias(':TYPE'))

    # County-city relationships
    prd_region_edge_2_df = mapping_df.select(
        'city', 'county',
        'company_county').where(mapping_df.county.isNotNull()).join(
            prd_region_node_df,
            prd_region_node_df.name == mapping_df.city).select(
                mapping_df.company_county.alias(':START_ID'),
                prd_region_node_df['region_code:ID'].alias(':END_ID'),
                fun.unix_timestamp().alias('create_time:long'),
                get_region_relation_label_udf().alias(':TYPE'))

    # City-province relationships
    prd_region_edge_3_df = mapping_df.select(
        'province', 'city', 'county',
        'company_county').where(mapping_df.county.isNull()).where(
            mapping_df.city.isNotNull()).where(
                mapping_df.province.isNotNull()).join(
                    prd_region_node_df,
                    prd_region_node_df.name == mapping_df.province).select(
                        mapping_df.company_county.alias(':START_ID'),
                        prd_region_node_df['region_code:ID'].alias(':END_ID'),
                        fun.unix_timestamp().alias('create_time:long'),
                        get_region_relation_label_udf().alias(':TYPE'))

    # Event-region relationships
    # This relationship is hard to parse and has too many variations, so it is skipped for now.
    # If it is later needed for storytelling, this kind of relationship can be built by hand.
    # prd_region_edge_4_df = ''

    prd_region_edge_df = prd_region_edge_1_df.union(
        prd_region_edge_2_df).union(prd_region_edge_3_df)

    return prd_region_node_df, prd_region_edge_df
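
# Hedged sketch (not in the original module): the ':ID', ':LABEL' and ':TYPE' headers follow the
# neo4j-admin import CSV convention, so the two frames returned above could be written out as
# import files roughly like this. The output locations are assumptions for illustration.
prd_region_node_df, prd_region_edge_df = spark_data_flow()
prd_region_node_df.write.csv('{path}/region_node'.format(path=TMP_PATH),
                             header=True, mode='overwrite')
prd_region_edge_df.write.csv('{path}/region_edge'.format(path=TMP_PATH),
                             header=True, mode='overwrite')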
コード例 #35
0
                                  Flight_date=p[6],
                                  Airline=p[7],
                                  Change=int(p[8]),
                                  Price=float(p[9]),
                                  Depart_hour=p[10],
                                  Depart_from=p[11],
                                  Arrival_hour=p[12],
                                  Arrive_to=p[13]))

sqlContext = SQLContext(sc)
dt = sqlContext.createDataFrame(flights)
dt.registerTempTable("flights")

# Days between the scrape date and the flight date; the two columns use different date formats
timeFmt1 = "yyyy-MM-dd"
timeFmt2 = "dd/MM/yy"
timeDiff = ((F.unix_timestamp('Flight_date', format=timeFmt2) -
             F.unix_timestamp('Scrap_date', format=timeFmt1)) /
            (24 * 3600)).cast(IntegerType())
dt = dt.withColumn("Days", timeDiff)

processed = dt

df = processed.groupBy([
    "Scrap_time", "Country_from", "Country_to", "There_or_Back", "Flight_date",
    "Airline", "Change", "Depart_hour", "Depart_from", "Arrival_hour",
    "Arrive_to"
]).agg(
    F.mean(processed.Price).alias('Mean'),
    F.stddev(processed.Price).alias('Stddev'),
    F.count('Price').alias('Count'))
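
# Hedged sketch (not in the original script): the per-group Mean/Stddev could be joined back onto
# the observations to flag unusually cheap fares. The 2-sigma threshold is an assumption.
group_cols = ["Scrap_time", "Country_from", "Country_to", "There_or_Back", "Flight_date",
              "Airline", "Change", "Depart_hour", "Depart_from", "Arrival_hour", "Arrive_to"]
flagged = (processed.join(df, group_cols, "left")
           .withColumn("is_bargain", F.col("Price") < F.col("Mean") - 2 * F.col("Stddev")))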
コード例 #36
0
        StructField("boost_flag", IntegerType(), True),
        StructField("app_name", StringType(), True),
        StructField("site_name", StringType(), True),
        StructField("publisher_id", StringType(), True),
        StructField("publisher_name", StringType(), True),
        StructField("app_bundle", StringType(), True),
        StructField("store_url", StringType(), True),
        StructField("secure", IntegerType(), True)
    ])

    request_log_filtered=spark.read.option("sep", "|")\
        .option("header", "True")\
        .schema(schema)\
        .csv(args.requests)\
        .where(  (F.col("request_source")==4) & (F.col("consumer_id_method")==150))\
        .select(F.unix_timestamp(F.col("request_time"), "yyyy-MM-dd HH:mm:ss.SSS ZZZZZ").alias("time_stamp"),
            F.col("timezone_offset"),
            F.col("consumer_id"),
            F.col("gender"),
            F.col("birth_year"),
            F.col("latitude"),
            F.col("longitude"),
            F.col("location_type"),
            F.col("ip"),
            F.col("request_id"),
            F.col("at_home_or_away"),
            F.col("app_id"),
            F.col("app_bundle"),
            F.col("request_source")).cache()

    schema_meta_log = StructType([